* [PATCH V17 01/11] acpi: apei: read ack upon ghes record consumption
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
@ 2017-05-19 20:32 ` Tyler Baicar
2017-06-30 10:10 ` Robert Richter
2017-05-19 20:32 ` [PATCH V17 02/11] ras: acpi/apei: cper: add support for generic data v3 structure Tyler Baicar
` (11 subsequent siblings)
12 siblings, 1 reply; 24+ messages in thread
From: Tyler Baicar @ 2017-05-19 20:32 UTC (permalink / raw)
To: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, will.deacon, rjw, lenb, matt, robert.moore,
lv.zheng, nkaje, zjzhang, mark.rutland, james.morse, akpm,
eun.taik.lee, sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose,
punit.agrawal, astone, harba, hanjun.gu
Cc: Tyler Baicar
A RAS (Reliability, Availability, Serviceability) controller
may be a separate processor running in parallel with OS
execution, and may generate error records for consumption by
the OS. If the RAS controller produces multiple error records,
then they may be overwritten before the OS has consumed them.
The Generic Hardware Error Source (GHES) v2 structure
introduces the capability for the OS to acknowledge the
consumption of the error record generated by the RAS
controller. A RAS controller supporting GHESv2 shall wait for
the acknowledgment before writing a new error record, thus
eliminating the race condition.
Add support for parsing of GHESv2 sub-tables as well.
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
Reviewed-by: James Morse <james.morse@arm.com>
---
drivers/acpi/apei/ghes.c | 59 +++++++++++++++++++++++++++++++++++++++++++++---
drivers/acpi/apei/hest.c | 7 ++++--
include/acpi/ghes.h | 5 +++-
3 files changed, 65 insertions(+), 6 deletions(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index d0855c0..c072acf 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -46,6 +46,7 @@
#include <linux/nmi.h>
#include <linux/sched/clock.h>
+#include <acpi/actbl1.h>
#include <acpi/ghes.h>
#include <acpi/apei.h>
#include <asm/tlbflush.h>
@@ -80,6 +81,11 @@
((struct acpi_hest_generic_status *) \
((struct ghes_estatus_node *)(estatus_node) + 1))
+static inline bool is_hest_type_generic_v2(struct ghes *ghes)
+{
+ return ghes->generic->header.type == ACPI_HEST_TYPE_GENERIC_ERROR_V2;
+}
+
/*
* This driver isn't really modular, however for the time being,
* continuing to use module_param is the easiest way to remain
@@ -240,6 +246,16 @@ static int ghes_estatus_pool_expand(unsigned long len)
return 0;
}
+static int map_gen_v2(struct ghes *ghes)
+{
+ return apei_map_generic_address(&ghes->generic_v2->read_ack_register);
+}
+
+static void unmap_gen_v2(struct ghes *ghes)
+{
+ apei_unmap_generic_address(&ghes->generic_v2->read_ack_register);
+}
+
static struct ghes *ghes_new(struct acpi_hest_generic *generic)
{
struct ghes *ghes;
@@ -249,10 +265,17 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
if (!ghes)
return ERR_PTR(-ENOMEM);
+
ghes->generic = generic;
+ if (is_hest_type_generic_v2(ghes)) {
+ rc = map_gen_v2(ghes);
+ if (rc)
+ goto err_free;
+ }
+
rc = apei_map_generic_address(&generic->error_status_address);
if (rc)
- goto err_free;
+ goto err_unmap_read_ack_addr;
error_block_length = generic->error_block_length;
if (error_block_length > GHES_ESTATUS_MAX_SIZE) {
pr_warning(FW_WARN GHES_PFX
@@ -264,13 +287,16 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
if (!ghes->estatus) {
rc = -ENOMEM;
- goto err_unmap;
+ goto err_unmap_status_addr;
}
return ghes;
-err_unmap:
+err_unmap_status_addr:
apei_unmap_generic_address(&generic->error_status_address);
+err_unmap_read_ack_addr:
+ if (is_hest_type_generic_v2(ghes))
+ unmap_gen_v2(ghes);
err_free:
kfree(ghes);
return ERR_PTR(rc);
@@ -280,6 +306,8 @@ static void ghes_fini(struct ghes *ghes)
{
kfree(ghes->estatus);
apei_unmap_generic_address(&ghes->generic->error_status_address);
+ if (is_hest_type_generic_v2(ghes))
+ unmap_gen_v2(ghes);
}
static inline int ghes_severity(int severity)
@@ -649,6 +677,21 @@ static void ghes_estatus_cache_add(
rcu_read_unlock();
}
+static int ghes_ack_error(struct acpi_hest_generic_v2 *gv2)
+{
+ int rc;
+ u64 val = 0;
+
+ rc = apei_read(&val, &gv2->read_ack_register);
+ if (rc)
+ return rc;
+
+ val &= gv2->read_ack_preserve << gv2->read_ack_register.bit_offset;
+ val |= gv2->read_ack_write << gv2->read_ack_register.bit_offset;
+
+ return apei_write(val, &gv2->read_ack_register);
+}
+
static int ghes_proc(struct ghes *ghes)
{
int rc;
@@ -661,6 +704,16 @@ static int ghes_proc(struct ghes *ghes)
ghes_estatus_cache_add(ghes->generic, ghes->estatus);
}
ghes_do_proc(ghes, ghes->estatus);
+
+ /*
+ * GHESv2 type HEST entries introduce support for error acknowledgment,
+ * so only acknowledge the error if this support is present.
+ */
+ if (is_hest_type_generic_v2(ghes)) {
+ rc = ghes_ack_error(ghes->generic_v2);
+ if (rc)
+ return rc;
+ }
out:
ghes_clear_estatus(ghes);
return rc;
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index 8f2a98e..456b488 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -52,6 +52,7 @@
[ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
[ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
[ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
+ [ACPI_HEST_TYPE_GENERIC_ERROR_V2] = sizeof(struct acpi_hest_generic_v2),
};
static int hest_esrc_len(struct acpi_hest_header *hest_hdr)
@@ -141,7 +142,8 @@ static int __init hest_parse_ghes_count(struct acpi_hest_header *hest_hdr, void
{
int *count = data;
- if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR)
+ if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR ||
+ hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2)
(*count)++;
return 0;
}
@@ -152,7 +154,8 @@ static int __init hest_parse_ghes(struct acpi_hest_header *hest_hdr, void *data)
struct ghes_arr *ghes_arr = data;
int rc, i;
- if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR)
+ if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR &&
+ hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR_V2)
return 0;
if (!((struct acpi_hest_generic *)hest_hdr)->enabled)
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 720446c..68f088a 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -13,7 +13,10 @@
#define GHES_EXITING 0x0002
struct ghes {
- struct acpi_hest_generic *generic;
+ union {
+ struct acpi_hest_generic *generic;
+ struct acpi_hest_generic_v2 *generic_v2;
+ };
struct acpi_hest_generic_status *estatus;
u64 buffer_paddr;
unsigned long flags;
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH V17 01/11] acpi: apei: read ack upon ghes record consumption
2017-05-19 20:32 ` [PATCH V17 01/11] acpi: apei: read ack upon ghes record consumption Tyler Baicar
@ 2017-06-30 10:10 ` Robert Richter
[not found] ` <20170630101043.GZ658-vWBEXY7mpu582hYKe6nXyg@public.gmane.org>
0 siblings, 1 reply; 24+ messages in thread
From: Robert Richter @ 2017-06-30 10:10 UTC (permalink / raw)
To: Tyler Baicar
Cc: linux-efi, kvm, matt, catalin.marinas, will.deacon, robert.moore,
paul.gortmaker, lv.zheng, kvmarm, fu.wei, rafael, zjzhang, linux,
gengdongjiu, linux-acpi, eun.taik.lee, shijie.huang, labbott,
lenb, harba, john.garry, marc.zyngier, punit.agrawal, rostedt,
nkaje, bp, sandeepa.s.prabhu, linux-arm-kernel, tony.luck, rjw,
rruigrok, linux-kernel, astone, hanjun.guo, joe, pbonzini, akpm,
bristot
Tyler,
On 19.05.17 14:32:03, Tyler Baicar wrote:
> A RAS (Reliability, Availability, Serviceability) controller
> may be a separate processor running in parallel with OS
> execution, and may generate error records for consumption by
> the OS. If the RAS controller produces multiple error records,
> then they may be overwritten before the OS has consumed them.
>
> The Generic Hardware Error Source (GHES) v2 structure
> introduces the capability for the OS to acknowledge the
> consumption of the error record generated by the RAS
> controller. A RAS controller supporting GHESv2 shall wait for
> the acknowledgment before writing a new error record, thus
> eliminating the race condition.
>
> Add support for parsing of GHESv2 sub-tables as well.
>
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
> Reviewed-by: James Morse <james.morse@arm.com>
> ---
> drivers/acpi/apei/ghes.c | 59 +++++++++++++++++++++++++++++++++++++++++++++---
> drivers/acpi/apei/hest.c | 7 ++++--
> include/acpi/ghes.h | 5 +++-
> 3 files changed, 65 insertions(+), 6 deletions(-)
> static int ghes_proc(struct ghes *ghes)
> {
> int rc;
> @@ -661,6 +704,16 @@ static int ghes_proc(struct ghes *ghes)
> ghes_estatus_cache_add(ghes->generic, ghes->estatus);
> }
> ghes_do_proc(ghes, ghes->estatus);
> +
> + /*
> + * GHESv2 type HEST entries introduce support for error acknowledgment,
> + * so only acknowledge the error if this support is present.
> + */
> + if (is_hest_type_generic_v2(ghes)) {
> + rc = ghes_ack_error(ghes->generic_v2);
> + if (rc)
> + return rc;
> + }
> out:
> ghes_clear_estatus(ghes);
> return rc;
was there any specific reason why the ack is sent before clearing the
block status? Spec says the ack should be sent at last.
Also, the block is never cleared if ghes_ack_error() returns an error.
IMO we should fall through and clear the block status (this will
change anyway if the bloc status is cleared first).
-Robert
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH V17 02/11] ras: acpi/apei: cper: add support for generic data v3 structure
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
2017-05-19 20:32 ` [PATCH V17 01/11] acpi: apei: read ack upon ghes record consumption Tyler Baicar
@ 2017-05-19 20:32 ` Tyler Baicar
2017-06-02 14:13 ` Will Deacon
2017-05-19 20:32 ` [PATCH V17 03/11] cper: add timestamp print to CPER status printing Tyler Baicar
` (10 subsequent siblings)
12 siblings, 1 reply; 24+ messages in thread
From: Tyler Baicar @ 2017-05-19 20:32 UTC (permalink / raw)
To: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, will.deacon, rjw, lenb, matt, robert.moore,
lv.zheng, nkaje, zjzhang, mark.rutland, james.morse, akpm,
eun.taik.lee, sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose,
punit.agrawal, astone, harba, hanjun.gu
Cc: Tyler Baicar
The ACPI 6.1 spec adds a new revision of the generic error data
entry structure. Add support to handle the new structure as well
as properly verify and iterate through the generic data entries.
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
---
drivers/acpi/apei/ghes.c | 11 +++++------
drivers/firmware/efi/cper.c | 37 ++++++++++++++++++++++---------------
include/acpi/ghes.h | 36 ++++++++++++++++++++++++++++++++++++
3 files changed, 63 insertions(+), 21 deletions(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index c072acf..626552e 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -428,8 +428,7 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
unsigned long pfn;
int flags = -1;
int sec_sev = ghes_severity(gdata->error_severity);
- struct cper_sec_mem_err *mem_err;
- mem_err = (struct cper_sec_mem_err *)(gdata + 1);
+ struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
return;
@@ -465,8 +464,8 @@ static void ghes_do_proc(struct ghes *ghes,
sec_sev = ghes_severity(gdata->error_severity);
if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
CPER_SEC_PLATFORM_MEM)) {
- struct cper_sec_mem_err *mem_err;
- mem_err = (struct cper_sec_mem_err *)(gdata+1);
+ struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
+
ghes_edac_report_mem_error(ghes, sev, mem_err);
arch_apei_report_mem_error(sev, mem_err);
@@ -475,8 +474,8 @@ static void ghes_do_proc(struct ghes *ghes,
#ifdef CONFIG_ACPI_APEI_PCIEAER
else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
CPER_SEC_PCIE)) {
- struct cper_sec_pcie *pcie_err;
- pcie_err = (struct cper_sec_pcie *)(gdata+1);
+ struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
+
if (sev == GHES_SEV_RECOVERABLE &&
sec_sev == GHES_SEV_RECOVERABLE &&
pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index d425374..9024757 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -32,6 +32,7 @@
#include <linux/acpi.h>
#include <linux/pci.h>
#include <linux/aer.h>
+#include <acpi/ghes.h>
#define INDENT_SP " "
@@ -386,8 +387,9 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
pfx, pcie->bridge.secondary_status, pcie->bridge.control);
}
-static void cper_estatus_print_section(
- const char *pfx, const struct acpi_hest_generic_data *gdata, int sec_no)
+static void
+cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata,
+ int sec_no)
{
uuid_le *sec_type = (uuid_le *)gdata->section_type;
__u16 severity;
@@ -403,14 +405,16 @@ static void cper_estatus_print_section(
snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_GENERIC)) {
- struct cper_sec_proc_generic *proc_err = (void *)(gdata + 1);
+ struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata);
+
printk("%s""section_type: general processor error\n", newpfx);
if (gdata->error_data_length >= sizeof(*proc_err))
cper_print_proc_generic(newpfx, proc_err);
else
goto err_section_too_small;
} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
- struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
+ struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
+
printk("%s""section_type: memory error\n", newpfx);
if (gdata->error_data_length >=
sizeof(struct cper_sec_mem_err_old))
@@ -419,7 +423,8 @@ static void cper_estatus_print_section(
else
goto err_section_too_small;
} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) {
- struct cper_sec_pcie *pcie = (void *)(gdata + 1);
+ struct cper_sec_pcie *pcie = acpi_hest_get_payload(gdata);
+
printk("%s""section_type: PCIe error\n", newpfx);
if (gdata->error_data_length >= sizeof(*pcie))
cper_print_pcie(newpfx, pcie, gdata);
@@ -438,7 +443,7 @@ void cper_estatus_print(const char *pfx,
const struct acpi_hest_generic_status *estatus)
{
struct acpi_hest_generic_data *gdata;
- unsigned int data_len, gedata_len;
+ unsigned int data_len;
int sec_no = 0;
char newpfx[64];
__u16 severity;
@@ -452,11 +457,11 @@ void cper_estatus_print(const char *pfx,
data_len = estatus->data_length;
gdata = (struct acpi_hest_generic_data *)(estatus + 1);
snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
- while (data_len >= sizeof(*gdata)) {
- gedata_len = gdata->error_data_length;
+
+ while (data_len >= acpi_hest_get_size(gdata)) {
cper_estatus_print_section(newpfx, gdata, sec_no);
- data_len -= gedata_len + sizeof(*gdata);
- gdata = (void *)(gdata + 1) + gedata_len;
+ data_len -= acpi_hest_get_record_size(gdata);
+ gdata = acpi_hest_get_next(gdata);
sec_no++;
}
}
@@ -486,12 +491,14 @@ int cper_estatus_check(const struct acpi_hest_generic_status *estatus)
return rc;
data_len = estatus->data_length;
gdata = (struct acpi_hest_generic_data *)(estatus + 1);
- while (data_len >= sizeof(*gdata)) {
- gedata_len = gdata->error_data_length;
- if (gedata_len > data_len - sizeof(*gdata))
+
+ while (data_len >= acpi_hest_get_size(gdata)) {
+ gedata_len = acpi_hest_get_error_length(gdata);
+ if (gedata_len > data_len - acpi_hest_get_size(gdata))
return -EINVAL;
- data_len -= gedata_len + sizeof(*gdata);
- gdata = (void *)(gdata + 1) + gedata_len;
+
+ data_len -= acpi_hest_get_record_size(gdata);
+ gdata = acpi_hest_get_next(gdata);
}
if (data_len)
return -EINVAL;
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 68f088a..8f8fea00 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -73,3 +73,39 @@ static inline void ghes_edac_unregister(struct ghes *ghes)
{
}
#endif
+
+static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata)
+{
+ return gdata->revision >> 8;
+}
+
+static inline void *acpi_hest_get_payload(struct acpi_hest_generic_data *gdata)
+{
+ if (acpi_hest_get_version(gdata) >= 3)
+ return (void *)(((struct acpi_hest_generic_data_v300 *)(gdata)) + 1);
+
+ return gdata + 1;
+}
+
+static inline int acpi_hest_get_error_length(struct acpi_hest_generic_data *gdata)
+{
+ return ((struct acpi_hest_generic_data *)(gdata))->error_data_length;
+}
+
+static inline int acpi_hest_get_size(struct acpi_hest_generic_data *gdata)
+{
+ if (acpi_hest_get_version(gdata) >= 3)
+ return sizeof(struct acpi_hest_generic_data_v300);
+
+ return sizeof(struct acpi_hest_generic_data);
+}
+
+static inline int acpi_hest_get_record_size(struct acpi_hest_generic_data *gdata)
+{
+ return (acpi_hest_get_size(gdata) + acpi_hest_get_error_length(gdata));
+}
+
+static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata)
+{
+ return (void *)(gdata) + acpi_hest_get_record_size(gdata);
+}
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH V17 02/11] ras: acpi/apei: cper: add support for generic data v3 structure
2017-05-19 20:32 ` [PATCH V17 02/11] ras: acpi/apei: cper: add support for generic data v3 structure Tyler Baicar
@ 2017-06-02 14:13 ` Will Deacon
2017-06-06 9:22 ` Ard Biesheuvel
0 siblings, 1 reply; 24+ messages in thread
From: Will Deacon @ 2017-06-02 14:13 UTC (permalink / raw)
To: Tyler Baicar
Cc: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, rjw, lenb, matt, robert.moore, lv.zheng, nkaje,
zjzhang, mark.rutland, james.morse, akpm, eun.taik.lee,
sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose
On Fri, May 19, 2017 at 02:32:04PM -0600, Tyler Baicar wrote:
> The ACPI 6.1 spec adds a new revision of the generic error data
> entry structure. Add support to handle the new structure as well
> as properly verify and iterate through the generic data entries.
>
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
> ---
> drivers/acpi/apei/ghes.c | 11 +++++------
> drivers/firmware/efi/cper.c | 37 ++++++++++++++++++++++---------------
> include/acpi/ghes.h | 36 ++++++++++++++++++++++++++++++++++++
> 3 files changed, 63 insertions(+), 21 deletions(-)
Given that Boris and Rafael are ok with this series, it makes sense to
take this via arm64, but I need an ack from Ard or Matt on the EFI changes
in this patch and the subsequent one.
Will
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH V17 02/11] ras: acpi/apei: cper: add support for generic data v3 structure
2017-06-02 14:13 ` Will Deacon
@ 2017-06-06 9:22 ` Ard Biesheuvel
0 siblings, 0 replies; 24+ messages in thread
From: Ard Biesheuvel @ 2017-06-06 9:22 UTC (permalink / raw)
To: Will Deacon
Cc: linux-efi, KVM devel mailing list, Matt Fleming, Catalin Marinas,
Tyler Baicar, Robert Moore, Paul Gortmaker, Lv Zheng, kvmarm,
Fu Wei, Rafael J. Wysocki, Jonathan (Zhixiong) Zhang,
Russell King, gengdongjiu, linux-acpi, eun.taik.lee,
shijie.huang, Laura Abbott, Len Brown, harba, John Garry,
Marc Zyngier, Punit Agrawal
On 2 June 2017 at 14:13, Will Deacon <will.deacon@arm.com> wrote:
> On Fri, May 19, 2017 at 02:32:04PM -0600, Tyler Baicar wrote:
>> The ACPI 6.1 spec adds a new revision of the generic error data
>> entry structure. Add support to handle the new structure as well
>> as properly verify and iterate through the generic data entries.
>>
>> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
>> CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
>> ---
>> drivers/acpi/apei/ghes.c | 11 +++++------
>> drivers/firmware/efi/cper.c | 37 ++++++++++++++++++++++---------------
>> include/acpi/ghes.h | 36 ++++++++++++++++++++++++++++++++++++
>> 3 files changed, 63 insertions(+), 21 deletions(-)
>
> Given that Boris and Rafael are ok with this series, it makes sense to
> take this via arm64, but I need an ack from Ard or Matt on the EFI changes
> in this patch and the subsequent one.
>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH V17 03/11] cper: add timestamp print to CPER status printing
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
2017-05-19 20:32 ` [PATCH V17 01/11] acpi: apei: read ack upon ghes record consumption Tyler Baicar
2017-05-19 20:32 ` [PATCH V17 02/11] ras: acpi/apei: cper: add support for generic data v3 structure Tyler Baicar
@ 2017-05-19 20:32 ` Tyler Baicar
2017-06-06 9:25 ` Ard Biesheuvel
2017-05-19 20:32 ` [PATCH V17 04/11] efi: parse ARM processor error Tyler Baicar
` (9 subsequent siblings)
12 siblings, 1 reply; 24+ messages in thread
From: Tyler Baicar @ 2017-05-19 20:32 UTC (permalink / raw)
To: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, will.deacon, rjw, lenb, matt, robert.moore,
lv.zheng, nkaje, zjzhang, mark.rutland, james.morse, akpm,
eun.taik.lee, sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose,
punit.agrawal, astone, harba, hanjun.gu
Cc: Tyler Baicar
The ACPI 6.1 spec added a timestamp to the generic error data
entry structure. Print the timestamp out when printing out the
error information.
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
---
drivers/firmware/efi/cper.c | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 9024757..229cf92 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -32,6 +32,8 @@
#include <linux/acpi.h>
#include <linux/pci.h>
#include <linux/aer.h>
+#include <linux/printk.h>
+#include <linux/bcd.h>
#include <acpi/ghes.h>
#define INDENT_SP " "
@@ -387,6 +389,27 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
pfx, pcie->bridge.secondary_status, pcie->bridge.control);
}
+static void cper_print_tstamp(const char *pfx,
+ struct acpi_hest_generic_data_v300 *gdata)
+{
+ __u8 hour, min, sec, day, mon, year, century, *timestamp;
+
+ if (gdata->validation_bits & ACPI_HEST_GEN_VALID_TIMESTAMP) {
+ timestamp = (__u8 *)&(gdata->time_stamp);
+ sec = bcd2bin(timestamp[0]);
+ min = bcd2bin(timestamp[1]);
+ hour = bcd2bin(timestamp[2]);
+ day = bcd2bin(timestamp[4]);
+ mon = bcd2bin(timestamp[5]);
+ year = bcd2bin(timestamp[6]);
+ century = bcd2bin(timestamp[7]);
+
+ printk("%s%ststamp: %02d%02d-%02d-%02d %02d:%02d:%02d\n", pfx,
+ (timestamp[3] & 0x1 ? "precise " : "imprecise "),
+ century, year, mon, day, hour, min, sec);
+ }
+}
+
static void
cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata,
int sec_no)
@@ -395,6 +418,9 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
__u16 severity;
char newpfx[64];
+ if (acpi_hest_get_version(gdata) >= 3)
+ cper_print_tstamp(pfx, (struct acpi_hest_generic_data_v300 *)gdata);
+
severity = gdata->error_severity;
printk("%s""Error %d, type: %s\n", pfx, sec_no,
cper_severity_str(severity));
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH V17 03/11] cper: add timestamp print to CPER status printing
2017-05-19 20:32 ` [PATCH V17 03/11] cper: add timestamp print to CPER status printing Tyler Baicar
@ 2017-06-06 9:25 ` Ard Biesheuvel
0 siblings, 0 replies; 24+ messages in thread
From: Ard Biesheuvel @ 2017-06-06 9:25 UTC (permalink / raw)
To: Tyler Baicar
Cc: linux-efi, KVM devel mailing list, Matt Fleming, Catalin Marinas,
Will Deacon, Robert Moore, Paul Gortmaker, Lv Zheng, kvmarm,
Fu Wei, Rafael J. Wysocki, Jonathan (Zhixiong) Zhang,
Russell King, gengdongjiu, linux-acpi, eun.taik.lee,
shijie.huang, Laura Abbott, Len Brown, harba, John Garry,
Marc Zyngier, Punit Agrawal
On 19 May 2017 at 21:32, Tyler Baicar <tbaicar@codeaurora.org> wrote:
> The ACPI 6.1 spec added a timestamp to the generic error data
> entry structure. Print the timestamp out when printing out the
> error information.
>
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
> drivers/firmware/efi/cper.c | 26 ++++++++++++++++++++++++++
> 1 file changed, 26 insertions(+)
>
> diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
> index 9024757..229cf92 100644
> --- a/drivers/firmware/efi/cper.c
> +++ b/drivers/firmware/efi/cper.c
> @@ -32,6 +32,8 @@
> #include <linux/acpi.h>
> #include <linux/pci.h>
> #include <linux/aer.h>
> +#include <linux/printk.h>
> +#include <linux/bcd.h>
> #include <acpi/ghes.h>
>
> #define INDENT_SP " "
> @@ -387,6 +389,27 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
> pfx, pcie->bridge.secondary_status, pcie->bridge.control);
> }
>
> +static void cper_print_tstamp(const char *pfx,
> + struct acpi_hest_generic_data_v300 *gdata)
> +{
> + __u8 hour, min, sec, day, mon, year, century, *timestamp;
> +
> + if (gdata->validation_bits & ACPI_HEST_GEN_VALID_TIMESTAMP) {
> + timestamp = (__u8 *)&(gdata->time_stamp);
> + sec = bcd2bin(timestamp[0]);
> + min = bcd2bin(timestamp[1]);
> + hour = bcd2bin(timestamp[2]);
> + day = bcd2bin(timestamp[4]);
> + mon = bcd2bin(timestamp[5]);
> + year = bcd2bin(timestamp[6]);
> + century = bcd2bin(timestamp[7]);
> +
> + printk("%s%ststamp: %02d%02d-%02d-%02d %02d:%02d:%02d\n", pfx,
> + (timestamp[3] & 0x1 ? "precise " : "imprecise "),
> + century, year, mon, day, hour, min, sec);
> + }
> +}
> +
> static void
> cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata,
> int sec_no)
> @@ -395,6 +418,9 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
> __u16 severity;
> char newpfx[64];
>
> + if (acpi_hest_get_version(gdata) >= 3)
> + cper_print_tstamp(pfx, (struct acpi_hest_generic_data_v300 *)gdata);
> +
> severity = gdata->error_severity;
> printk("%s""Error %d, type: %s\n", pfx, sec_no,
> cper_severity_str(severity));
> --
> Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
> Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-efi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH V17 04/11] efi: parse ARM processor error
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
` (2 preceding siblings ...)
2017-05-19 20:32 ` [PATCH V17 03/11] cper: add timestamp print to CPER status printing Tyler Baicar
@ 2017-05-19 20:32 ` Tyler Baicar
2017-05-19 20:32 ` [PATCH V17 05/11] arm64: exception: handle Synchronous External Abort Tyler Baicar
` (8 subsequent siblings)
12 siblings, 0 replies; 24+ messages in thread
From: Tyler Baicar @ 2017-05-19 20:32 UTC (permalink / raw)
To: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, will.deacon, rjw, lenb, matt, robert.moore,
lv.zheng, nkaje, zjzhang, mark.rutland, james.morse, akpm,
eun.taik.lee, sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose,
punit.agrawal, astone, harba, hanjun.gu
Cc: Tyler Baicar
Add support for ARM Common Platform Error Record (CPER).
UEFI 2.6 specification adds support for ARM specific
processor error information to be reported as part of the
CPER records. This provides more detail on for processor error logs.
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
Reviewed-by: James Morse <james.morse@arm.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
drivers/firmware/efi/cper.c | 129 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/cper.h | 54 +++++++++++++++++++
2 files changed, 183 insertions(+)
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 229cf92..eac0854 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -110,12 +110,15 @@ void cper_print_bits(const char *pfx, unsigned int bits,
static const char * const proc_type_strs[] = {
"IA32/X64",
"IA64",
+ "ARM",
};
static const char * const proc_isa_strs[] = {
"IA32",
"IA64",
"X64",
+ "ARM A32/T32",
+ "ARM A64",
};
static const char * const proc_error_type_strs[] = {
@@ -184,6 +187,122 @@ static void cper_print_proc_generic(const char *pfx,
printk("%s""IP: 0x%016llx\n", pfx, proc->ip);
}
+#if defined(CONFIG_ARM64) || defined(CONFIG_ARM)
+static const char * const arm_reg_ctx_strs[] = {
+ "AArch32 general purpose registers",
+ "AArch32 EL1 context registers",
+ "AArch32 EL2 context registers",
+ "AArch32 secure context registers",
+ "AArch64 general purpose registers",
+ "AArch64 EL1 context registers",
+ "AArch64 EL2 context registers",
+ "AArch64 EL3 context registers",
+ "Misc. system register structure",
+};
+
+static void cper_print_proc_arm(const char *pfx,
+ const struct cper_sec_proc_arm *proc)
+{
+ int i, len, max_ctx_type;
+ struct cper_arm_err_info *err_info;
+ struct cper_arm_ctx_info *ctx_info;
+ char newpfx[64];
+
+ printk("%sMIDR: 0x%016llx\n", pfx, proc->midr);
+
+ len = proc->section_length - (sizeof(*proc) +
+ proc->err_info_num * (sizeof(*err_info)));
+ if (len < 0) {
+ printk("%ssection length: %d\n", pfx, proc->section_length);
+ printk("%ssection length is too small\n", pfx);
+ printk("%sfirmware-generated error record is incorrect\n", pfx);
+ printk("%sERR_INFO_NUM is %d\n", pfx, proc->err_info_num);
+ return;
+ }
+
+ if (proc->validation_bits & CPER_ARM_VALID_MPIDR)
+ printk("%sMultiprocessor Affinity Register (MPIDR): 0x%016llx\n",
+ pfx, proc->mpidr);
+
+ if (proc->validation_bits & CPER_ARM_VALID_AFFINITY_LEVEL)
+ printk("%serror affinity level: %d\n", pfx,
+ proc->affinity_level);
+
+ if (proc->validation_bits & CPER_ARM_VALID_RUNNING_STATE) {
+ printk("%srunning state: 0x%x\n", pfx, proc->running_state);
+ printk("%sPower State Coordination Interface state: %d\n",
+ pfx, proc->psci_state);
+ }
+
+ snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
+
+ err_info = (struct cper_arm_err_info *)(proc + 1);
+ for (i = 0; i < proc->err_info_num; i++) {
+ printk("%sError info structure %d:\n", pfx, i);
+
+ printk("%snum errors: %d\n", pfx, err_info->multiple_error + 1);
+
+ if (err_info->validation_bits & CPER_ARM_INFO_VALID_FLAGS) {
+ if (err_info->flags & CPER_ARM_INFO_FLAGS_FIRST)
+ printk("%sfirst error captured\n", newpfx);
+ if (err_info->flags & CPER_ARM_INFO_FLAGS_LAST)
+ printk("%slast error captured\n", newpfx);
+ if (err_info->flags & CPER_ARM_INFO_FLAGS_PROPAGATED)
+ printk("%spropagated error captured\n",
+ newpfx);
+ if (err_info->flags & CPER_ARM_INFO_FLAGS_OVERFLOW)
+ printk("%soverflow occurred, error info is incomplete\n",
+ newpfx);
+ }
+
+ printk("%serror_type: %d, %s\n", newpfx, err_info->type,
+ err_info->type < ARRAY_SIZE(proc_error_type_strs) ?
+ proc_error_type_strs[err_info->type] : "unknown");
+ if (err_info->validation_bits & CPER_ARM_INFO_VALID_ERR_INFO)
+ printk("%serror_info: 0x%016llx\n", newpfx,
+ err_info->error_info);
+ if (err_info->validation_bits & CPER_ARM_INFO_VALID_VIRT_ADDR)
+ printk("%svirtual fault address: 0x%016llx\n",
+ newpfx, err_info->virt_fault_addr);
+ if (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR)
+ printk("%sphysical fault address: 0x%016llx\n",
+ newpfx, err_info->physical_fault_addr);
+ err_info += 1;
+ }
+
+ ctx_info = (struct cper_arm_ctx_info *)err_info;
+ max_ctx_type = ARRAY_SIZE(arm_reg_ctx_strs) - 1;
+ for (i = 0; i < proc->context_info_num; i++) {
+ int size = sizeof(*ctx_info) + ctx_info->size;
+
+ printk("%sContext info structure %d:\n", pfx, i);
+ if (len < size) {
+ printk("%ssection length is too small\n", newpfx);
+ printk("%sfirmware-generated error record is incorrect\n", pfx);
+ return;
+ }
+ if (ctx_info->type > max_ctx_type) {
+ printk("%sInvalid context type: %d (max: %d)\n",
+ newpfx, ctx_info->type, max_ctx_type);
+ return;
+ }
+ printk("%sregister context type: %s\n", newpfx,
+ arm_reg_ctx_strs[ctx_info->type]);
+ print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, 4,
+ (ctx_info + 1), ctx_info->size, 0);
+ len -= size;
+ ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + size);
+ }
+
+ if (len > 0) {
+ printk("%sVendor specific error info has %u bytes:\n", pfx,
+ len);
+ print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, 4, ctx_info,
+ len, true);
+ }
+}
+#endif
+
static const char * const mem_err_type_strs[] = {
"unknown",
"no error",
@@ -456,6 +575,16 @@ static void cper_print_tstamp(const char *pfx,
cper_print_pcie(newpfx, pcie, gdata);
else
goto err_section_too_small;
+#if defined(CONFIG_ARM64) || defined(CONFIG_ARM)
+ } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_ARM)) {
+ struct cper_sec_proc_arm *arm_err = acpi_hest_get_payload(gdata);
+
+ printk("%ssection_type: ARM processor error\n", newpfx);
+ if (gdata->error_data_length >= sizeof(*arm_err))
+ cper_print_proc_arm(newpfx, arm_err);
+ else
+ goto err_section_too_small;
+#endif
} else
printk("%s""section type: unknown, %pUl\n", newpfx, sec_type);
diff --git a/include/linux/cper.h b/include/linux/cper.h
index dcacb1a..4c671fc 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -180,6 +180,10 @@ enum {
#define CPER_SEC_PROC_IPF \
UUID_LE(0xE429FAF1, 0x3CB7, 0x11D4, 0x0B, 0xCA, 0x07, 0x00, \
0x80, 0xC7, 0x3C, 0x88, 0x81)
+/* Processor Specific: ARM */
+#define CPER_SEC_PROC_ARM \
+ UUID_LE(0xE19E3D16, 0xBC11, 0x11E4, 0x9C, 0xAA, 0xC2, 0x05, \
+ 0x1D, 0x5D, 0x46, 0xB0)
/* Platform Memory */
#define CPER_SEC_PLATFORM_MEM \
UUID_LE(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83, \
@@ -255,6 +259,22 @@ enum {
#define CPER_PCIE_SLOT_SHIFT 3
+#define CPER_ARM_VALID_MPIDR BIT(0)
+#define CPER_ARM_VALID_AFFINITY_LEVEL BIT(1)
+#define CPER_ARM_VALID_RUNNING_STATE BIT(2)
+#define CPER_ARM_VALID_VENDOR_INFO BIT(3)
+
+#define CPER_ARM_INFO_VALID_MULTI_ERR BIT(0)
+#define CPER_ARM_INFO_VALID_FLAGS BIT(1)
+#define CPER_ARM_INFO_VALID_ERR_INFO BIT(2)
+#define CPER_ARM_INFO_VALID_VIRT_ADDR BIT(3)
+#define CPER_ARM_INFO_VALID_PHYSICAL_ADDR BIT(4)
+
+#define CPER_ARM_INFO_FLAGS_FIRST BIT(0)
+#define CPER_ARM_INFO_FLAGS_LAST BIT(1)
+#define CPER_ARM_INFO_FLAGS_PROPAGATED BIT(2)
+#define CPER_ARM_INFO_FLAGS_OVERFLOW BIT(3)
+
/*
* All tables and structs must be byte-packed to match CPER
* specification, since the tables are provided by the system BIOS
@@ -340,6 +360,40 @@ struct cper_ia_proc_ctx {
__u64 mm_reg_addr;
};
+/* ARM Processor Error Section */
+struct cper_sec_proc_arm {
+ __u32 validation_bits;
+ __u16 err_info_num; /* Number of Processor Error Info */
+ __u16 context_info_num; /* Number of Processor Context Info Records*/
+ __u32 section_length;
+ __u8 affinity_level;
+ __u8 reserved[3]; /* must be zero */
+ __u64 mpidr;
+ __u64 midr;
+ __u32 running_state; /* Bit 0 set - Processor running. PSCI = 0 */
+ __u32 psci_state;
+};
+
+/* ARM Processor Error Information Structure */
+struct cper_arm_err_info {
+ __u8 version;
+ __u8 length;
+ __u16 validation_bits;
+ __u8 type;
+ __u16 multiple_error;
+ __u8 flags;
+ __u64 error_info;
+ __u64 virt_fault_addr;
+ __u64 physical_fault_addr;
+};
+
+/* ARM Processor Context Information Structure */
+struct cper_arm_ctx_info {
+ __u16 version;
+ __u16 type;
+ __u32 size;
+};
+
/* Old Memory Error Section UEFI 2.1, 2.2 */
struct cper_sec_mem_err_old {
__u64 validation_bits;
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH V17 05/11] arm64: exception: handle Synchronous External Abort
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
` (3 preceding siblings ...)
2017-05-19 20:32 ` [PATCH V17 04/11] efi: parse ARM processor error Tyler Baicar
@ 2017-05-19 20:32 ` Tyler Baicar
2017-05-19 20:32 ` [PATCH V17 06/11] acpi: apei: handle SEA notification type for ARMv8 Tyler Baicar
` (7 subsequent siblings)
12 siblings, 0 replies; 24+ messages in thread
From: Tyler Baicar @ 2017-05-19 20:32 UTC (permalink / raw)
To: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, will.deacon, rjw, lenb, matt, robert.moore,
lv.zheng, nkaje, zjzhang, mark.rutland, james.morse, akpm,
eun.taik.lee, sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose,
punit.agrawal, astone, harba, hanjun.gu
Cc: Tyler Baicar
SEA exceptions are often caused by an uncorrected hardware
error, and are handled when data abort and instruction abort
exception classes have specific values for their Fault Status
Code.
When SEA occurs, before killing the process, report the error
in the kernel logs.
Update fault_info[] with specific SEA faults so that the
new SEA handler is used.
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
Reviewed-by: James Morse <james.morse@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
---
arch/arm64/include/asm/esr.h | 1 +
arch/arm64/mm/fault.c | 45 ++++++++++++++++++++++++++++++++++----------
2 files changed, 36 insertions(+), 10 deletions(-)
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 85997c0..28bf02e 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -83,6 +83,7 @@
#define ESR_ELx_WNR (UL(1) << 6)
/* Shared ISS field definitions for Data/Instruction aborts */
+#define ESR_ELx_FnV (UL(1) << 10)
#define ESR_ELx_EA (UL(1) << 9)
#define ESR_ELx_S1PTW (UL(1) << 7)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 37b95df..6697816 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -522,6 +522,31 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
return 1;
}
+/*
+ * This abort handler deals with Synchronous External Abort.
+ * It calls notifiers, and then returns "fault".
+ */
+static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+{
+ struct siginfo info;
+ const struct fault_info *inf;
+
+ inf = esr_to_fault_info(esr);
+ pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n",
+ inf->name, esr, addr);
+
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = 0;
+ if (esr & ESR_ELx_FnV)
+ info.si_addr = 0;
+ else
+ info.si_addr = (void __user *)addr;
+ arm64_notify_die("", regs, &info, esr);
+
+ return 0;
+}
+
static const struct fault_info fault_info[] = {
{ do_bad, SIGBUS, 0, "ttbr address size fault" },
{ do_bad, SIGBUS, 0, "level 1 address size fault" },
@@ -539,22 +564,22 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" },
- { do_bad, SIGBUS, 0, "synchronous external abort" },
+ { do_sea, SIGBUS, 0, "synchronous external abort" },
{ do_bad, SIGBUS, 0, "unknown 17" },
{ do_bad, SIGBUS, 0, "unknown 18" },
{ do_bad, SIGBUS, 0, "unknown 19" },
- { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" },
- { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" },
- { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" },
- { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" },
- { do_bad, SIGBUS, 0, "synchronous parity error" },
+ { do_sea, SIGBUS, 0, "level 0 (translation table walk)" },
+ { do_sea, SIGBUS, 0, "level 1 (translation table walk)" },
+ { do_sea, SIGBUS, 0, "level 2 (translation table walk)" },
+ { do_sea, SIGBUS, 0, "level 3 (translation table walk)" },
+ { do_sea, SIGBUS, 0, "synchronous parity or ECC error" },
{ do_bad, SIGBUS, 0, "unknown 25" },
{ do_bad, SIGBUS, 0, "unknown 26" },
{ do_bad, SIGBUS, 0, "unknown 27" },
- { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" },
- { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" },
- { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" },
- { do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" },
+ { do_sea, SIGBUS, 0, "level 0 synchronous parity error (translation table walk)" },
+ { do_sea, SIGBUS, 0, "level 1 synchronous parity error (translation table walk)" },
+ { do_sea, SIGBUS, 0, "level 2 synchronous parity error (translation table walk)" },
+ { do_sea, SIGBUS, 0, "level 3 synchronous parity error (translation table walk)" },
{ do_bad, SIGBUS, 0, "unknown 32" },
{ do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" },
{ do_bad, SIGBUS, 0, "unknown 34" },
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH V17 06/11] acpi: apei: handle SEA notification type for ARMv8
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
` (4 preceding siblings ...)
2017-05-19 20:32 ` [PATCH V17 05/11] arm64: exception: handle Synchronous External Abort Tyler Baicar
@ 2017-05-19 20:32 ` Tyler Baicar
2017-05-19 20:32 ` [PATCH V17 07/11] acpi: apei: panic OS with fatal error status block Tyler Baicar
` (6 subsequent siblings)
12 siblings, 0 replies; 24+ messages in thread
From: Tyler Baicar @ 2017-05-19 20:32 UTC (permalink / raw)
To: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, will.deacon, rjw, lenb, matt, robert.moore,
lv.zheng, nkaje, zjzhang, mark.rutland, james.morse, akpm,
eun.taik.lee, sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose,
punit.agrawal, astone, harba, hanjun.gu
Cc: Tyler Baicar
ARM APEI extension proposal added SEA (Synchronous External Abort)
notification type for ARMv8.
Add a new GHES error source handling function for SEA. If an error
source's notification type is SEA, then this function can be registered
into the SEA exception handler. That way GHES will parse and report
SEA exceptions when they occur.
An SEA can interrupt code that had interrupts masked and is treated as
an NMI. To aid this the page of address space for mapping APEI buffers
while in_nmi() is always reserved, and ghes_ioremap_pfn_nmi() is
changed to use the helper methods to find the prot_t to map with in
the same way as ghes_ioremap_pfn_irq().
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
Reviewed-by: James Morse <james.morse@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
---
arch/arm64/Kconfig | 2 ++
arch/arm64/mm/fault.c | 17 ++++++++++++
drivers/acpi/apei/Kconfig | 15 ++++++++++
drivers/acpi/apei/ghes.c | 70 +++++++++++++++++++++++++++++++++++++++++++----
include/acpi/ghes.h | 7 +++++
5 files changed, 105 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3dcd7ec..8055997 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -19,6 +19,7 @@ config ARM64
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG if ACPI_APEI_SEA
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_NUMA_BALANCING
@@ -92,6 +93,7 @@ config ARM64
select HAVE_IRQ_TIME_ACCOUNTING
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP if NUMA
+ select HAVE_NMI if ACPI_APEI_SEA
select HAVE_PATA_PLATFORM
select HAVE_PERF_EVENTS
select HAVE_PERF_REGS
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 6697816..9b4e26f 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -42,6 +42,8 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
+#include <acpi/ghes.h>
+
struct fault_info {
int (*fn)(unsigned long addr, unsigned int esr,
struct pt_regs *regs);
@@ -535,6 +537,21 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n",
inf->name, esr, addr);
+ /*
+ * Synchronous aborts may interrupt code which had interrupts masked.
+ * Before calling out into the wider kernel tell the interested
+ * subsystems.
+ */
+ if (IS_ENABLED(CONFIG_ACPI_APEI_SEA)) {
+ if (interrupts_enabled(regs))
+ nmi_enter();
+
+ ghes_notify_sea();
+
+ if (interrupts_enabled(regs))
+ nmi_exit();
+ }
+
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = 0;
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index b0140c8..de14d49 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -39,6 +39,21 @@ config ACPI_APEI_PCIEAER
PCIe AER errors may be reported via APEI firmware first mode.
Turn on this option to enable the corresponding support.
+config ACPI_APEI_SEA
+ bool "APEI Synchronous External Abort logging/recovering support"
+ depends on ARM64 && ACPI_APEI_GHES
+ default y
+ help
+ This option should be enabled if the system supports
+ firmware first handling of SEA (Synchronous External Abort).
+ SEA happens with certain faults of data abort or instruction
+ abort synchronous exceptions on ARMv8 systems. If a system
+ supports firmware first handling of SEA, the platform analyzes
+ and handles hardware error notifications from SEA, and it may then
+ form a HW error record for the OS to parse and handle. This
+ option allows the OS to look for such hardware error record, and
+ take appropriate action.
+
config ACPI_APEI_MEMORY_FAILURE
bool "APEI memory error recovering support"
depends on ACPI_APEI && MEMORY_FAILURE
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 626552e..7e59a73 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -116,11 +116,7 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes)
* Two virtual pages are used, one for IRQ/PROCESS context, the other for
* NMI context (optionally).
*/
-#ifdef CONFIG_HAVE_ACPI_APEI_NMI
#define GHES_IOREMAP_PAGES 2
-#else
-#define GHES_IOREMAP_PAGES 1
-#endif
#define GHES_IOREMAP_IRQ_PAGE(base) (base)
#define GHES_IOREMAP_NMI_PAGE(base) ((base) + PAGE_SIZE)
@@ -159,10 +155,14 @@ static void ghes_ioremap_exit(void)
static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn)
{
unsigned long vaddr;
+ phys_addr_t paddr;
+ pgprot_t prot;
vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr);
- ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
- pfn << PAGE_SHIFT, PAGE_KERNEL);
+
+ paddr = pfn << PAGE_SHIFT;
+ prot = arch_apei_get_mem_attribute(paddr);
+ ioremap_page_range(vaddr, vaddr + PAGE_SIZE, paddr, prot);
return (void __iomem *)vaddr;
}
@@ -774,6 +774,50 @@ static int ghes_notify_sci(struct notifier_block *this,
.notifier_call = ghes_notify_sci,
};
+#ifdef CONFIG_ACPI_APEI_SEA
+static LIST_HEAD(ghes_sea);
+
+void ghes_notify_sea(void)
+{
+ struct ghes *ghes;
+
+ /*
+ * synchronize_rcu() will wait for nmi_exit(), so no need to
+ * rcu_read_lock().
+ */
+ list_for_each_entry_rcu(ghes, &ghes_sea, list) {
+ ghes_proc(ghes);
+ }
+}
+
+static void ghes_sea_add(struct ghes *ghes)
+{
+ mutex_lock(&ghes_list_mutex);
+ list_add_rcu(&ghes->list, &ghes_sea);
+ mutex_unlock(&ghes_list_mutex);
+}
+
+static void ghes_sea_remove(struct ghes *ghes)
+{
+ mutex_lock(&ghes_list_mutex);
+ list_del_rcu(&ghes->list);
+ mutex_unlock(&ghes_list_mutex);
+ synchronize_rcu();
+}
+#else /* CONFIG_ACPI_APEI_SEA */
+static inline void ghes_sea_add(struct ghes *ghes)
+{
+ pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n",
+ ghes->generic->header.source_id);
+}
+
+static inline void ghes_sea_remove(struct ghes *ghes)
+{
+ pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n",
+ ghes->generic->header.source_id);
+}
+#endif /* CONFIG_ACPI_APEI_SEA */
+
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
/*
* printk is not safe in NMI context. So in NMI handler, we allocate
@@ -1019,6 +1063,14 @@ static int ghes_probe(struct platform_device *ghes_dev)
case ACPI_HEST_NOTIFY_EXTERNAL:
case ACPI_HEST_NOTIFY_SCI:
break;
+ case ACPI_HEST_NOTIFY_SEA:
+ if (!IS_ENABLED(CONFIG_ACPI_APEI_SEA)) {
+ pr_warn(GHES_PFX "Generic hardware error source: %d notified via SEA is not supported\n",
+ generic->header.source_id);
+ rc = -ENOTSUPP;
+ goto err;
+ }
+ break;
case ACPI_HEST_NOTIFY_NMI:
if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) {
pr_warn(GHES_PFX "Generic hardware error source: %d notified via NMI interrupt is not supported!\n",
@@ -1083,6 +1135,9 @@ static int ghes_probe(struct platform_device *ghes_dev)
list_add_rcu(&ghes->list, &ghes_sci);
mutex_unlock(&ghes_list_mutex);
break;
+ case ACPI_HEST_NOTIFY_SEA:
+ ghes_sea_add(ghes);
+ break;
case ACPI_HEST_NOTIFY_NMI:
ghes_nmi_add(ghes);
break;
@@ -1126,6 +1181,9 @@ static int ghes_remove(struct platform_device *ghes_dev)
mutex_unlock(&ghes_list_mutex);
synchronize_rcu();
break;
+ case ACPI_HEST_NOTIFY_SEA:
+ ghes_sea_remove(ghes);
+ break;
case ACPI_HEST_NOTIFY_NMI:
ghes_nmi_remove(ghes);
break;
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 8f8fea00..95305ae 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -1,3 +1,6 @@
+#ifndef GHES_H
+#define GHES_H
+
#include <acpi/apei.h>
#include <acpi/hed.h>
@@ -109,3 +112,7 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata)
{
return (void *)(gdata) + acpi_hest_get_record_size(gdata);
}
+
+void ghes_notify_sea(void);
+
+#endif /* GHES_H */
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH V17 07/11] acpi: apei: panic OS with fatal error status block
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
` (5 preceding siblings ...)
2017-05-19 20:32 ` [PATCH V17 06/11] acpi: apei: handle SEA notification type for ARMv8 Tyler Baicar
@ 2017-05-19 20:32 ` Tyler Baicar
2017-05-19 20:32 ` [PATCH V17 08/11] efi: print unrecognized CPER section Tyler Baicar
` (5 subsequent siblings)
12 siblings, 0 replies; 24+ messages in thread
From: Tyler Baicar @ 2017-05-19 20:32 UTC (permalink / raw)
To: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, will.deacon, rjw, lenb, matt, robert.moore,
lv.zheng, nkaje, zjzhang, mark.rutland, james.morse, akpm,
eun.taik.lee, sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose,
punit.agrawal, astone, harba, hanjun.gu
Cc: Tyler Baicar
From: "Jonathan (Zhixiong) Zhang" <zjzhang@codeaurora.org>
Even if an error status block's severity is fatal, the kernel does not
honor the severity level and panic.
With the firmware first model, the platform could inform the OS about a
fatal hardware error through the non-NMI GHES notification type. The OS
should panic when a hardware error record is received with this
severity.
Call panic() after CPER data in error status block is printed if
severity is fatal, before each error section is handled.
Signed-off-by: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
Reviewed-by: James Morse <james.morse@arm.com>
---
drivers/acpi/apei/ghes.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 7e59a73..fadfb43 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -135,6 +135,8 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes)
static struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE];
static atomic_t ghes_estatus_cache_alloced;
+static int ghes_panic_timeout __read_mostly = 30;
+
static int ghes_ioremap_init(void)
{
ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES,
@@ -691,6 +693,16 @@ static int ghes_ack_error(struct acpi_hest_generic_v2 *gv2)
return apei_write(val, &gv2->read_ack_register);
}
+static void __ghes_panic(struct ghes *ghes)
+{
+ __ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus);
+
+ /* reboot to log the error! */
+ if (!panic_timeout)
+ panic_timeout = ghes_panic_timeout;
+ panic("Fatal hardware error!");
+}
+
static int ghes_proc(struct ghes *ghes)
{
int rc;
@@ -698,6 +710,11 @@ static int ghes_proc(struct ghes *ghes)
rc = ghes_read_estatus(ghes, 0);
if (rc)
goto out;
+
+ if (ghes_severity(ghes->estatus->error_severity) >= GHES_SEV_PANIC) {
+ __ghes_panic(ghes);
+ }
+
if (!ghes_estatus_cached(ghes->estatus)) {
if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
ghes_estatus_cache_add(ghes->generic, ghes->estatus);
@@ -838,8 +855,6 @@ static inline void ghes_sea_remove(struct ghes *ghes)
static LIST_HEAD(ghes_nmi);
-static int ghes_panic_timeout __read_mostly = 30;
-
static void ghes_proc_in_irq(struct irq_work *irq_work)
{
struct llist_node *llnode, *next;
@@ -925,18 +940,6 @@ static void __process_error(struct ghes *ghes)
#endif
}
-static void __ghes_panic(struct ghes *ghes)
-{
- oops_begin();
- ghes_print_queued_estatus();
- __ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus);
-
- /* reboot to log the error! */
- if (panic_timeout == 0)
- panic_timeout = ghes_panic_timeout;
- panic("Fatal hardware error!");
-}
-
static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
{
struct ghes *ghes;
@@ -954,8 +957,11 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
}
sev = ghes_severity(ghes->estatus->error_severity);
- if (sev >= GHES_SEV_PANIC)
+ if (sev >= GHES_SEV_PANIC) {
+ oops_begin();
+ ghes_print_queued_estatus();
__ghes_panic(ghes);
+ }
if (!(ghes->flags & GHES_TO_CLEAR))
continue;
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH V17 08/11] efi: print unrecognized CPER section
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
` (6 preceding siblings ...)
2017-05-19 20:32 ` [PATCH V17 07/11] acpi: apei: panic OS with fatal error status block Tyler Baicar
@ 2017-05-19 20:32 ` Tyler Baicar
2017-06-07 10:08 ` Ard Biesheuvel
2017-05-19 20:32 ` [PATCH V17 09/11] ras: acpi / apei: generate trace event for " Tyler Baicar
` (4 subsequent siblings)
12 siblings, 1 reply; 24+ messages in thread
From: Tyler Baicar @ 2017-05-19 20:32 UTC (permalink / raw)
To: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, will.deacon, rjw, lenb, matt, robert.moore,
lv.zheng, nkaje, zjzhang, mark.rutland, james.morse, akpm,
eun.taik.lee, sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose,
punit.agrawal, astone, harba, hanjun.gu
Cc: Tyler Baicar
UEFI spec allows for non-standard section in Common Platform Error
Record. This is defined in section N.2.3 of UEFI version 2.5.
Currently if the CPER section's type (UUID) does not match with
one of the section types that the kernel knows how to parse, the
section is skipped. Therefore, user is not able to see
such CPER data, for instance, error record of non-standard section.
This change prints out the raw data in hex in the dmesg buffer so
that non-standard sections are reported to the user. Non-standard
section type errors should be reported to the user because these
can include errors which are vendor specific. The data length is
taken from Error Data length field of Generic Error Data Entry.
The following is a sample output from dmesg:
Hardware error from APEI Generic Hardware Error Source: 2
It has been corrected by h/w and requires no further action
event severity: corrected
time: precise 2017-03-15 20:37:35
Error 0, type: corrected
section type: unknown, d2e2621c-f936-468d-0d84-15a4ed015c8b
section length: 0x238
00000000: 4d415201 4d492031 453a4d45 435f4343 .RAM1 IMEM:ECC_C
00000010: 53515f45 44525f42 00000000 00000000 E_QSB_RD........
00000020: 00000000 00000000 00000000 00000000 ................
00000030: 00000000 00000000 01010000 01010000 ................
00000040: 00000000 00000000 00000005 00000000 ................
00000050: 01010000 00000000 00000001 00dddd00 ................
...
The raw data from the error can then be decoded using vendor
specific tools.
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
Reviewed-by: James Morse <james.morse@arm.com>
---
drivers/firmware/efi/cper.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index eac0854..d5a5855 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -585,8 +585,15 @@ static void cper_print_tstamp(const char *pfx,
else
goto err_section_too_small;
#endif
- } else
- printk("%s""section type: unknown, %pUl\n", newpfx, sec_type);
+ } else {
+ const void *err = acpi_hest_get_payload(gdata);
+
+ printk("%ssection type: unknown, %pUl\n", newpfx, sec_type);
+ printk("%ssection length: %#x\n", newpfx,
+ gdata->error_data_length);
+ print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, 4, err,
+ gdata->error_data_length, true);
+ }
return;
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH V17 08/11] efi: print unrecognized CPER section
2017-05-19 20:32 ` [PATCH V17 08/11] efi: print unrecognized CPER section Tyler Baicar
@ 2017-06-07 10:08 ` Ard Biesheuvel
0 siblings, 0 replies; 24+ messages in thread
From: Ard Biesheuvel @ 2017-06-07 10:08 UTC (permalink / raw)
To: Tyler Baicar
Cc: linux-efi, KVM devel mailing list, Matt Fleming, Catalin Marinas,
Will Deacon, Robert Moore, Paul Gortmaker, Lv Zheng, kvmarm,
Fu Wei, Rafael J. Wysocki, Jonathan (Zhixiong) Zhang,
Russell King, gengdongjiu, linux-acpi, eun.taik.lee,
shijie.huang, Laura Abbott, Len Brown, harba, John Garry,
Marc Zyngier, Punit Agrawal
On 19 May 2017 at 21:32, Tyler Baicar <tbaicar@codeaurora.org> wrote:
> UEFI spec allows for non-standard section in Common Platform Error
> Record. This is defined in section N.2.3 of UEFI version 2.5.
>
> Currently if the CPER section's type (UUID) does not match with
> one of the section types that the kernel knows how to parse, the
> section is skipped. Therefore, user is not able to see
> such CPER data, for instance, error record of non-standard section.
>
> This change prints out the raw data in hex in the dmesg buffer so
> that non-standard sections are reported to the user. Non-standard
> section type errors should be reported to the user because these
> can include errors which are vendor specific. The data length is
> taken from Error Data length field of Generic Error Data Entry.
>
> The following is a sample output from dmesg:
> Hardware error from APEI Generic Hardware Error Source: 2
> It has been corrected by h/w and requires no further action
> event severity: corrected
> time: precise 2017-03-15 20:37:35
> Error 0, type: corrected
> section type: unknown, d2e2621c-f936-468d-0d84-15a4ed015c8b
> section length: 0x238
> 00000000: 4d415201 4d492031 453a4d45 435f4343 .RAM1 IMEM:ECC_C
> 00000010: 53515f45 44525f42 00000000 00000000 E_QSB_RD........
> 00000020: 00000000 00000000 00000000 00000000 ................
> 00000030: 00000000 00000000 01010000 01010000 ................
> 00000040: 00000000 00000000 00000005 00000000 ................
> 00000050: 01010000 00000000 00000001 00dddd00 ................
> ...
>
> The raw data from the error can then be decoded using vendor
> specific tools.
>
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
> Reviewed-by: James Morse <james.morse@arm.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
> drivers/firmware/efi/cper.c | 11 +++++++++--
> 1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
> index eac0854..d5a5855 100644
> --- a/drivers/firmware/efi/cper.c
> +++ b/drivers/firmware/efi/cper.c
> @@ -585,8 +585,15 @@ static void cper_print_tstamp(const char *pfx,
> else
> goto err_section_too_small;
> #endif
> - } else
> - printk("%s""section type: unknown, %pUl\n", newpfx, sec_type);
> + } else {
> + const void *err = acpi_hest_get_payload(gdata);
> +
> + printk("%ssection type: unknown, %pUl\n", newpfx, sec_type);
> + printk("%ssection length: %#x\n", newpfx,
> + gdata->error_data_length);
> + print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, 4, err,
> + gdata->error_data_length, true);
> + }
>
> return;
>
> --
> Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
> Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-efi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH V17 09/11] ras: acpi / apei: generate trace event for unrecognized CPER section
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
` (7 preceding siblings ...)
2017-05-19 20:32 ` [PATCH V17 08/11] efi: print unrecognized CPER section Tyler Baicar
@ 2017-05-19 20:32 ` Tyler Baicar
2017-05-19 20:32 ` [PATCH V17 10/11] trace, ras: add ARM processor error trace event Tyler Baicar
` (3 subsequent siblings)
12 siblings, 0 replies; 24+ messages in thread
From: Tyler Baicar @ 2017-05-19 20:32 UTC (permalink / raw)
To: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, will.deacon, rjw, lenb, matt, robert.moore,
lv.zheng, nkaje, zjzhang, mark.rutland, james.morse, akpm,
eun.taik.lee, sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose,
punit.agrawal, astone, harba, hanjun.gu
Cc: Tyler Baicar
The UEFI spec includes non-standard section type support in the
Common Platform Error Record. This is defined in section N.2.3 of
UEFI version 2.5.
Currently if the CPER section's type (UUID) does not match any
section type that the kernel knows how to parse, a trace event is
not generated.
Generate a trace event which contains the raw error data for
non-standard section type error records.
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
Tested-by: Shiju Jose <shiju.jose@huawei.com>
---
drivers/acpi/apei/ghes.c | 27 +++++++++++++++++++++++----
drivers/ras/ras.c | 10 +++++++++-
include/linux/ras.h | 12 ++++++++++++
include/ras/ras_event.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
include/uapi/linux/uuid.h | 6 ++++--
5 files changed, 93 insertions(+), 7 deletions(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index fadfb43..16adbc8 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -45,11 +45,14 @@
#include <linux/aer.h>
#include <linux/nmi.h>
#include <linux/sched/clock.h>
+#include <linux/uuid.h>
+#include <linux/ras.h>
#include <acpi/actbl1.h>
#include <acpi/ghes.h>
#include <acpi/apei.h>
#include <asm/tlbflush.h>
+#include <ras/ras_event.h>
#include "apei-internal.h"
@@ -460,12 +463,22 @@ static void ghes_do_proc(struct ghes *ghes,
{
int sev, sec_sev;
struct acpi_hest_generic_data *gdata;
+ uuid_le sec_type;
+ uuid_le *fru_id = &NULL_UUID_LE;
+ char *fru_text = "";
sev = ghes_severity(estatus->error_severity);
apei_estatus_for_each_section(estatus, gdata) {
sec_sev = ghes_severity(gdata->error_severity);
- if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
- CPER_SEC_PLATFORM_MEM)) {
+ sec_type = *(uuid_le *)gdata->section_type;
+
+ if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
+ fru_id = (uuid_le *)gdata->fru_id;
+
+ if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
+ fru_text = gdata->fru_text;
+
+ if (!uuid_le_cmp(sec_type, CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
ghes_edac_report_mem_error(ghes, sev, mem_err);
@@ -474,8 +487,7 @@ static void ghes_do_proc(struct ghes *ghes,
ghes_handle_memory_failure(gdata, sev);
}
#ifdef CONFIG_ACPI_APEI_PCIEAER
- else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
- CPER_SEC_PCIE)) {
+ else if (!uuid_le_cmp(sec_type, CPER_SEC_PCIE)) {
struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
if (sev == GHES_SEV_RECOVERABLE &&
@@ -506,6 +518,13 @@ static void ghes_do_proc(struct ghes *ghes,
}
#endif
+ else {
+ void *err = acpi_hest_get_payload(gdata);
+
+ log_non_standard_event(&sec_type, fru_id, fru_text,
+ sec_sev, err,
+ gdata->error_data_length);
+ }
}
}
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index 94f8038..e87fd9e 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -7,11 +7,19 @@
#include <linux/init.h>
#include <linux/ras.h>
+#include <linux/uuid.h>
#define CREATE_TRACE_POINTS
#define TRACE_INCLUDE_PATH ../../include/ras
#include <ras/ras_event.h>
+void log_non_standard_event(const uuid_le *sec_type, const uuid_le *fru_id,
+ const char *fru_text, const u8 sev, const u8 *err,
+ const u32 len)
+{
+ trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len);
+}
+
static int __init ras_init(void)
{
int rc = 0;
@@ -27,7 +35,7 @@ static int __init ras_init(void)
EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
-
+EXPORT_TRACEPOINT_SYMBOL_GPL(non_standard_event);
int __init parse_ras_param(char *str)
{
diff --git a/include/linux/ras.h b/include/linux/ras.h
index ffb1471..a7f3ed3 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -2,6 +2,7 @@
#define __RAS_H__
#include <asm/errno.h>
+#include <linux/uuid.h>
#ifdef CONFIG_DEBUG_FS
int ras_userspace_consumers(void);
@@ -22,4 +23,15 @@ static inline void __init cec_init(void) { }
static inline int cec_add_elem(u64 pfn) { return -ENODEV; }
#endif
+#ifdef CONFIG_RAS
+void log_non_standard_event(const uuid_le *sec_type,
+ const uuid_le *fru_id, const char *fru_text,
+ const u8 sev, const u8 *err, const u32 len);
+#else
+static void log_non_standard_event(const uuid_le *sec_type,
+ const uuid_le *fru_id, const char *fru_text,
+ const u8 sev, const u8 *err,
+ const u32 len) { return; }
+#endif
+
#endif /* __RAS_H__ */
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 1791a12..4f79ba9 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -162,6 +162,51 @@
);
/*
+ * Non-Standard Section Report
+ *
+ * This event is generated when hardware detected a hardware
+ * error event, which may be of non-standard section as defined
+ * in UEFI spec appendix "Common Platform Error Record", or may
+ * be of sections for which TRACE_EVENT is not defined.
+ *
+ */
+TRACE_EVENT(non_standard_event,
+
+ TP_PROTO(const uuid_le *sec_type,
+ const uuid_le *fru_id,
+ const char *fru_text,
+ const u8 sev,
+ const u8 *err,
+ const u32 len),
+
+ TP_ARGS(sec_type, fru_id, fru_text, sev, err, len),
+
+ TP_STRUCT__entry(
+ __array(char, sec_type, UUID_SIZE)
+ __array(char, fru_id, UUID_SIZE)
+ __string(fru_text, fru_text)
+ __field(u8, sev)
+ __field(u32, len)
+ __dynamic_array(u8, buf, len)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->sec_type, sec_type, UUID_SIZE);
+ memcpy(__entry->fru_id, fru_id, UUID_SIZE);
+ __assign_str(fru_text, fru_text);
+ __entry->sev = sev;
+ __entry->len = len;
+ memcpy(__get_dynamic_array(buf), err, len);
+ ),
+
+ TP_printk("severity: %d; sec type:%pU; FRU: %pU %s; data len:%d; raw data:%s",
+ __entry->sev, __entry->sec_type,
+ __entry->fru_id, __get_str(fru_text),
+ __entry->len,
+ __print_hex(__get_dynamic_array(buf), __entry->len))
+);
+
+/*
* PCIe AER Trace event
*
* These events are generated when hardware detects a corrected or
diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h
index 3738e5f..c477464 100644
--- a/include/uapi/linux/uuid.h
+++ b/include/uapi/linux/uuid.h
@@ -20,12 +20,14 @@
#include <linux/types.h>
#include <linux/string.h>
+#define UUID_SIZE 16
+
typedef struct {
- __u8 b[16];
+ __u8 b[UUID_SIZE];
} uuid_le;
typedef struct {
- __u8 b[16];
+ __u8 b[UUID_SIZE];
} uuid_be;
#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH V17 10/11] trace, ras: add ARM processor error trace event
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
` (8 preceding siblings ...)
2017-05-19 20:32 ` [PATCH V17 09/11] ras: acpi / apei: generate trace event for " Tyler Baicar
@ 2017-05-19 20:32 ` Tyler Baicar
2017-05-19 20:32 ` [PATCH V17 11/11] arm/arm64: KVM: add guest SEA support Tyler Baicar
` (2 subsequent siblings)
12 siblings, 0 replies; 24+ messages in thread
From: Tyler Baicar @ 2017-05-19 20:32 UTC (permalink / raw)
To: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, will.deacon, rjw, lenb, matt, robert.moore,
lv.zheng, nkaje, zjzhang, mark.rutland, james.morse, akpm,
eun.taik.lee, sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose,
punit.agrawal, astone, harba, hanjun.gu
Cc: Tyler Baicar
Currently there are trace events for the various RAS
errors with the exception of ARM processor type errors.
Add a new trace event for such errors so that the user
will know when they occur. These trace events are
consistent with the ARM processor error section type
defined in UEFI 2.6 spec section N.2.4.4.
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
---
drivers/acpi/apei/ghes.c | 6 +++++-
drivers/firmware/efi/cper.c | 1 +
drivers/ras/ras.c | 6 ++++++
include/linux/ras.h | 3 +++
include/ras/ras_event.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 60 insertions(+), 1 deletion(-)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 16adbc8..a0ab5f3 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -518,7 +518,11 @@ static void ghes_do_proc(struct ghes *ghes,
}
#endif
- else {
+ else if (!uuid_le_cmp(sec_type, CPER_SEC_PROC_ARM)) {
+ struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
+
+ log_arm_hw_error(err);
+ } else {
void *err = acpi_hest_get_payload(gdata);
log_non_standard_event(&sec_type, fru_id, fru_text,
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index d5a5855..48a8f69 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -35,6 +35,7 @@
#include <linux/printk.h>
#include <linux/bcd.h>
#include <acpi/ghes.h>
+#include <ras/ras_event.h>
#define INDENT_SP " "
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index e87fd9e..39701a5 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -20,6 +20,11 @@ void log_non_standard_event(const uuid_le *sec_type, const uuid_le *fru_id,
trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len);
}
+void log_arm_hw_error(struct cper_sec_proc_arm *err)
+{
+ trace_arm_event(err);
+}
+
static int __init ras_init(void)
{
int rc = 0;
@@ -36,6 +41,7 @@ static int __init ras_init(void)
#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
EXPORT_TRACEPOINT_SYMBOL_GPL(non_standard_event);
+EXPORT_TRACEPOINT_SYMBOL_GPL(arm_event);
int __init parse_ras_param(char *str)
{
diff --git a/include/linux/ras.h b/include/linux/ras.h
index a7f3ed3..7a14658 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -3,6 +3,7 @@
#include <asm/errno.h>
#include <linux/uuid.h>
+#include <linux/cper.h>
#ifdef CONFIG_DEBUG_FS
int ras_userspace_consumers(void);
@@ -27,11 +28,13 @@ static inline void __init cec_init(void) { }
void log_non_standard_event(const uuid_le *sec_type,
const uuid_le *fru_id, const char *fru_text,
const u8 sev, const u8 *err, const u32 len);
+void log_arm_hw_error(struct cper_sec_proc_arm *err);
#else
static void log_non_standard_event(const uuid_le *sec_type,
const uuid_le *fru_id, const char *fru_text,
const u8 sev, const u8 *err,
const u32 len) { return; }
+static void log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
#endif
#endif /* __RAS_H__ */
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 4f79ba9..429f46f 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -162,6 +162,51 @@
);
/*
+ * ARM Processor Events Report
+ *
+ * This event is generated when hardware detects an ARM processor error
+ * has occurred. UEFI 2.6 spec section N.2.4.4.
+ */
+TRACE_EVENT(arm_event,
+
+ TP_PROTO(const struct cper_sec_proc_arm *proc),
+
+ TP_ARGS(proc),
+
+ TP_STRUCT__entry(
+ __field(u64, mpidr)
+ __field(u64, midr)
+ __field(u32, running_state)
+ __field(u32, psci_state)
+ __field(u8, affinity)
+ ),
+
+ TP_fast_assign(
+ if (proc->validation_bits & CPER_ARM_VALID_AFFINITY_LEVEL)
+ __entry->affinity = proc->affinity_level;
+ else
+ __entry->affinity = ~0;
+ if (proc->validation_bits & CPER_ARM_VALID_MPIDR)
+ __entry->mpidr = proc->mpidr;
+ else
+ __entry->mpidr = 0ULL;
+ __entry->midr = proc->midr;
+ if (proc->validation_bits & CPER_ARM_VALID_RUNNING_STATE) {
+ __entry->running_state = proc->running_state;
+ __entry->psci_state = proc->psci_state;
+ } else {
+ __entry->running_state = ~0;
+ __entry->psci_state = ~0;
+ }
+ ),
+
+ TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
+ "running state: %d; PSCI state: %d",
+ __entry->affinity, __entry->mpidr, __entry->midr,
+ __entry->running_state, __entry->psci_state)
+);
+
+/*
* Non-Standard Section Report
*
* This event is generated when hardware detected a hardware
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH V17 11/11] arm/arm64: KVM: add guest SEA support
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
` (9 preceding siblings ...)
2017-05-19 20:32 ` [PATCH V17 10/11] trace, ras: add ARM processor error trace event Tyler Baicar
@ 2017-05-19 20:32 ` Tyler Baicar
2017-05-23 9:30 ` [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Borislav Petkov
2017-06-07 11:50 ` Will Deacon
12 siblings, 0 replies; 24+ messages in thread
From: Tyler Baicar @ 2017-05-19 20:32 UTC (permalink / raw)
To: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, will.deacon, rjw, lenb, matt, robert.moore,
lv.zheng, nkaje, zjzhang, mark.rutland, james.morse, akpm,
eun.taik.lee, sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose,
punit.agrawal, astone, harba, hanjun.gu
Cc: Tyler Baicar
Currently external aborts are unsupported by the guest abort
handling. Add handling for SEAs so that the host kernel reports
SEAs which occur in the guest kernel.
When an SEA occurs in the guest kernel, the guest exits and is
routed to kvm_handle_guest_abort(). Prior to this patch, a print
message of an unsupported FSC would be printed and nothing else
would happen. With this patch, the code gets routed to the APEI
handling of SEAs in the host kernel to report the SEA information.
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Acked-by: Christoffer Dall <cdall@linaro.org>
---
arch/arm/include/asm/kvm_arm.h | 10 ++++++++++
arch/arm/include/asm/system_misc.h | 5 +++++
arch/arm64/include/asm/kvm_arm.h | 10 ++++++++++
arch/arm64/include/asm/system_misc.h | 2 ++
arch/arm64/mm/fault.c | 22 ++++++++++++++++++++--
drivers/acpi/apei/ghes.c | 17 +++++++++++------
include/acpi/ghes.h | 2 +-
virt/kvm/arm/mmu.c | 36 +++++++++++++++++++++++++++++++++---
8 files changed, 92 insertions(+), 12 deletions(-)
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index a3f0b3d..ebf020b 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -187,6 +187,16 @@
#define FSC_FAULT (0x04)
#define FSC_ACCESS (0x08)
#define FSC_PERM (0x0c)
+#define FSC_SEA (0x10)
+#define FSC_SEA_TTW0 (0x14)
+#define FSC_SEA_TTW1 (0x15)
+#define FSC_SEA_TTW2 (0x16)
+#define FSC_SEA_TTW3 (0x17)
+#define FSC_SECC (0x18)
+#define FSC_SECC_TTW0 (0x1c)
+#define FSC_SECC_TTW1 (0x1d)
+#define FSC_SECC_TTW2 (0x1e)
+#define FSC_SECC_TTW3 (0x1f)
/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
#define HPFAR_MASK (~0xf)
diff --git a/arch/arm/include/asm/system_misc.h b/arch/arm/include/asm/system_misc.h
index a3d61ad..8c4a89f 100644
--- a/arch/arm/include/asm/system_misc.h
+++ b/arch/arm/include/asm/system_misc.h
@@ -22,6 +22,11 @@
extern unsigned int user_debug;
+static inline int handle_guest_sea(phys_addr_t addr, unsigned int esr)
+{
+ return -1;
+}
+
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_ARM_SYSTEM_MISC_H */
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 6e99978..61d694c 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -204,6 +204,16 @@
#define FSC_FAULT ESR_ELx_FSC_FAULT
#define FSC_ACCESS ESR_ELx_FSC_ACCESS
#define FSC_PERM ESR_ELx_FSC_PERM
+#define FSC_SEA ESR_ELx_FSC_EXTABT
+#define FSC_SEA_TTW0 (0x14)
+#define FSC_SEA_TTW1 (0x15)
+#define FSC_SEA_TTW2 (0x16)
+#define FSC_SEA_TTW3 (0x17)
+#define FSC_SECC (0x18)
+#define FSC_SECC_TTW0 (0x1c)
+#define FSC_SECC_TTW1 (0x1d)
+#define FSC_SECC_TTW2 (0x1e)
+#define FSC_SECC_TTW3 (0x1f)
/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
#define HPFAR_MASK (~UL(0xf))
diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h
index bc81243..95aa442 100644
--- a/arch/arm64/include/asm/system_misc.h
+++ b/arch/arm64/include/asm/system_misc.h
@@ -56,6 +56,8 @@ void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int,
__show_ratelimited; \
})
+int handle_guest_sea(phys_addr_t addr, unsigned int esr);
+
#endif /* __ASSEMBLY__ */
#endif /* __ASM_SYSTEM_MISC_H */
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 9b4e26f..1ce62cc 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -532,6 +532,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{
struct siginfo info;
const struct fault_info *inf;
+ int ret = 0;
inf = esr_to_fault_info(esr);
pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n",
@@ -546,7 +547,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
if (interrupts_enabled(regs))
nmi_enter();
- ghes_notify_sea();
+ ret = ghes_notify_sea();
if (interrupts_enabled(regs))
nmi_exit();
@@ -561,7 +562,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
info.si_addr = (void __user *)addr;
arm64_notify_die("", regs, &info, esr);
- return 0;
+ return ret;
}
static const struct fault_info fault_info[] = {
@@ -632,6 +633,23 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
};
/*
+ * Handle Synchronous External Aborts that occur in a guest kernel.
+ *
+ * The return value will be zero if the SEA was successfully handled
+ * and non-zero if there was an error processing the error or there was
+ * no error to process.
+ */
+int handle_guest_sea(phys_addr_t addr, unsigned int esr)
+{
+ int ret = -ENOENT;
+
+ if (IS_ENABLED(CONFIG_ACPI_APEI_SEA))
+ ret = ghes_notify_sea();
+
+ return ret;
+}
+
+/*
* Dispatch a data abort to the relevant handler.
*/
asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index a0ab5f3..7084c97 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -817,17 +817,22 @@ static int ghes_notify_sci(struct notifier_block *this,
#ifdef CONFIG_ACPI_APEI_SEA
static LIST_HEAD(ghes_sea);
-void ghes_notify_sea(void)
+/*
+ * Return 0 only if one of the SEA error sources successfully reported an error
+ * record sent from the firmware.
+ */
+int ghes_notify_sea(void)
{
struct ghes *ghes;
+ int ret = -ENOENT;
- /*
- * synchronize_rcu() will wait for nmi_exit(), so no need to
- * rcu_read_lock().
- */
+ rcu_read_lock();
list_for_each_entry_rcu(ghes, &ghes_sea, list) {
- ghes_proc(ghes);
+ if (!ghes_proc(ghes))
+ ret = 0;
}
+ rcu_read_unlock();
+ return ret;
}
static void ghes_sea_add(struct ghes *ghes)
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 95305ae..9f26e01 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -113,6 +113,6 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata)
return (void *)(gdata) + acpi_hest_get_record_size(gdata);
}
-void ghes_notify_sea(void);
+int ghes_notify_sea(void);
#endif /* GHES_H */
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 313ee64..521e737 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -29,6 +29,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>
+#include <asm/system_misc.h>
#include "trace.h"
@@ -1418,6 +1419,25 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
kvm_set_pfn_accessed(pfn);
}
+static bool is_abort_sea(unsigned long fault_status)
+{
+ switch (fault_status) {
+ case FSC_SEA:
+ case FSC_SEA_TTW0:
+ case FSC_SEA_TTW1:
+ case FSC_SEA_TTW2:
+ case FSC_SEA_TTW3:
+ case FSC_SECC:
+ case FSC_SECC_TTW0:
+ case FSC_SECC_TTW1:
+ case FSC_SECC_TTW2:
+ case FSC_SECC_TTW3:
+ return true;
+ default:
+ return false;
+ }
+}
+
/**
* kvm_handle_guest_abort - handles all 2nd stage aborts
* @vcpu: the VCPU pointer
@@ -1440,19 +1460,29 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
gfn_t gfn;
int ret, idx;
+ fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
+
+ fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+
+ /*
+ * The host kernel will handle the synchronous external abort. There
+ * is no need to pass the error into the guest.
+ */
+ if (is_abort_sea(fault_status)) {
+ if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
+ return 1;
+ }
+
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
if (unlikely(!is_iabt && kvm_vcpu_dabt_isextabt(vcpu))) {
kvm_inject_vabt(vcpu);
return 1;
}
- fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
-
trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
kvm_vcpu_get_hfar(vcpu), fault_ipa);
/* Check the stage-2 fault is trans. fault or write fault */
- fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
fault_status != FSC_ACCESS) {
kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
` (10 preceding siblings ...)
2017-05-19 20:32 ` [PATCH V17 11/11] arm/arm64: KVM: add guest SEA support Tyler Baicar
@ 2017-05-23 9:30 ` Borislav Petkov
2017-06-07 11:50 ` Will Deacon
12 siblings, 0 replies; 24+ messages in thread
From: Borislav Petkov @ 2017-05-23 9:30 UTC (permalink / raw)
To: Tyler Baicar
Cc: linux-efi, kvm, matt, catalin.marinas, will.deacon, robert.moore,
paul.gortmaker, lv.zheng, kvmarm, fu.wei, rafael, zjzhang, linux,
gengdongjiu, linux-acpi, eun.taik.lee, shijie.huang, labbott,
lenb, harba, john.garry, marc.zyngier, punit.agrawal, rostedt,
nkaje, sandeepa.s.prabhu, linux-arm-kernel, tony.luck, rjw,
rruigrok, linux-kernel, astone, hanjun.guo, joe, pbonzini, akpm,
bristot, shiju.jose
On Fri, May 19, 2017 at 02:32:02PM -0600, Tyler Baicar wrote:
> When a memory error, CPU error, PCIe error, or other type of hardware error
> that's covered by RAS occurs, firmware should populate the shared GHES memory
> location with the proper GHES structures to notify the OS of the error.
...
> V17:Rebase on tip
> Change trace event helper function names
> Remove unneeded prefixes from commit text
The whole series:
Reviewed-by: Borislav Petkov <bp@suse.de>
Thanks!
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
--
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64
2017-05-19 20:32 [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Tyler Baicar
` (11 preceding siblings ...)
2017-05-23 9:30 ` [PATCH V17 00/11] Add UEFI 2.6 and ACPI 6.1 updates for RAS on ARM64 Borislav Petkov
@ 2017-06-07 11:50 ` Will Deacon
[not found] ` <20170607115012.GZ30263-5wv7dgnIgG8@public.gmane.org>
12 siblings, 1 reply; 24+ messages in thread
From: Will Deacon @ 2017-06-07 11:50 UTC (permalink / raw)
To: Tyler Baicar
Cc: christoffer.dall, marc.zyngier, pbonzini, rkrcmar, linux,
catalin.marinas, rjw, lenb, matt, robert.moore, lv.zheng, nkaje,
zjzhang, mark.rutland, james.morse, akpm, eun.taik.lee,
sandeepa.s.prabhu, labbott, shijie.huang, rruigrok,
paul.gortmaker, tn, fu.wei, rostedt, bristot, linux-arm-kernel,
kvmarm, kvm, linux-kernel, linux-acpi, linux-efi, Suzuki.Poulose
On Fri, May 19, 2017 at 02:32:02PM -0600, Tyler Baicar wrote:
> When a memory error, CPU error, PCIe error, or other type of hardware error
> that's covered by RAS occurs, firmware should populate the shared GHES memory
> location with the proper GHES structures to notify the OS of the error.
> For example, platforms that implement firmware first handling may implement
> separate GHES sources for corrected errors and uncorrected errors. If the
> error is an uncorrectable error, then the firmware will notify the OS
> immediately since the error needs to be handled ASAP. The OS will then be able
> to take the appropriate action needed such as offlining a page. If the error
> is a corrected error, then the firmware will not interrupt the OS immediately.
> Instead, the OS will see and report the error the next time it's GHES timer
> expires. The kernel will first parse the GHES structures and report the errors
> through the kernel logs and then notify the user space through RAS trace
> events. This allows user space applications such as RAS Daemon to see the
> errors and report them however the user desires. This patchset extends the
> kernel functionality for RAS errors based on updates in the UEFI 2.6 and
> ACPI 6.1 specifications.
Thanks, I've pushed this out as:
git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/ras-apei
which I'll merge into for-next/core (and therefore linux-next) either the
end of this week or the beginning of next week. Please take a look if you
get a chance.
Will
^ permalink raw reply [flat|nested] 24+ messages in thread