linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH v4 1/3] EDAC, GHES: Remove unused argument to ghes_edac_report_mem_error
       [not found] <20180430212836.7807-1-mr.nuke.me@gmail.com>
@ 2018-04-30 21:33 ` Alexandru Gagniuc
  2018-04-30 21:33   ` [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity() Alexandru Gagniuc
                     ` (2 more replies)
  2018-05-14 14:59 ` [PATCH v5 0/2] acpi: apei: Improve PCIe error handling with FFS Alexandru Gagniuc
  1 sibling, 3 replies; 23+ messages in thread
From: Alexandru Gagniuc @ 2018-04-30 21:33 UTC (permalink / raw)
  To: bp
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Alexandru Gagniuc,
	Rafael J. Wysocki, Len Brown, Tony Luck, Mauro Carvalho Chehab,
	Robert Moore, Erik Schmauss, Tyler Baicar, Will Deacon,
	James Morse, Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng,
	linux-acpi, linux-kernel, linux-edac, devel

The use of the 'ghes' argument was removed in a previous commit, but
function signature was not updated to reflect this.

Fixes: 0fe5f281f749 ("EDAC, ghes: Model a single, logical memory controller")
Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
---
 drivers/acpi/apei/ghes.c | 2 +-
 drivers/edac/ghes_edac.c | 3 +--
 include/acpi/ghes.h      | 5 ++---
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 1efefe919555..f9b53a6f55f3 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -481,7 +481,7 @@ static void ghes_do_proc(struct ghes *ghes,
 		if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
 			struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
-			ghes_edac_report_mem_error(ghes, sev, mem_err);
+			ghes_edac_report_mem_error(sev, mem_err);
 
 			arch_apei_report_mem_error(sev, mem_err);
 			ghes_handle_memory_failure(gdata, sev);
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 68b6ee18bea6..32bb8c5f47dc 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -172,8 +172,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 	}
 }
 
-void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
-				struct cper_sec_mem_err *mem_err)
+void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 {
 	enum hw_event_mc_err_type type;
 	struct edac_raw_error_desc *e;
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 8feb0c866ee0..e096a4e7f611 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -55,15 +55,14 @@ enum {
 /* From drivers/edac/ghes_edac.c */
 
 #ifdef CONFIG_EDAC_GHES
-void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
-				struct cper_sec_mem_err *mem_err);
+void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err);
 
 int ghes_edac_register(struct ghes *ghes, struct device *dev);
 
 void ghes_edac_unregister(struct ghes *ghes);
 
 #else
-static inline void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
+static inline void ghes_edac_report_mem_error(int sev,
 				       struct cper_sec_mem_err *mem_err)
 {
 }
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity()
  2018-04-30 21:33 ` [RFC PATCH v4 1/3] EDAC, GHES: Remove unused argument to ghes_edac_report_mem_error Alexandru Gagniuc
@ 2018-04-30 21:33   ` Alexandru Gagniuc
  2018-05-04 11:56     ` Shiju Jose
  2018-05-11 15:39     ` Borislav Petkov
  2018-04-30 21:33   ` [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES Alexandru Gagniuc
  2018-05-12  9:00   ` [RFC PATCH v4 1/3] EDAC, GHES: Remove unused argument to ghes_edac_report_mem_error Borislav Petkov
  2 siblings, 2 replies; 23+ messages in thread
From: Alexandru Gagniuc @ 2018-04-30 21:33 UTC (permalink / raw)
  To: bp
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Alexandru Gagniuc,
	Rafael J. Wysocki, Len Brown, Tony Luck, Mauro Carvalho Chehab,
	Robert Moore, Erik Schmauss, Tyler Baicar, Will Deacon,
	James Morse, Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng,
	linux-acpi, linux-kernel, linux-edac, devel

ghes_severity() is a misnomer in this case, as it implies the severity
of the entire GHES structure. Instead, it maps one CPER value to a
monotonically increasing number. ghes_cper_severity() is clearer.

Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
---
 drivers/acpi/apei/ghes.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index f9b53a6f55f3..c9f1971333c1 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -271,7 +271,7 @@ static void ghes_fini(struct ghes *ghes)
 		unmap_gen_v2(ghes);
 }
 
-static inline int ghes_severity(int severity)
+static inline int ghes_cper_severity(int severity)
 {
 	switch (severity) {
 	case CPER_SEV_INFORMATIONAL:
@@ -388,7 +388,7 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
 #ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
 	unsigned long pfn;
 	int flags = -1;
-	int sec_sev = ghes_severity(gdata->error_severity);
+	int sec_sev = ghes_cper_severity(gdata->error_severity);
 	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
 	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
@@ -468,10 +468,10 @@ static void ghes_do_proc(struct ghes *ghes,
 	guid_t *fru_id = &NULL_UUID_LE;
 	char *fru_text = "";
 
-	sev = ghes_severity(estatus->error_severity);
+	sev = ghes_cper_severity(estatus->error_severity);
 	apei_estatus_for_each_section(estatus, gdata) {
 		sec_type = (guid_t *)gdata->section_type;
-		sec_sev = ghes_severity(gdata->error_severity);
+		sec_sev = ghes_cper_severity(gdata->error_severity);
 		if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
 			fru_id = (guid_t *)gdata->fru_id;
 
@@ -512,7 +512,7 @@ static void __ghes_print_estatus(const char *pfx,
 	char pfx_seq[64];
 
 	if (pfx == NULL) {
-		if (ghes_severity(estatus->error_severity) <=
+		if (ghes_cper_severity(estatus->error_severity) <=
 		    GHES_SEV_CORRECTED)
 			pfx = KERN_WARNING;
 		else
@@ -534,7 +534,7 @@ static int ghes_print_estatus(const char *pfx,
 	static DEFINE_RATELIMIT_STATE(ratelimit_uncorrected, 5*HZ, 2);
 	struct ratelimit_state *ratelimit;
 
-	if (ghes_severity(estatus->error_severity) <= GHES_SEV_CORRECTED)
+	if (ghes_cper_severity(estatus->error_severity) <= GHES_SEV_CORRECTED)
 		ratelimit = &ratelimit_corrected;
 	else
 		ratelimit = &ratelimit_uncorrected;
@@ -705,9 +705,8 @@ static int ghes_proc(struct ghes *ghes)
 	if (rc)
 		goto out;
 
-	if (ghes_severity(ghes->estatus->error_severity) >= GHES_SEV_PANIC) {
+	if (ghes_cper_severity(ghes->estatus->error_severity) >= GHES_SEV_PANIC)
 		__ghes_panic(ghes);
-	}
 
 	if (!ghes_estatus_cached(ghes->estatus)) {
 		if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
@@ -945,7 +944,7 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
 			ret = NMI_HANDLED;
 		}
 
-		sev = ghes_severity(ghes->estatus->error_severity);
+		sev = ghes_cper_severity(ghes->estatus->error_severity);
 		if (sev >= GHES_SEV_PANIC) {
 			oops_begin();
 			ghes_print_queued_estatus();
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES
  2018-04-30 21:33 ` [RFC PATCH v4 1/3] EDAC, GHES: Remove unused argument to ghes_edac_report_mem_error Alexandru Gagniuc
  2018-04-30 21:33   ` [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity() Alexandru Gagniuc
@ 2018-04-30 21:33   ` Alexandru Gagniuc
  2018-05-11 15:40     ` Borislav Petkov
  2018-05-12  9:00   ` [RFC PATCH v4 1/3] EDAC, GHES: Remove unused argument to ghes_edac_report_mem_error Borislav Petkov
  2 siblings, 1 reply; 23+ messages in thread
From: Alexandru Gagniuc @ 2018-04-30 21:33 UTC (permalink / raw)
  To: bp
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Alexandru Gagniuc,
	Rafael J. Wysocki, Len Brown, Tony Luck, Mauro Carvalho Chehab,
	Robert Moore, Erik Schmauss, Tyler Baicar, Will Deacon,
	James Morse, Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng,
	linux-acpi, linux-kernel, linux-edac, devel

The policy was to panic() when GHES said that an error is "Fatal".
This logic is wrong for several reasons, as it doesn't take into
account what caused the error.

PCIe fatal errors indicate that the link to a device is either
unstable or unusable. They don't indicate that the machine is on fire,
and they are not severe enough that we need to panic(). Instead of
relying on crackmonkey firmware, evaluate the error severity based on
what caused the error (GHES subsections).

Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
---
 drivers/acpi/apei/ghes.c | 45 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index c9f1971333c1..49318fba409c 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -425,8 +425,7 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
  * GHES_SEV_RECOVERABLE -> AER_NONFATAL
  * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
  *     These both need to be reported and recovered from by the AER driver.
- * GHES_SEV_PANIC does not make it to this handling since the kernel must
- *     panic.
+ * GHES_SEV_PANIC -> AER_FATAL
  */
 static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 {
@@ -459,6 +458,46 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 #endif
 }
 
+/* PCIe errors should not cause a panic. */
+static int ghes_sec_pcie_severity(struct acpi_hest_generic_data *gdata)
+{
+	struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
+
+	if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
+	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO &&
+	    IS_ENABLED(CONFIG_ACPI_APEI_PCIEAER))
+		return CPER_SEV_RECOVERABLE;
+
+	return ghes_cper_severity(gdata->error_severity);
+}
+/*
+ * The severity field in the status block is oftentimes more severe than it
+ * needs to be. This makes it an unreliable metric for the severity. A more
+ * reliable way is to look at each subsection and correlate it with how well
+ * the error can be handled.
+ *   - SEC_PCIE: All PCIe errors can be handled by AER.
+ */
+static int ghes_severity(struct ghes *ghes)
+{
+	int worst_sev, sec_sev;
+	struct acpi_hest_generic_data *gdata;
+	const guid_t *section_type;
+	const struct acpi_hest_generic_status *estatus = ghes->estatus;
+
+	worst_sev = GHES_SEV_NO;
+	apei_estatus_for_each_section(estatus, gdata) {
+		section_type = (guid_t *)gdata->section_type;
+		sec_sev = ghes_cper_severity(gdata->error_severity);
+
+		if (guid_equal(section_type, &CPER_SEC_PCIE))
+			sec_sev = ghes_sec_pcie_severity(gdata);
+
+		worst_sev = max(worst_sev, sec_sev);
+	}
+
+	return worst_sev;
+}
+
 static void ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -944,7 +983,7 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
 			ret = NMI_HANDLED;
 		}
 
-		sev = ghes_cper_severity(ghes->estatus->error_severity);
+		sev = ghes_severity(ghes);
 		if (sev >= GHES_SEV_PANIC) {
 			oops_begin();
 			ghes_print_queued_estatus();
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* RE: [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity()
  2018-04-30 21:33   ` [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity() Alexandru Gagniuc
@ 2018-05-04 11:56     ` Shiju Jose
  2018-05-04 23:33       ` Alex G.
  2018-05-11 15:39     ` Borislav Petkov
  1 sibling, 1 reply; 23+ messages in thread
From: Shiju Jose @ 2018-05-04 11:56 UTC (permalink / raw)
  To: Alexandru Gagniuc, bp
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Jonathan (Zhixiong) Zhang, gengdongjiu, linux-acpi, linux-kernel,
	linux-edac, devel

Hi Alex,

> -----Original Message-----
> From: Alexandru Gagniuc [mailto:mr.nuke.me@gmail.com]
> Sent: 30 April 2018 22:34
> To: bp@alien8.de
> Cc: alex_gagniuc@dellteam.com; austin_bolen@dell.com;
> shyam_iyer@dell.com; Alexandru Gagniuc; Rafael J. Wysocki; Len Brown;
> Tony Luck; Mauro Carvalho Chehab; Robert Moore; Erik Schmauss; Tyler
> Baicar; Will Deacon; James Morse; Shiju Jose; Jonathan (Zhixiong) Zhang;
> gengdongjiu; linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org;
> linux-edac@vger.kernel.org; devel@acpica.org
> Subject: [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to
> ghes_cper_severity()
> 
> ghes_severity() is a misnomer in this case, as it implies the severity
> of the entire GHES structure. Instead, it maps one CPER value to a
> monotonically increasing number. ghes_cper_severity() is clearer.
> 
> Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
> ---
>  drivers/acpi/apei/ghes.c | 17 ++++++++---------
>  1 file changed, 8 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index f9b53a6f55f3..c9f1971333c1 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -271,7 +271,7 @@ static void ghes_fini(struct ghes *ghes)
>  		unmap_gen_v2(ghes);
>  }
> 
> -static inline int ghes_severity(int severity)
> +static inline int ghes_cper_severity(int severity)

[...]
>  	else
>  		ratelimit = &ratelimit_uncorrected;
> @@ -705,9 +705,8 @@ static int ghes_proc(struct ghes *ghes)
>  	if (rc)
>  		goto out;
> 
> -	if (ghes_severity(ghes->estatus->error_severity) >=
> GHES_SEV_PANIC) {
> +	if (ghes_cper_severity(ghes->estatus->error_severity) >=
> GHES_SEV_PANIC)
>  		__ghes_panic(ghes);

PCIe AER fatal errors result panic here.
I think ghes_cper_severity to be replaced with ghes_severity in the ghes_proc function as well in the patch 
"acpi: apei: Do not panic() on PCIe errors reported through GHES"?
> -	}
> 
[...]
> 2.14.3

Thanks,
Shiju

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity()
  2018-05-04 11:56     ` Shiju Jose
@ 2018-05-04 23:33       ` Alex G.
  0 siblings, 0 replies; 23+ messages in thread
From: Alex G. @ 2018-05-04 23:33 UTC (permalink / raw)
  To: Shiju Jose, bp
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Jonathan (Zhixiong) Zhang, gengdongjiu, linux-acpi, linux-kernel,
	linux-edac, devel



On 05/04/2018 06:56 AM, Shiju Jose wrote:
> Hi Alex,

Hi

>> -----Original Message-----
>> From: Alexandru Gagniuc [mailto:mr.nuke.me@gmail.com]

[snip]

>> -static inline int ghes_severity(int severity)
>> +static inline int ghes_cper_severity(int severity)
> 
> [...]
>>   	else
>>   		ratelimit = &ratelimit_uncorrected;
>> @@ -705,9 +705,8 @@ static int ghes_proc(struct ghes *ghes)
>>   	if (rc)
>>   		goto out;
>>
>> -	if (ghes_severity(ghes->estatus->error_severity) >=
>> GHES_SEV_PANIC) {
>> +	if (ghes_cper_severity(ghes->estatus->error_severity) >=
>> GHES_SEV_PANIC)
>>   		__ghes_panic(ghes);
> 
> PCIe AER fatal errors result panic here.
> I think ghes_cper_severity to be replaced with ghes_severity in the ghes_proc function as well in the patch
> "acpi: apei: Do not panic() on PCIe errors reported through GHES"?

Hmm.
ghes_proc calls ghes_do_proc, which is not irq safe. So the entire 
concern we had in v1 about deferring to a less restrictive context is 
moot in this case.

ghes_proc is related, but a little beyond the scope of this series. I'd 
love to fix all cases, but I'd prefer someone who has specific interests 
take ownership of changing ghes_proc.

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity()
  2018-04-30 21:33   ` [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity() Alexandru Gagniuc
  2018-05-04 11:56     ` Shiju Jose
@ 2018-05-11 15:39     ` Borislav Petkov
  2018-05-11 15:45       ` Alex G.
  1 sibling, 1 reply; 23+ messages in thread
From: Borislav Petkov @ 2018-05-11 15:39 UTC (permalink / raw)
  To: Alexandru Gagniuc
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel

On Mon, Apr 30, 2018 at 04:33:51PM -0500, Alexandru Gagniuc wrote:
> ghes_severity() is a misnomer in this case, as it implies the severity
> of the entire GHES structure. Instead, it maps one CPER value to a
> monotonically increasing number.

... as opposed to CPER severity which is something else or what is this
formulation trying to express?

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES
  2018-04-30 21:33   ` [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES Alexandru Gagniuc
@ 2018-05-11 15:40     ` Borislav Petkov
  2018-05-11 15:54       ` Alex G.
  0 siblings, 1 reply; 23+ messages in thread
From: Borislav Petkov @ 2018-05-11 15:40 UTC (permalink / raw)
  To: Alexandru Gagniuc
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel

On Mon, Apr 30, 2018 at 04:33:52PM -0500, Alexandru Gagniuc wrote:
> The policy was to panic() when GHES said that an error is "Fatal".
> This logic is wrong for several reasons, as it doesn't take into
> account what caused the error.
> 
> PCIe fatal errors indicate that the link to a device is either
> unstable or unusable. They don't indicate that the machine is on fire,
> and they are not severe enough that we need to panic(). Instead of
> relying on crackmonkey firmware, evaluate the error severity based on
	     ^^^^^^^^^^^^

Please keep the smartass formulations for the ML only and do not let
them leak into commit messages.

> Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
> ---
>  drivers/acpi/apei/ghes.c | 45 ++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 42 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index c9f1971333c1..49318fba409c 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -425,8 +425,7 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
>   * GHES_SEV_RECOVERABLE -> AER_NONFATAL
>   * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
>   *     These both need to be reported and recovered from by the AER driver.
> - * GHES_SEV_PANIC does not make it to this handling since the kernel must
> - *     panic.
> + * GHES_SEV_PANIC -> AER_FATAL
>   */
>  static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
>  {
> @@ -459,6 +458,46 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
>  #endif
>  }
>  
> +/* PCIe errors should not cause a panic. */
> +static int ghes_sec_pcie_severity(struct acpi_hest_generic_data *gdata)
> +{
> +	struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
> +
> +	if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
> +	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO &&
> +	    IS_ENABLED(CONFIG_ACPI_APEI_PCIEAER))

How is PCIe error severity dependent on whether the AER error reporting
driver is enabled (and possibly not even loaded) on the system?

> +		return CPER_SEV_RECOVERABLE;
> +
> +	return ghes_cper_severity(gdata->error_severity);
> +}
> +/*

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity()
  2018-05-11 15:39     ` Borislav Petkov
@ 2018-05-11 15:45       ` Alex G.
  2018-05-11 15:58         ` Borislav Petkov
  0 siblings, 1 reply; 23+ messages in thread
From: Alex G. @ 2018-05-11 15:45 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel



On 05/11/2018 10:39 AM, Borislav Petkov wrote:
> On Mon, Apr 30, 2018 at 04:33:51PM -0500, Alexandru Gagniuc wrote:
>> ghes_severity() is a misnomer in this case, as it implies the severity
>> of the entire GHES structure. Instead, it maps one CPER value to a
>> monotonically increasing number.
> 
> ... as opposed to CPER severity which is something else or what is this
> formulation trying to express?
> 

CPER madness goes like this:
	0 - Recoverable
	1 - Fatal
	2 - Corrected
	3 - None

As you can see, the numbering was created by crackmonkeys. GHES_* is an
internal enum that goes up in order of severity, as you'd expect.

If you're confused, you're not alone. I've seen several commit messages
that get this terminology wrong.

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES
  2018-05-11 15:40     ` Borislav Petkov
@ 2018-05-11 15:54       ` Alex G.
  2018-05-11 16:02         ` Borislav Petkov
  0 siblings, 1 reply; 23+ messages in thread
From: Alex G. @ 2018-05-11 15:54 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel



On 05/11/2018 10:40 AM, Borislav Petkov wrote:
> On Mon, Apr 30, 2018 at 04:33:52PM -0500, Alexandru Gagniuc wrote:
>> The policy was to panic() when GHES said that an error is "Fatal".
>> This logic is wrong for several reasons, as it doesn't take into
>> account what caused the error.
>>
>> PCIe fatal errors indicate that the link to a device is either
>> unstable or unusable. They don't indicate that the machine is on fire,
>> and they are not severe enough that we need to panic(). Instead of
>> relying on crackmonkey firmware, evaluate the error severity based on
> 	     ^^^^^^^^^^^^
> 
> Please keep the smartass formulations for the ML only and do not let
> them leak into commit messages.

You're right. The monkeys are not crack. Instead, what a lot of
manufacturers do is maintain large monkey farms with electronic
typewriters. Given a sufficiently large farm, they take those results
which compile. Of those results, they pick and ship the one that takes
longest to boot, without the customers complaining.

That being clarified, should I replace "crackmonkey" with "broken" in
the commit message?

(snip)

>> +/* PCIe errors should not cause a panic. */
>> +static int ghes_sec_pcie_severity(struct acpi_hest_generic_data *gdata)
>> +{
>> +	struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
>> +
>> +	if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
>> +	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO &&
>> +	    IS_ENABLED(CONFIG_ACPI_APEI_PCIEAER))
> 
> How is PCIe error severity dependent on whether the AER error reporting
> driver is enabled (and possibly not even loaded) on the system?

Borislav, I sense some confusion. AER is not a "reporting" driver. It
handles the errors. You can't leave these errors unhandled. They
propagate to the root complex and can cause fatal MCEs when not handled.
The window to handle the error is pretty large, so it's not a concern
when you're handling it.

Alex

>> +		return CPER_SEV_RECOVERABLE;
>> +
>> +	return ghes_cper_severity(gdata->error_severity);
>> +}
>> +/*
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity()
  2018-05-11 15:45       ` Alex G.
@ 2018-05-11 15:58         ` Borislav Petkov
  2018-05-11 16:12           ` Alex G.
  0 siblings, 1 reply; 23+ messages in thread
From: Borislav Petkov @ 2018-05-11 15:58 UTC (permalink / raw)
  To: Alex G.
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel

On Fri, May 11, 2018 at 10:45:49AM -0500, Alex G. wrote:
> 
> 
> On 05/11/2018 10:39 AM, Borislav Petkov wrote:
> > On Mon, Apr 30, 2018 at 04:33:51PM -0500, Alexandru Gagniuc wrote:
> >> ghes_severity() is a misnomer in this case, as it implies the severity
> >> of the entire GHES structure. Instead, it maps one CPER value to a
> >> monotonically increasing number.
> > 
> > ... as opposed to CPER severity which is something else or what is this
> > formulation trying to express?
> > 
> 
> CPER madness goes like this:

Let's slow down first. Why is it a "CPER madness"? Maybe this is clear
in your head but I'm not in it.

> 	0 - Recoverable
> 	1 - Fatal
> 	2 - Corrected
> 	3 - None

If you're quoting this:

enum {
        CPER_SEV_RECOVERABLE,
        CPER_SEV_FATAL,
        CPER_SEV_CORRECTED,
        CPER_SEV_INFORMATIONAL,
};

that last 3 is informational.

> As you can see, the numbering was created by crackmonkeys. GHES_* is an
> internal enum that goes up in order of severity, as you'd expect.

So what are you trying to tell me - that those CPER numbers are not
increasing?!

Why does that even matter?

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES
  2018-05-11 15:54       ` Alex G.
@ 2018-05-11 16:02         ` Borislav Petkov
  2018-05-11 16:12           ` Alex G.
  0 siblings, 1 reply; 23+ messages in thread
From: Borislav Petkov @ 2018-05-11 16:02 UTC (permalink / raw)
  To: Alex G.
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel

On Fri, May 11, 2018 at 10:54:09AM -0500, Alex G. wrote:
> That being clarified, should I replace "crackmonkey" with "broken" in
> the commit message?

Keep your opinion *outside* of commit messages - their goal is to
explain *why* the change is being made in strictly technical language so
that when someone looks at git history, someone can know *why*.

> Borislav, I sense some confusion. AER is not a "reporting" driver. It
> handles the errors. You can't leave these errors unhandled. They
> propagate to the root complex and can cause fatal MCEs when not handled.
> The window to handle the error is pretty large, so it's not a concern
> when you're handling it.

I think *you* didn't get it: IS_ENABLED(CONFIG_ACPI_APEI_PCIEAER) is not
enough of a check to confirm that there actually *is* an AER driver to
handle the errors. If you really want to make sure the driver is loaded
and functioning, then you need an explicit registering mechanism or some
other way of checking it really is there and handling errors.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity()
  2018-05-11 15:58         ` Borislav Petkov
@ 2018-05-11 16:12           ` Alex G.
  2018-05-11 16:19             ` Borislav Petkov
  0 siblings, 1 reply; 23+ messages in thread
From: Alex G. @ 2018-05-11 16:12 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel

On 05/11/2018 10:58 AM, Borislav Petkov wrote:
> On Fri, May 11, 2018 at 10:45:49AM -0500, Alex G. wrote:
>>
>>
>> On 05/11/2018 10:39 AM, Borislav Petkov wrote:
>>> On Mon, Apr 30, 2018 at 04:33:51PM -0500, Alexandru Gagniuc wrote:
>>>> ghes_severity() is a misnomer in this case, as it implies the severity
>>>> of the entire GHES structure. Instead, it maps one CPER value to a
>>>> monotonically increasing number.
>>>
>>> ... as opposed to CPER severity which is something else or what is this
>>> formulation trying to express?
>>>
>>
>> CPER madness goes like this:
> 
> Let's slow down first. Why is it a "CPER madness"? Maybe this is clear
> in your head but I'm not in it.
> 
>> 	0 - Recoverable
>> 	1 - Fatal
>> 	2 - Corrected
>> 	3 - None
> 
> If you're quoting this:

I'm quoting ACPI 6.2, Table 18-381 Generic Error Data Entry, though I'm
certain they got that from the efi spec.

> enum {
>         CPER_SEV_RECOVERABLE,
>         CPER_SEV_FATAL,
>         CPER_SEV_CORRECTED,
>         CPER_SEV_INFORMATIONAL,
> };
> 
> that last 3 is informational.
> 
>> As you can see, the numbering was created by crackmonkeys. GHES_* is an
>> internal enum that goes up in order of severity, as you'd expect.
> 
> So what are you trying to tell me - that those CPER numbers are not
> increasing?!
> 
> Why does that even matter?

Because the GHES structure uses CPER values, but all the code is written
to use GHES_SEV_ values. GHES_SEV_ is a made up enum, specifically for
linux.

Sure, the return in ghes_sec_pcie_severity() should say
GHES_SEV_RECOVERABLE, but that is a Freudian slip rather than
intentional typing. Thank you for catching that :)

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES
  2018-05-11 16:02         ` Borislav Petkov
@ 2018-05-11 16:12           ` Alex G.
  2018-05-11 16:29             ` Borislav Petkov
  0 siblings, 1 reply; 23+ messages in thread
From: Alex G. @ 2018-05-11 16:12 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel



On 05/11/2018 11:02 AM, Borislav Petkov wrote:
> On Fri, May 11, 2018 at 10:54:09AM -0500, Alex G. wrote:
>> That being clarified, should I replace "crackmonkey" with "broken" in
>> the commit message?
> 
> Keep your opinion *outside* of commit messages - their goal is to
> explain *why* the change is being made in strictly technical language so
> that when someone looks at git history, someone can know *why*.
> 
>> Borislav, I sense some confusion. AER is not a "reporting" driver. It
>> handles the errors. You can't leave these errors unhandled. They
>> propagate to the root complex and can cause fatal MCEs when not handled.
>> The window to handle the error is pretty large, so it's not a concern
>> when you're handling it.
> 
> I think *you* didn't get it: IS_ENABLED(CONFIG_ACPI_APEI_PCIEAER) is not
> enough of a check to confirm that there actually *is* an AER driver to
> handle the errors. If you really want to make sure the driver is loaded
> and functioning, then you need an explicit registering mechanism or some
> other way of checking it really is there and handling errors.

config ACPI_APEI_PCIEAER
	bool "APEI PCIe AER logging/recovering support"
	depends on ACPI_APEI && PCIEAER
	help
	  PCIe AER errors may be reported via APEI firmware first mode.
	  Turn on this option to enable the corresponding support.

PCIAER is not modularizable. QED

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity()
  2018-05-11 16:12           ` Alex G.
@ 2018-05-11 16:19             ` Borislav Petkov
  2018-05-11 17:03               ` Alex G.
  0 siblings, 1 reply; 23+ messages in thread
From: Borislav Petkov @ 2018-05-11 16:19 UTC (permalink / raw)
  To: Alex G.
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel

On Fri, May 11, 2018 at 11:12:24AM -0500, Alex G. wrote:
> Because the GHES structure uses CPER values, but all the code is written
> to use GHES_SEV_ values. GHES_SEV_ is a made up enum, specifically for
> linux.

Again, what does that even matter?

They're defines in both cases. The *actual* value means shit.

Ah, I see it:

	...
		sec_sev = ghes_sec_pcie_severity(gdata);

	worst_sev = max(worst_sev, sec_sev);


Yeah, no, you can't do that. No apples and oranges comparisons.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES
  2018-05-11 16:12           ` Alex G.
@ 2018-05-11 16:29             ` Borislav Petkov
  2018-05-11 17:01               ` Alex G.
  0 siblings, 1 reply; 23+ messages in thread
From: Borislav Petkov @ 2018-05-11 16:29 UTC (permalink / raw)
  To: Alex G.
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel

On Fri, May 11, 2018 at 11:12:25AM -0500, Alex G. wrote:
> > I think *you* didn't get it: IS_ENABLED(CONFIG_ACPI_APEI_PCIEAER) is not
> > enough of a check to confirm that there actually *is* an AER driver to
> > handle the errors. If you really want to make sure the driver is loaded
> > and functioning, then you need an explicit registering mechanism or some
> > other way of checking it really is there and handling errors.
> 
> config ACPI_APEI_PCIEAER
> 	bool "APEI PCIe AER logging/recovering support"
> 	depends on ACPI_APEI && PCIEAER
> 	help
> 	  PCIe AER errors may be reported via APEI firmware first mode.
> 	  Turn on this option to enable the corresponding support.
> 
> PCIAER is not modularizable. QED

QED my ass.

Read the f*ck my email again: the presence of the *code* is
not enough of a check to confirm the error has been handled.
aer_recover_work_func() can fail as that kfifo_put() in
aer_recover_queue() can too.

You need an *actual* confirmation that the error has been handled
properly and *only* *then* not panic the system. Otherwise you are
potentially leaving those errors unhandled.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES
  2018-05-11 16:29             ` Borislav Petkov
@ 2018-05-11 17:01               ` Alex G.
  2018-05-11 17:41                 ` Borislav Petkov
  0 siblings, 1 reply; 23+ messages in thread
From: Alex G. @ 2018-05-11 17:01 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel

On 05/11/2018 11:29 AM, Borislav Petkov wrote:
> On Fri, May 11, 2018 at 11:12:25AM -0500, Alex G. wrote:
>>> I think *you* didn't get it: IS_ENABLED(CONFIG_ACPI_APEI_PCIEAER) is not
>>> enough of a check to confirm that there actually *is* an AER driver to
>>> handle the errors. If you really want to make sure the driver is loaded
>>> and functioning, then you need an explicit registering mechanism or some
>>> other way of checking it really is there and handling errors.
>>
>> config ACPI_APEI_PCIEAER
>> 	bool "APEI PCIe AER logging/recovering support"
>> 	depends on ACPI_APEI && PCIEAER
>> 	help
>> 	  PCIe AER errors may be reported via APEI firmware first mode.
>> 	  Turn on this option to enable the corresponding support.
>>
>> PCIAER is not modularizable. QED
> 
> QED my ass.
> 
> Read the f*ck my email again: the presence of the *code* is
> not enough of a check to confirm the error has been handled.
> aer_recover_work_func() can fail as that kfifo_put() in
> aer_recover_queue() can too.
> 
> You need an *actual* confirmation that the error has been handled
> properly and *only* *then* not panic the system. Otherwise you are
> potentially leaving those errors unhandled.


"How is PCIe error severity dependent on whether the AER error reporting
 driver is enabled (and possibly not even loaded) on the system?"

Little about confirmation of error being handled was talked about either
in your **** email, or previous versions of this series.  And quite
frankly it's besides the scope of this patch.

The scope is to enable SURPRISE!!! removal of NVMe drives and PCIe
devices. For that purpose, we don't need confirmation that the error was
handled. Such a confirmation requires a rework of GHES handling, or at
least the interaction between GHES and AER, both of which I find to be
mostly satisfactory.

You can't at this point know if the error is going to be handled.
There's code further downstream to handle this. You also didn't like it
when I wanted to handle things downstream.

I understand your concern with unhandled AER errors evolving into MCE's.
That's extremely rare, but when it happens you still panic due to the
MCE. To give you an idea of the rarity, in several months of testing, I
was only able to reproduce MCEs once, and that was with a very defective
drive, and a very idiotic test case.

If you find this solution unacceptable, that's fine. We can fix it in
firmware. We can hide all the events from the OS, contain the downstream
ports, and simulate hot-remove interrupts. All in firmware, all the time.

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity()
  2018-05-11 16:19             ` Borislav Petkov
@ 2018-05-11 17:03               ` Alex G.
  0 siblings, 0 replies; 23+ messages in thread
From: Alex G. @ 2018-05-11 17:03 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel



On 05/11/2018 11:19 AM, Borislav Petkov wrote:
> On Fri, May 11, 2018 at 11:12:24AM -0500, Alex G. wrote:
>> Because the GHES structure uses CPER values, but all the code is written
>> to use GHES_SEV_ values. GHES_SEV_ is a made up enum, specifically for
>> linux.
> 
> Again, what does that even matter?

I will shorten the commit message.


> They're defines in both cases. The *actual* value means shit.
> 
> Ah, I see it:
> 
> 	...
> 		sec_sev = ghes_sec_pcie_severity(gdata);
> 
> 	worst_sev = max(worst_sev, sec_sev);
> 
> 
> Yeah, no, you can't do that. No apples and oranges comparisons.

That was a mistake on my part. Despite my godlike ability to produce
great fixes, I am, in fact, human.

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES
  2018-05-11 17:01               ` Alex G.
@ 2018-05-11 17:41                 ` Borislav Petkov
  2018-05-11 17:56                   ` Alex G.
  0 siblings, 1 reply; 23+ messages in thread
From: Borislav Petkov @ 2018-05-11 17:41 UTC (permalink / raw)
  To: Alex G.
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel

On Fri, May 11, 2018 at 12:01:52PM -0500, Alex G. wrote:
> I understand your concern with unhandled AER errors evolving into MCE's.
> That's extremely rare, but when it happens you still panic due to the
> MCE.

I don't like leaving holes in the handling of PCIe errors. You need to
handle only those errors which are caused by hot-removal and not affect
other error types. Or do a comprehensive PCIe errors handling of all
errors in the AER driver.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES
  2018-05-11 17:41                 ` Borislav Petkov
@ 2018-05-11 17:56                   ` Alex G.
  0 siblings, 0 replies; 23+ messages in thread
From: Alex G. @ 2018-05-11 17:56 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel

On 05/11/2018 12:41 PM, Borislav Petkov wrote:
> On Fri, May 11, 2018 at 12:01:52PM -0500, Alex G. wrote:
>> I understand your concern with unhandled AER errors evolving into MCE's.
>> That's extremely rare, but when it happens you still panic due to the
>> MCE.
> 
> I don't like leaving holes in the handling of PCIe errors. You need to
> handle only those errors which are caused by hot-removal and not affect
> other error types. Or do a comprehensive PCIe errors handling of all
> errors in the AER driver.

Forget about how AER works, and worry about parity with native AER. If
AER is buggy, it will have the same bug in native and FFS cases. Right
now we're paranoid, over-babying the errors, and don't even make it to
the handler. How is this better?

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v4 1/3] EDAC, GHES: Remove unused argument to ghes_edac_report_mem_error
  2018-04-30 21:33 ` [RFC PATCH v4 1/3] EDAC, GHES: Remove unused argument to ghes_edac_report_mem_error Alexandru Gagniuc
  2018-04-30 21:33   ` [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity() Alexandru Gagniuc
  2018-04-30 21:33   ` [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES Alexandru Gagniuc
@ 2018-05-12  9:00   ` Borislav Petkov
  2 siblings, 0 replies; 23+ messages in thread
From: Borislav Petkov @ 2018-05-12  9:00 UTC (permalink / raw)
  To: Alexandru Gagniuc
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Rafael J. Wysocki,
	Len Brown, Tony Luck, Mauro Carvalho Chehab, Robert Moore,
	Erik Schmauss, Tyler Baicar, Will Deacon, James Morse,
	Shiju Jose, Jonathan (Zhixiong) Zhang, Dongjiu Geng, linux-acpi,
	linux-kernel, linux-edac, devel

On Mon, Apr 30, 2018 at 04:33:50PM -0500, Alexandru Gagniuc wrote:
> The use of the 'ghes' argument was removed in a previous commit, but
> function signature was not updated to reflect this.
> 
> Fixes: 0fe5f281f749 ("EDAC, ghes: Model a single, logical memory controller")
> Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
> ---
>  drivers/acpi/apei/ghes.c | 2 +-
>  drivers/edac/ghes_edac.c | 3 +--
>  include/acpi/ghes.h      | 5 ++---
>  3 files changed, 4 insertions(+), 6 deletions(-)

Applied, thanks.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH v5 0/2] acpi: apei: Improve PCIe error handling with FFS
       [not found] <20180430212836.7807-1-mr.nuke.me@gmail.com>
  2018-04-30 21:33 ` [RFC PATCH v4 1/3] EDAC, GHES: Remove unused argument to ghes_edac_report_mem_error Alexandru Gagniuc
@ 2018-05-14 14:59 ` Alexandru Gagniuc
  2018-05-14 14:59   ` [PATCH v5 1/2] acpi: apei: Rename ghes_severity() to ghes_cper_severity() Alexandru Gagniuc
  2018-05-14 14:59   ` [PATCH v5 2/2] acpi: apei: Do not panic() on PCIe errors reported through GHES Alexandru Gagniuc
  1 sibling, 2 replies; 23+ messages in thread
From: Alexandru Gagniuc @ 2018-05-14 14:59 UTC (permalink / raw)
  To: bp
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Alexandru Gagniuc,
	Rafael J. Wysocki, Len Brown, Tony Luck, Tyler Baicar,
	Will Deacon, James Morse, Shiju Jose, Jonathan (Zhixiong) Zhang,
	Dongjiu Geng, linux-acpi, linux-kernel

The purpose of these changes is to see if we can safely de-escalate
the situation and notify the appropriate error handler. Since FFS
reports errors through NMIs or other non-standard mechanism, we have
to be just a little more careful with reporting the error.

We're concerned with things, such as being able to cross the NMI/IRQ
boundary, or being able to safely schedule work and notify the
appropriate subsystem. Once the notification is sent, our job is done.
I'm explicitly _NOT_ concerned with whether the error is handled or
not, especially since such concern reduces to a call to __ghes_panic().

There are rare cases that prevent us from de-escalating to lesser
contexts, such as uncorrectable memory errors in kernel. In these sort
of cases, trying to leave the NMI might cause a triple fault. James
Morse explained this very well when discussing v1 of this series. In
and only in such cases, we are justified to panic().

Once the error is safely sent its merry way, it's really up to the
error handler to panic() or continue. For example, aer_recover_queue()
might for ungodly reasons fail. However, it's up to the AER code to
decide whether failing to queue an error for handling is panic worthy.

Changes since v4:
 - Fix Freudian slip and use GHES_ instead of CPER_ enum
 - Rephrased comments to clarify what we don't care about

Changes since v3:
 - Renamed ghes_severity to something more concrete
 - Reorganized code to make it look like more than just a rename
 - Remembered to remove last patch in the series

Changes since v2:
 - Due to popular request, simple is chosen over flexible
 - Removed splitting of handlers into irq safe portion.
 - Change behavior only for PCIe errors

Changes since v1:
 - Due to popular request, the panic() is left in the NMI handler
 - GHES AER handler is split into NMI and non-NMI portions
 - ghes_notify_nmi() does not panic on deferrable errors
 - The handlers are put in a mapping and given a common call signature

Alexandru Gagniuc (2):
  acpi: apei: Rename ghes_severity() to ghes_cper_severity()
  acpi: apei: Do not panic() on PCIe errors reported through GHES

 drivers/acpi/apei/ghes.c | 63 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 52 insertions(+), 11 deletions(-)

-- 
2.14.3

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH v5 1/2] acpi: apei: Rename ghes_severity() to ghes_cper_severity()
  2018-05-14 14:59 ` [PATCH v5 0/2] acpi: apei: Improve PCIe error handling with FFS Alexandru Gagniuc
@ 2018-05-14 14:59   ` Alexandru Gagniuc
  2018-05-14 14:59   ` [PATCH v5 2/2] acpi: apei: Do not panic() on PCIe errors reported through GHES Alexandru Gagniuc
  1 sibling, 0 replies; 23+ messages in thread
From: Alexandru Gagniuc @ 2018-05-14 14:59 UTC (permalink / raw)
  To: bp
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Alexandru Gagniuc,
	Rafael J. Wysocki, Len Brown, Tony Luck, Tyler Baicar,
	Will Deacon, James Morse, Shiju Jose, Jonathan (Zhixiong) Zhang,
	Dongjiu Geng, linux-acpi, linux-kernel

ghes_severity() is a misnomer in this case, as it implies the severity
of the entire GHES structure. Instead, it maps one CPER value to a
GHES_SEV* value. ghes_cper_severity() is clearer.

Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
---
 drivers/acpi/apei/ghes.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 1efefe919555..7c1a16b106ba 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -271,7 +271,7 @@ static void ghes_fini(struct ghes *ghes)
 		unmap_gen_v2(ghes);
 }
 
-static inline int ghes_severity(int severity)
+static inline int ghes_cper_severity(int severity)
 {
 	switch (severity) {
 	case CPER_SEV_INFORMATIONAL:
@@ -388,7 +388,7 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
 #ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
 	unsigned long pfn;
 	int flags = -1;
-	int sec_sev = ghes_severity(gdata->error_severity);
+	int sec_sev = ghes_cper_severity(gdata->error_severity);
 	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
 	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
@@ -468,10 +468,10 @@ static void ghes_do_proc(struct ghes *ghes,
 	guid_t *fru_id = &NULL_UUID_LE;
 	char *fru_text = "";
 
-	sev = ghes_severity(estatus->error_severity);
+	sev = ghes_cper_severity(estatus->error_severity);
 	apei_estatus_for_each_section(estatus, gdata) {
 		sec_type = (guid_t *)gdata->section_type;
-		sec_sev = ghes_severity(gdata->error_severity);
+		sec_sev = ghes_cper_severity(gdata->error_severity);
 		if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
 			fru_id = (guid_t *)gdata->fru_id;
 
@@ -512,7 +512,7 @@ static void __ghes_print_estatus(const char *pfx,
 	char pfx_seq[64];
 
 	if (pfx == NULL) {
-		if (ghes_severity(estatus->error_severity) <=
+		if (ghes_cper_severity(estatus->error_severity) <=
 		    GHES_SEV_CORRECTED)
 			pfx = KERN_WARNING;
 		else
@@ -534,7 +534,7 @@ static int ghes_print_estatus(const char *pfx,
 	static DEFINE_RATELIMIT_STATE(ratelimit_uncorrected, 5*HZ, 2);
 	struct ratelimit_state *ratelimit;
 
-	if (ghes_severity(estatus->error_severity) <= GHES_SEV_CORRECTED)
+	if (ghes_cper_severity(estatus->error_severity) <= GHES_SEV_CORRECTED)
 		ratelimit = &ratelimit_corrected;
 	else
 		ratelimit = &ratelimit_uncorrected;
@@ -705,9 +705,8 @@ static int ghes_proc(struct ghes *ghes)
 	if (rc)
 		goto out;
 
-	if (ghes_severity(ghes->estatus->error_severity) >= GHES_SEV_PANIC) {
+	if (ghes_cper_severity(ghes->estatus->error_severity) >= GHES_SEV_PANIC)
 		__ghes_panic(ghes);
-	}
 
 	if (!ghes_estatus_cached(ghes->estatus)) {
 		if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
@@ -945,7 +944,7 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
 			ret = NMI_HANDLED;
 		}
 
-		sev = ghes_severity(ghes->estatus->error_severity);
+		sev = ghes_cper_severity(ghes->estatus->error_severity);
 		if (sev >= GHES_SEV_PANIC) {
 			oops_begin();
 			ghes_print_queued_estatus();
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH v5 2/2] acpi: apei: Do not panic() on PCIe errors reported through GHES
  2018-05-14 14:59 ` [PATCH v5 0/2] acpi: apei: Improve PCIe error handling with FFS Alexandru Gagniuc
  2018-05-14 14:59   ` [PATCH v5 1/2] acpi: apei: Rename ghes_severity() to ghes_cper_severity() Alexandru Gagniuc
@ 2018-05-14 14:59   ` Alexandru Gagniuc
  1 sibling, 0 replies; 23+ messages in thread
From: Alexandru Gagniuc @ 2018-05-14 14:59 UTC (permalink / raw)
  To: bp
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Alexandru Gagniuc,
	Rafael J. Wysocki, Len Brown, Tony Luck, Tyler Baicar,
	Will Deacon, James Morse, Shiju Jose, Jonathan (Zhixiong) Zhang,
	Dongjiu Geng, linux-acpi, linux-kernel

The policy was to panic() when GHES said that an error is "Fatal".
This logic is wrong for several reasons, as it doesn't take into
account what caused the error.

PCIe fatal errors indicate that the link to a device is either
unstable or unusable. They don't indicate that the machine is on fire,
and they are not severe enough that we need to panic(). Instead of
relying on crackmonkey firmware, evaluate the error severity based on
what caused the error (GHES subsections).

Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
---
 drivers/acpi/apei/ghes.c | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 7c1a16b106ba..9baaab798020 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -425,8 +425,7 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
  * GHES_SEV_RECOVERABLE -> AER_NONFATAL
  * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
  *     These both need to be reported and recovered from by the AER driver.
- * GHES_SEV_PANIC does not make it to this handling since the kernel must
- *     panic.
+ * GHES_SEV_PANIC -> AER_FATAL
  */
 static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 {
@@ -459,6 +458,49 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 #endif
 }
 
+/* PCIe errors should not cause a panic. */
+static int ghes_sec_pcie_severity(struct acpi_hest_generic_data *gdata)
+{
+	struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
+
+	if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
+	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO &&
+	    IS_ENABLED(CONFIG_ACPI_APEI_PCIEAER))
+		return GHES_SEV_RECOVERABLE;
+
+	return ghes_cper_severity(gdata->error_severity);
+}
+
+/*
+ * The severity field in the status block is an unreliable metric for the
+ * severity. A more reliable way is to look at each subsection and see how safe
+ * it is to call the approproate error handler.
+ * We're not conerned with handling the error. We're concerned with being able
+ * to notify an error handler by crossing the NMI/IRQ boundary, being able to
+ * schedule_work, and so forth.
+ *   - SEC_PCIE: All PCIe errors can be handled by AER.
+ */
+static int ghes_severity(struct ghes *ghes)
+{
+	int worst_sev, sec_sev;
+	struct acpi_hest_generic_data *gdata;
+	const guid_t *section_type;
+	const struct acpi_hest_generic_status *estatus = ghes->estatus;
+
+	worst_sev = GHES_SEV_NO;
+	apei_estatus_for_each_section(estatus, gdata) {
+		section_type = (guid_t *)gdata->section_type;
+		sec_sev = ghes_cper_severity(gdata->error_severity);
+
+		if (guid_equal(section_type, &CPER_SEC_PCIE))
+			sec_sev = ghes_sec_pcie_severity(gdata);
+
+		worst_sev = max(worst_sev, sec_sev);
+	}
+
+	return worst_sev;
+}
+
 static void ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -944,7 +986,7 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
 			ret = NMI_HANDLED;
 		}
 
-		sev = ghes_cper_severity(ghes->estatus->error_severity);
+		sev = ghes_severity(ghes);
 		if (sev >= GHES_SEV_PANIC) {
 			oops_begin();
 			ghes_print_queued_estatus();
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2018-05-14 15:00 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20180430212836.7807-1-mr.nuke.me@gmail.com>
2018-04-30 21:33 ` [RFC PATCH v4 1/3] EDAC, GHES: Remove unused argument to ghes_edac_report_mem_error Alexandru Gagniuc
2018-04-30 21:33   ` [RFC PATCH v4 2/3] acpi: apei: Rename ghes_severity() to ghes_cper_severity() Alexandru Gagniuc
2018-05-04 11:56     ` Shiju Jose
2018-05-04 23:33       ` Alex G.
2018-05-11 15:39     ` Borislav Petkov
2018-05-11 15:45       ` Alex G.
2018-05-11 15:58         ` Borislav Petkov
2018-05-11 16:12           ` Alex G.
2018-05-11 16:19             ` Borislav Petkov
2018-05-11 17:03               ` Alex G.
2018-04-30 21:33   ` [RFC PATCH v4 3/3] acpi: apei: Do not panic() on PCIe errors reported through GHES Alexandru Gagniuc
2018-05-11 15:40     ` Borislav Petkov
2018-05-11 15:54       ` Alex G.
2018-05-11 16:02         ` Borislav Petkov
2018-05-11 16:12           ` Alex G.
2018-05-11 16:29             ` Borislav Petkov
2018-05-11 17:01               ` Alex G.
2018-05-11 17:41                 ` Borislav Petkov
2018-05-11 17:56                   ` Alex G.
2018-05-12  9:00   ` [RFC PATCH v4 1/3] EDAC, GHES: Remove unused argument to ghes_edac_report_mem_error Borislav Petkov
2018-05-14 14:59 ` [PATCH v5 0/2] acpi: apei: Improve PCIe error handling with FFS Alexandru Gagniuc
2018-05-14 14:59   ` [PATCH v5 1/2] acpi: apei: Rename ghes_severity() to ghes_cper_severity() Alexandru Gagniuc
2018-05-14 14:59   ` [PATCH v5 2/2] acpi: apei: Do not panic() on PCIe errors reported through GHES Alexandru Gagniuc

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).