All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/6] x86/RAS queue
@ 2016-07-08  9:09 Borislav Petkov
  2016-07-08  9:09 ` [PATCH 1/6] x86/mce/AMD: Increase size of bank_map type Borislav Petkov
                   ` (5 more replies)
  0 siblings, 6 replies; 18+ messages in thread
From: Borislav Petkov @ 2016-07-08  9:09 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: LKML

From: Borislav Petkov <bp@suse.de>

Hi,

here's some more RAS stuff for 4.8.

Please queue,
thanks.

Aravind Gopalakrishnan (1):
  x86/mce/AMD: Increase size of bank_map type

Borislav Petkov (1):
  x86/mce: Fix mce_rdmsrl() warning message

Yazen Ghannam (4):
  x86/RAS/AMD: Reduce number of IPIs when prepping error injection
  x86/mce: Add support for new MCA_SYND register
  EDAC, mce_amd: Print syndrome register value on SMCA systems
  x86/RAS: Add syndrome support to mce_amd_inj

 arch/x86/include/asm/mce.h           |  5 ++-
 arch/x86/include/uapi/asm/mce.h      |  1 +
 arch/x86/kernel/cpu/mcheck/mce.c     |  6 +++-
 arch/x86/kernel/cpu/mcheck/mce_amd.c |  5 ++-
 arch/x86/ras/mce_amd_inj.c           | 69 ++++++++++++++++++++----------------
 drivers/edac/mce_amd.c               | 14 ++++++--
 include/trace/events/mce.h           |  6 ++--
 7 files changed, 68 insertions(+), 38 deletions(-)

-- 
2.7.3

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/6] x86/mce/AMD: Increase size of bank_map type
  2016-07-08  9:09 [PATCH 0/6] x86/RAS queue Borislav Petkov
@ 2016-07-08  9:09 ` Borislav Petkov
  2016-07-08  9:21   ` Ingo Molnar
  2016-07-08 12:05   ` [tip:ras/core] x86/mce/AMD: Increase size of the " tip-bot for Aravind Gopalakrishnan
  2016-07-08  9:09 ` [PATCH 2/6] x86/RAS/AMD: Reduce number of IPIs when prepping error injection Borislav Petkov
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 18+ messages in thread
From: Borislav Petkov @ 2016-07-08  9:09 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: LKML

From: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>

Change bank_map type from char to int since we now have more than eight
banks in a system.

Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Cc: Aravind Gopalakrishnan <aravindksg.lkml@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1466462163-29008-1-git-send-email-Yazen.Ghannam@amd.com
Signed-off-by: Yazen Ghannam <Yazen.Ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 10b0661651e0..7b7f3be783d4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -93,7 +93,7 @@ const char * const amd_df_mcablock_names[] = {
 EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
 
 static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
-static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */
+static DEFINE_PER_CPU(unsigned int, bank_map);	/* see which banks are on */
 
 static void amd_threshold_interrupt(void);
 static void amd_deferred_error_interrupt(void);
-- 
2.7.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/6] x86/RAS/AMD: Reduce number of IPIs when prepping error injection
  2016-07-08  9:09 [PATCH 0/6] x86/RAS queue Borislav Petkov
  2016-07-08  9:09 ` [PATCH 1/6] x86/mce/AMD: Increase size of bank_map type Borislav Petkov
@ 2016-07-08  9:09 ` Borislav Petkov
  2016-07-08 12:06   ` [tip:ras/core] x86/RAS/AMD: Reduce the " tip-bot for Yazen Ghannam
  2016-07-08  9:09 ` [PATCH 3/6] x86/mce: Add support for new MCA_SYND register Borislav Petkov
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Borislav Petkov @ 2016-07-08  9:09 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: LKML

From: Yazen Ghannam <Yazen.Ghannam@amd.com>

We currently use wrmsr_on_cpu() 4 times when prepping for an error
injection. This will generate 4 IPIs, one for each MSR write. We can reduce
the number of IPIs to 1 by grouping the MSR writes and executing them
serially on the appropriate CPU.

Signed-off-by: Yazen Ghannam <Yazen.Ghannam@amd.com>
Suggested-by: Borislav Petkov <bp@suse.de>
Cc: Aravind Gopalakrishnan <aravindksg.lkml@gmail.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1466462347-31657-1-git-send-email-Yazen.Ghannam@amd.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/ras/mce_amd_inj.c | 58 ++++++++++++++++++++++------------------------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
index e69f4701a076..1104515d5ad2 100644
--- a/arch/x86/ras/mce_amd_inj.c
+++ b/arch/x86/ras/mce_amd_inj.c
@@ -241,6 +241,31 @@ static void toggle_nb_mca_mst_cpu(u16 nid)
 		       __func__, PCI_FUNC(F3->devfn), NBCFG);
 }
 
+static void prepare_msrs(void *info)
+{
+	struct mce i_mce = *(struct mce *)info;
+	u8 b = i_mce.bank;
+
+	wrmsrl(MSR_IA32_MCG_STATUS, i_mce.mcgstatus);
+
+	if (boot_cpu_has(X86_FEATURE_SMCA)) {
+		if (i_mce.inject_flags == DFR_INT_INJ) {
+			wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), i_mce.status);
+			wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), i_mce.addr);
+		} else {
+			wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), i_mce.status);
+			wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), i_mce.addr);
+		}
+
+		wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc);
+	} else {
+		wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status);
+		wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr);
+		wrmsrl(MSR_IA32_MCx_MISC(b), i_mce.misc);
+	}
+
+}
+
 static void do_inject(void)
 {
 	u64 mcg_status = 0;
@@ -287,36 +312,9 @@ static void do_inject(void)
 
 	toggle_hw_mce_inject(cpu, true);
 
-	wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS,
-		     (u32)mcg_status, (u32)(mcg_status >> 32));
-
-	if (boot_cpu_has(X86_FEATURE_SMCA)) {
-		if (inj_type == DFR_INT_INJ) {
-			wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DESTAT(b),
-				     (u32)i_mce.status, (u32)(i_mce.status >> 32));
-
-			wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DEADDR(b),
-				     (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
-		} else {
-			wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_STATUS(b),
-				     (u32)i_mce.status, (u32)(i_mce.status >> 32));
-
-			wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_ADDR(b),
-				     (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
-		}
-
-		wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(b),
-			     (u32)i_mce.misc, (u32)(i_mce.misc >> 32));
-	} else {
-		wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b),
-			     (u32)i_mce.status, (u32)(i_mce.status >> 32));
-
-		wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b),
-			     (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
-
-		wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b),
-			     (u32)i_mce.misc, (u32)(i_mce.misc >> 32));
-	}
+	i_mce.mcgstatus = mcg_status;
+	i_mce.inject_flags = inj_type;
+	smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
 
 	toggle_hw_mce_inject(cpu, false);
 
-- 
2.7.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/6] x86/mce: Add support for new MCA_SYND register
  2016-07-08  9:09 [PATCH 0/6] x86/RAS queue Borislav Petkov
  2016-07-08  9:09 ` [PATCH 1/6] x86/mce/AMD: Increase size of bank_map type Borislav Petkov
  2016-07-08  9:09 ` [PATCH 2/6] x86/RAS/AMD: Reduce number of IPIs when prepping error injection Borislav Petkov
@ 2016-07-08  9:09 ` Borislav Petkov
  2016-07-08  9:26   ` Ingo Molnar
  2016-07-08  9:09 ` [PATCH 4/6] x86/mce: Fix mce_rdmsrl() warning message Borislav Petkov
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Borislav Petkov @ 2016-07-08  9:09 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: LKML

From: Yazen Ghannam <Yazen.Ghannam@amd.com>

Syndrome information is no longer contained in MCA_STATUS for SMCA
systems but in a new register.

Add a synd field to struct mce to hold MCA_SYND register value. Add it
to the end of struct mce to maintain compatibility with old versions of
mcelog. Also, add it to the respective tracepoint.

Signed-off-by: Yazen Ghannam <Yazen.Ghannam@amd.com>
Cc: Aravind Gopalakrishnan <aravindksg.lkml@gmail.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/1467633035-32080-1-git-send-email-Yazen.Ghannam@amd.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/mce.h           | 5 ++++-
 arch/x86/include/uapi/asm/mce.h      | 1 +
 arch/x86/kernel/cpu/mcheck/mce.c     | 4 ++++
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 3 +++
 include/trace/events/mce.h           | 6 ++++--
 5 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 8bf766ef0e18..21bc5a3a4c89 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -40,9 +40,10 @@
 #define MCI_STATUS_AR	 (1ULL<<55)  /* Action required */
 
 /* AMD-specific bits */
+#define MCI_STATUS_TCC		(1ULL<<55)  /* Task context corrupt */
+#define MCI_STATUS_SYNDV	(1ULL<<53)  /* synd reg. valid */
 #define MCI_STATUS_DEFERRED	(1ULL<<44)  /* uncorrected error, deferred exception */
 #define MCI_STATUS_POISON	(1ULL<<43)  /* access poisonous data */
-#define MCI_STATUS_TCC		(1ULL<<55)  /* Task context corrupt */
 
 /*
  * McaX field if set indicates a given bank supports MCA extensions:
@@ -110,6 +111,7 @@
 #define MSR_AMD64_SMCA_MC0_MISC0	0xc0002003
 #define MSR_AMD64_SMCA_MC0_CONFIG	0xc0002004
 #define MSR_AMD64_SMCA_MC0_IPID		0xc0002005
+#define MSR_AMD64_SMCA_MC0_SYND		0xc0002006
 #define MSR_AMD64_SMCA_MC0_DESTAT	0xc0002008
 #define MSR_AMD64_SMCA_MC0_DEADDR	0xc0002009
 #define MSR_AMD64_SMCA_MC0_MISC1	0xc000200a
@@ -119,6 +121,7 @@
 #define MSR_AMD64_SMCA_MCx_MISC(x)	(MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
 #define MSR_AMD64_SMCA_MCx_CONFIG(x)	(MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
 #define MSR_AMD64_SMCA_MCx_IPID(x)	(MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_SYND(x)	(MSR_AMD64_SMCA_MC0_SYND + 0x10*(x))
 #define MSR_AMD64_SMCA_MCx_DESTAT(x)	(MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
 #define MSR_AMD64_SMCA_MCx_DEADDR(x)	(MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
 #define MSR_AMD64_SMCA_MCx_MISCy(x, y)	((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index 2184943341bf..8c75fbc94c3f 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -26,6 +26,7 @@ struct mce {
 	__u32 socketid;	/* CPU socket ID */
 	__u32 apicid;	/* CPU initial apic ID */
 	__u64 mcgcap;	/* MCGCAP MSR: machine check capabilities of CPU */
+	__u64 synd;	/* MCA_SYND MSR: only valid on SMCA systems */
 };
 
 #define MCE_GET_RECORD_LEN   _IOR('M', 1, int)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 92e5e37d97bf..16aebe737cae 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -568,6 +568,7 @@ static void mce_read_aux(struct mce *m, int i)
 {
 	if (m->status & MCI_STATUS_MISCV)
 		m->misc = mce_rdmsrl(msr_ops.misc(i));
+
 	if (m->status & MCI_STATUS_ADDRV) {
 		m->addr = mce_rdmsrl(msr_ops.addr(i));
 
@@ -580,6 +581,9 @@ static void mce_read_aux(struct mce *m, int i)
 			m->addr <<= shift;
 		}
 	}
+
+	if (mce_flags.smca && (m->status & MCI_STATUS_SYNDV))
+		m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
 }
 
 static bool memory_error(struct mce *m)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 7b7f3be783d4..8b8c33a6e6a0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -479,6 +479,9 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
 	if (m.status & MCI_STATUS_ADDRV)
 		rdmsrl(msr_addr, m.addr);
 
+	if (mce_flags.smca && (m.status & MCI_STATUS_SYNDV))
+		rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
+
 	mce_log(&m);
 
 	wrmsrl(msr_status, 0);
diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h
index 4cbbcef6baa8..8be5268caf28 100644
--- a/include/trace/events/mce.h
+++ b/include/trace/events/mce.h
@@ -20,6 +20,7 @@ TRACE_EVENT(mce_record,
 		__field(	u64,		status		)
 		__field(	u64,		addr		)
 		__field(	u64,		misc		)
+		__field(	u64,		synd		)
 		__field(	u64,		ip		)
 		__field(	u64,		tsc		)
 		__field(	u64,		walltime	)
@@ -38,6 +39,7 @@ TRACE_EVENT(mce_record,
 		__entry->status		= m->status;
 		__entry->addr		= m->addr;
 		__entry->misc		= m->misc;
+		__entry->synd		= m->synd;
 		__entry->ip		= m->ip;
 		__entry->tsc		= m->tsc;
 		__entry->walltime	= m->time;
@@ -50,11 +52,11 @@ TRACE_EVENT(mce_record,
 		__entry->cpuvendor	= m->cpuvendor;
 	),
 
-	TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, ADDR/MISC: %016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x",
+	TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x",
 		__entry->cpu,
 		__entry->mcgcap, __entry->mcgstatus,
 		__entry->bank, __entry->status,
-		__entry->addr, __entry->misc,
+		__entry->addr, __entry->misc, __entry->synd,
 		__entry->cs, __entry->ip,
 		__entry->tsc,
 		__entry->cpuvendor, __entry->cpuid,
-- 
2.7.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/6] x86/mce: Fix mce_rdmsrl() warning message
  2016-07-08  9:09 [PATCH 0/6] x86/RAS queue Borislav Petkov
                   ` (2 preceding siblings ...)
  2016-07-08  9:09 ` [PATCH 3/6] x86/mce: Add support for new MCA_SYND register Borislav Petkov
@ 2016-07-08  9:09 ` Borislav Petkov
  2016-07-08 12:06   ` [tip:ras/core] " tip-bot for Borislav Petkov
  2016-07-08  9:09 ` [PATCH 5/6] EDAC, mce_amd: Print syndrome register value on SMCA systems Borislav Petkov
  2016-07-08  9:09 ` [PATCH 6/6] x86/RAS: Add syndrome support to mce_amd_inj Borislav Petkov
  5 siblings, 1 reply; 18+ messages in thread
From: Borislav Petkov @ 2016-07-08  9:09 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: LKML

From: Borislav Petkov <bp@suse.de>

The MSR address we're dumping in there should be in hex, otherwise we
get funsies like:

[    0.016000] WARNING: CPU: 1 PID: 0 at arch/x86/kernel/cpu/mcheck/mce.c:428 mce_rdmsrl+0xd9/0xe0
[    0.016000] mce: Unable to read msr -1073733631!
				       ^^^^^^^^^^^

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 16aebe737cae..2f7bb1f075c2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -425,7 +425,7 @@ static u64 mce_rdmsrl(u32 msr)
 	}
 
 	if (rdmsrl_safe(msr, &v)) {
-		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
+		WARN_ONCE(1, "mce: Unable to read msr 0x%x!\n", msr);
 		/*
 		 * Return zero in case the access faulted. This should
 		 * not happen normally but can happen if the CPU does
-- 
2.7.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 5/6] EDAC, mce_amd: Print syndrome register value on SMCA systems
  2016-07-08  9:09 [PATCH 0/6] x86/RAS queue Borislav Petkov
                   ` (3 preceding siblings ...)
  2016-07-08  9:09 ` [PATCH 4/6] x86/mce: Fix mce_rdmsrl() warning message Borislav Petkov
@ 2016-07-08  9:09 ` Borislav Petkov
  2016-07-08  9:09 ` [PATCH 6/6] x86/RAS: Add syndrome support to mce_amd_inj Borislav Petkov
  5 siblings, 0 replies; 18+ messages in thread
From: Borislav Petkov @ 2016-07-08  9:09 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: LKML

From: Yazen Ghannam <Yazen.Ghannam@amd.com>

Print SyndV bit status and print the raw value of the MCA_SYND register.
Further decoding of the syndrome from struct mce.synd can be done in
other places where appropriate, e.g. DRAM ECC.

Boris: make the error stanza more compact by putting the error address
and syndrome on the same line:

  [Hardware Error]: Corrected error, no action required.
  [Hardware Error]: CPU:2 (17:0:0) MC4_STATUS[-|CE|-|PCC|AddrV|-|-|SyndV|CECC]: 0x96204100001e0117
  [Hardware Error]: Error Addr: 0x000000007f4c52e3, Syndrome: 0x0000000000000000
  [Hardware Error]: Invalid IP block specified.
  [Hardware Error]: cache level: L3/GEN, tx: DATA, mem-tx: RD

Signed-off-by: Yazen Ghannam <Yazen.Ghannam@amd.com>
Cc: Aravind Gopalakrishnan <aravindksg.lkml@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1467633035-32080-2-git-send-email-Yazen.Ghannam@amd.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 drivers/edac/mce_amd.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 9b6800a79c7f..057ece577800 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -927,7 +927,7 @@ static void decode_smca_errors(struct mce *m)
 	size_t len;
 
 	if (rdmsr_safe(addr, &low, &high)) {
-		pr_emerg("Invalid IP block specified, error information is unreliable.\n");
+		pr_emerg(HW_ERR "Invalid IP block specified.\n");
 		return;
 	}
 
@@ -1078,6 +1078,8 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 		u32 low, high;
 		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
 
+		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
+
 		if (!rdmsr_safe(addr, &low, &high) &&
 		    (low & MCI_CONFIG_MCAX))
 			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
@@ -1091,12 +1093,18 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 	pr_cont("]: 0x%016llx\n", m->status);
 
 	if (m->status & MCI_STATUS_ADDRV)
-		pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr);
+		pr_emerg(HW_ERR "Error Addr: 0x%016llx", m->addr);
 
 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
+		if (m->status & MCI_STATUS_SYNDV)
+			pr_cont(", Syndrome: 0x%016llx", m->synd);
+
+		pr_cont("\n");
+
 		decode_smca_errors(m);
 		goto err_code;
-	}
+	} else
+		pr_cont("\n");
 
 	if (!fam_ops)
 		goto err_code;
-- 
2.7.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 6/6] x86/RAS: Add syndrome support to mce_amd_inj
  2016-07-08  9:09 [PATCH 0/6] x86/RAS queue Borislav Petkov
                   ` (4 preceding siblings ...)
  2016-07-08  9:09 ` [PATCH 5/6] EDAC, mce_amd: Print syndrome register value on SMCA systems Borislav Petkov
@ 2016-07-08  9:09 ` Borislav Petkov
  5 siblings, 0 replies; 18+ messages in thread
From: Borislav Petkov @ 2016-07-08  9:09 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: LKML

From: Yazen Ghannam <Yazen.Ghannam@amd.com>

Add a debugfs file which holds the error syndrome (written into
MCA_SYND) of an injected error. Only write it on SMCA systems. Update
README file, while at it.

Signed-off-by: Yazen Ghannam <Yazen.Ghannam@amd.com>
Cc: Aravind Gopalakrishnan <aravindksg.lkml@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/1467633035-32080-3-git-send-email-Yazen.Ghannam@amd.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/ras/mce_amd_inj.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
index 1104515d5ad2..ff8eb1a9ce6d 100644
--- a/arch/x86/ras/mce_amd_inj.c
+++ b/arch/x86/ras/mce_amd_inj.c
@@ -68,6 +68,7 @@ static int inj_##reg##_set(void *data, u64 val)				\
 MCE_INJECT_SET(status);
 MCE_INJECT_SET(misc);
 MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
 
 #define MCE_INJECT_GET(reg)						\
 static int inj_##reg##_get(void *data, u64 *val)			\
@@ -81,10 +82,12 @@ static int inj_##reg##_get(void *data, u64 *val)			\
 MCE_INJECT_GET(status);
 MCE_INJECT_GET(misc);
 MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
 
 DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
 DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
 DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
 
 /*
  * Caller needs to be make sure this cpu doesn't disappear
@@ -258,6 +261,7 @@ static void prepare_msrs(void *info)
 		}
 
 		wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc);
+		wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), i_mce.synd);
 	} else {
 		wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status);
 		wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr);
@@ -275,6 +279,9 @@ static void do_inject(void)
 	if (i_mce.misc)
 		i_mce.status |= MCI_STATUS_MISCV;
 
+	if (i_mce.synd)
+		i_mce.status |= MCI_STATUS_SYNDV;
+
 	if (inj_type == SW_INJ) {
 		mce_inject_log(&i_mce);
 		return;
@@ -371,6 +378,9 @@ static const char readme_msg[] =
 "\t used for error thresholding purposes and its validity is indicated by\n"
 "\t MCi_STATUS[MiscV].\n"
 "\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
 "addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
 "\t associated with the error.\n"
 "\n"
@@ -420,6 +430,7 @@ static struct dfs_node {
 	{ .name = "status",	.fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
 	{ .name = "misc",	.fops = &misc_fops,   .perm = S_IRUSR | S_IWUSR },
 	{ .name = "addr",	.fops = &addr_fops,   .perm = S_IRUSR | S_IWUSR },
+	{ .name = "synd",	.fops = &synd_fops,   .perm = S_IRUSR | S_IWUSR },
 	{ .name = "bank",	.fops = &bank_fops,   .perm = S_IRUSR | S_IWUSR },
 	{ .name = "flags",	.fops = &flags_fops,  .perm = S_IRUSR | S_IWUSR },
 	{ .name = "cpu",	.fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
-- 
2.7.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/6] x86/mce/AMD: Increase size of bank_map type
  2016-07-08  9:09 ` [PATCH 1/6] x86/mce/AMD: Increase size of bank_map type Borislav Petkov
@ 2016-07-08  9:21   ` Ingo Molnar
  2016-07-08  9:32     ` Borislav Petkov
  2016-07-08 12:05   ` [tip:ras/core] x86/mce/AMD: Increase size of the " tip-bot for Aravind Gopalakrishnan
  1 sibling, 1 reply; 18+ messages in thread
From: Ingo Molnar @ 2016-07-08  9:21 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: LKML, Aravind Gopalakrishnan, Yazen Ghannam


* Borislav Petkov <bp@alien8.de> wrote:

> From: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
> 
> Change bank_map type from char to int since we now have more than eight
> banks in a system.
> 
> Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
> Cc: Aravind Gopalakrishnan <aravindksg.lkml@gmail.com>
> Cc: Tony Luck <tony.luck@intel.com>
> Cc: linux-edac <linux-edac@vger.kernel.org>
> Link: http://lkml.kernel.org/r/1466462163-29008-1-git-send-email-Yazen.Ghannam@amd.com
> Signed-off-by: Yazen Ghannam <Yazen.Ghannam@amd.com>
> Signed-off-by: Borislav Petkov <bp@suse.de>
> ---
>  arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
> index 10b0661651e0..7b7f3be783d4 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
> @@ -93,7 +93,7 @@ const char * const amd_df_mcablock_names[] = {
>  EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
>  
>  static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
> -static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */
> +static DEFINE_PER_CPU(unsigned int, bank_map);	/* see which banks are on */

Btw., is there any check somewhere which printed a helpful warning when we 
exceeded the 8 banks limit - and which would print a helpful warning if we ever 
exceed the 32 banks limit?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/6] x86/mce: Add support for new MCA_SYND register
  2016-07-08  9:09 ` [PATCH 3/6] x86/mce: Add support for new MCA_SYND register Borislav Petkov
@ 2016-07-08  9:26   ` Ingo Molnar
  2016-07-08  9:37     ` Borislav Petkov
  0 siblings, 1 reply; 18+ messages in thread
From: Ingo Molnar @ 2016-07-08  9:26 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: LKML, Yazen Ghannam


* Borislav Petkov <bp@alien8.de> wrote:

> From: Yazen Ghannam <Yazen.Ghannam@amd.com>
> 
> Syndrome information is no longer contained in MCA_STATUS for SMCA
> systems but in a new register.
> 
> Add a synd field to struct mce to hold MCA_SYND register value. Add it
> to the end of struct mce to maintain compatibility with old versions of
> mcelog. Also, add it to the respective tracepoint.

>  /* AMD-specific bits */
> +#define MCI_STATUS_TCC		(1ULL<<55)  /* Task context corrupt */
> +#define MCI_STATUS_SYNDV	(1ULL<<53)  /* synd reg. valid */

> --- a/arch/x86/include/uapi/asm/mce.h
> +++ b/arch/x86/include/uapi/asm/mce.h
> @@ -26,6 +26,7 @@ struct mce {
>  	__u32 socketid;	/* CPU socket ID */
>  	__u32 apicid;	/* CPU initial apic ID */
>  	__u64 mcgcap;	/* MCGCAP MSR: machine check capabilities of CPU */
> +	__u64 synd;	/* MCA_SYND MSR: only valid on SMCA systems */
>  };

So why does neither the changelog nor the code comment actually _explain_ this and 
give a bit of background about what 'syndrome information' is and why we want 
to have kernel support for it?

This is why I hate kernel tooling that is not part of the kernel tree - the mcelog 
patch (hopefully ...) would tell us more about all this - but it's separate and 
this patch does not tell us anything ...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/6] x86/mce/AMD: Increase size of bank_map type
  2016-07-08  9:21   ` Ingo Molnar
@ 2016-07-08  9:32     ` Borislav Petkov
  0 siblings, 0 replies; 18+ messages in thread
From: Borislav Petkov @ 2016-07-08  9:32 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: LKML, Aravind Gopalakrishnan, Yazen Ghannam

On Fri, Jul 08, 2016 at 11:21:35AM +0200, Ingo Molnar wrote:
> Btw., is there any check somewhere which printed a helpful warning when we 
> exceeded the 8 banks limit - and which would print a helpful warning if we ever 
> exceed the 32 banks limit?

__mcheck_cpu_cap_init().

And it'll be hard to exceed this limit as there are hw limitations in play.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/6] x86/mce: Add support for new MCA_SYND register
  2016-07-08  9:26   ` Ingo Molnar
@ 2016-07-08  9:37     ` Borislav Petkov
  2016-07-08  9:46       ` Ingo Molnar
  0 siblings, 1 reply; 18+ messages in thread
From: Borislav Petkov @ 2016-07-08  9:37 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: LKML, Yazen Ghannam

On Fri, Jul 08, 2016 at 11:26:59AM +0200, Ingo Molnar wrote:
> So why does neither the changelog nor the code comment actually _explain_ this and 
> give a bit of background about what 'syndrome information' is and why we want 
> to have kernel support for it?
> 
> This is why I hate kernel tooling that is not part of the kernel tree - the mcelog 
> patch (hopefully ...) would tell us more about all this - but it's separate and 
> this patch does not tell us anything ...

Ah, this is one of those omissions where we forgot to explain, sorry.
How about this:

"The syndrome value is used to uniquely identify which bits of a
reported ECC error are corrupted."

Do you want it as a comment in the code or in the commit message or both?

Thanks.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/6] x86/mce: Add support for new MCA_SYND register
  2016-07-08  9:37     ` Borislav Petkov
@ 2016-07-08  9:46       ` Ingo Molnar
  2016-07-08 10:14         ` Borislav Petkov
  0 siblings, 1 reply; 18+ messages in thread
From: Ingo Molnar @ 2016-07-08  9:46 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: LKML, Yazen Ghannam, Thomas Gleixner, H. Peter Anvin, Peter Zijlstra


* Borislav Petkov <bp@alien8.de> wrote:

> On Fri, Jul 08, 2016 at 11:26:59AM +0200, Ingo Molnar wrote:
> > So why does neither the changelog nor the code comment actually _explain_ this and 
> > give a bit of background about what 'syndrome information' is and why we want 
> > to have kernel support for it?
> > 
> > This is why I hate kernel tooling that is not part of the kernel tree - the mcelog 
> > patch (hopefully ...) would tell us more about all this - but it's separate and 
> > this patch does not tell us anything ...
> 
> Ah, this is one of those omissions where we forgot to explain, sorry.
> How about this:
> 
> "The syndrome value is used to uniquely identify which bits of a
> reported ECC error are corrupted."

I'm not sure I can parse that: how can a reported error have bits corrupted?

Or is this about various details about the location of the error (normally 
contained in a 'struct mce' entry), and the 'syndrome value' further qualifies 
that information by telling us which fields of those records are reliable?

I.e. a bit more context would be nice. You cannot go wrong if you assume that 
readers of changelogs (and maintainers in particular) have the attention span
of a slightly retarded golden retriever.

> Do you want it as a comment in the code or in the commit message or both?

I'm fine with an add-on patch that adds a good explanation for all this to the 
code.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/6] x86/mce: Add support for new MCA_SYND register
  2016-07-08  9:46       ` Ingo Molnar
@ 2016-07-08 10:14         ` Borislav Petkov
  2016-07-08 10:26           ` Ingo Molnar
  0 siblings, 1 reply; 18+ messages in thread
From: Borislav Petkov @ 2016-07-08 10:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: LKML, Yazen Ghannam, Thomas Gleixner, H. Peter Anvin, Peter Zijlstra

On Fri, Jul 08, 2016 at 11:46:53AM +0200, Ingo Molnar wrote:
> I'm not sure I can parse that: how can a reported error have bits corrupted?

No, it is about the actual bits in memory the ECC error is generated
for. So, for example, if an ECC error reports that memory location X had
some bit flips, the syndrome value which gets reported together with
the same ECC error shows which actual bits have flipped.

Here's an example from the AMD BKDG, maybe that'll make it more clear:

http://support.amd.com/TechDocs/42301_15h_Mod_00h-0Fh_BKDG.pdf

Go to page 246, there it says this:

"For example, assume the ECC syndrome is 03EAh. First search row EAh
for the complete syndrome. Since it is not found, search row 03h for
the complete syndrome. It is found in column 9h, so symbol 9h has the
error. Since the error bitmask indicates value 3h (0011b), bits 0 and 1
within that symbol are corrupted. Symbol 9h maps to bits 72-79, so the
corrupted bits are 72 and 73 of the line."

So you basically search the table of x8 ECC correctable syndromes, first
in row EAh (second syndrome byte) and if you don't find the complete
syndrome there, you search row 03 for it.

It is in column 9 and that means symbol 9. The symbols are 16 - one
symbol for each byte in a 128bit DRAM word + 3 special symbols for the
ECC bits.

The row number 3h is also the error bitmask, so bits 0 and 1 are the
ones which are corrupted.

Which means, when you look at the value in DRAM at the address the error
was reported, you need to go to symbol 9, that's 9*8 = 72 which means,
bits 72-79 and the first 2 in that byte are bits 72 and 73.

So if you want to correct them, you simply flip them as the syndrome
tells you that those 2 are corrupted.

Ok?

See how easy it is :-)))

> I'm fine with an add-on patch that adds a good explanation for all
> this to the code.

How about we point to that section in the BKDG? I think it is written
pretty understandably for a technical document and the example makes it
even more explicit.

:-)

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/6] x86/mce: Add support for new MCA_SYND register
  2016-07-08 10:14         ` Borislav Petkov
@ 2016-07-08 10:26           ` Ingo Molnar
  2016-07-08 10:48             ` Borislav Petkov
  0 siblings, 1 reply; 18+ messages in thread
From: Ingo Molnar @ 2016-07-08 10:26 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: LKML, Yazen Ghannam, Thomas Gleixner, H. Peter Anvin, Peter Zijlstra


* Borislav Petkov <bp@alien8.de> wrote:

> On Fri, Jul 08, 2016 at 11:46:53AM +0200, Ingo Molnar wrote:
> > I'm not sure I can parse that: how can a reported error have bits corrupted?
> 
> No, it is about the actual bits in memory the ECC error is generated
> for. So, for example, if an ECC error reports that memory location X had
> some bit flips, the syndrome value which gets reported together with
> same ECC error shows which actual bits have flipped.
> 
> Here's an example from the AMD BKDG, maybe that'll make it more clear:
> 
> http://support.amd.com/TechDocs/42301_15h_Mod_00h-0Fh_BKDG.pdf
> 
> Go to page 246, there it says this:
> 
> "For example, assume the ECC syndrome is 03EAh. First search row EAh
> for the complete syndrome. Since it is not found, search row 03h for
> the complete syndrome. It is found in column 9h, so symbol 9h has the
> error. Since the error bitmask indicates value 3h (0011b), bits 0 and 1
> within that symbol are corrupted. Symbol 9h maps to bits 72-79, so the
> corrupted bits are 72 and 73 of the line."
> 
> So you basically search the table of x8 ECC correctable syndromes, first
> in row EAh (second syndrome byte) and if you don't find the complete
> syndrome there, you search row 03 for it.
> 
> It is in column 9 and that means symbol 9. The symbols are 16 - one
> symbol for each byte in a 128bit DRAM word + 3 special symbols for the
> ECC bits.
> 
> The row number 3h is also the error bitmask, so bits 0 and 1 are the
> ones which are corrupted.
> 
> Which means, when you look at the value in DRAM at the address the error
> was reported, you need to go to symbol 9, that's 9*8 = 72 which means,
> bits 72-79 and the first 2 in that byte are bits 72 and 73.
> 
> So if you want to correct them, you simply flip them as the syndrome
> tells you that those 2 are corrupted.
> 
> Ok?

So is 'ECC syndrome' a fancy word and a complicated process for identifying what 
data got corrupted, in a more accurate fashion than what we had before?

Because previously we already had a memory address of the memory corruption, 
right?

What is the typical 'scope' of that memory corruption address - a cache line, a 
machine word, a byte or maybe a variable unit that is memory hardware dependent?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/6] x86/mce: Add support for new MCA_SYND register
  2016-07-08 10:26           ` Ingo Molnar
@ 2016-07-08 10:48             ` Borislav Petkov
  0 siblings, 0 replies; 18+ messages in thread
From: Borislav Petkov @ 2016-07-08 10:48 UTC (permalink / raw)
  To: Ingo Molnar, Yazen Ghannam
  Cc: LKML, Thomas Gleixner, H. Peter Anvin, Peter Zijlstra

On Fri, Jul 08, 2016 at 12:26:48PM +0200, Ingo Molnar wrote:
> So is 'ECC syndrome' a fancy word and a complicated process for
> identifying what data got corrupted, in a more accurate fashion than
> what we had before?

The syndrome has always been there - even since K8 at least. This patch
is simply adding the change that on SMCA systems it should be read from
a different MSR.

The syndrome is part of the magic math behind Error Correction Codes
which can be used to point to which bits in the word in that memory
address were flipped.

OOOOh wait a minute!

I'm just getting the sickest idea:

@Yazen, is that SMCA syndrome max 16 bits on SMCA? Because if so - and I
would bet good money it is so - then we can stuff it into its old place
in the MCI_STATUS register part of struct mce, i.e. mce->status.

And then you won't need to touch the tracepoint and any of that.

Because you do:

	rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd)

and I'll venture a good guess that that whole 64 bits MSR is not the
syndrome.

Right?

If I'm right, all those patches adding syndrome support need to be
reworked.

> Because previously we already had a memory address of the memory
> corruption, right?

We've always had the address and the syndrome. The syndrome is in
MCI_STATUS on older machines.

> What is the typical 'scope' of that memory corruption address - a
> cache line, a machine word, a byte or maybe a variable unit that is
> memory hardware dependent?

Typically 128 bits, as the example above shows. The syndrome covers the
whole 128 bits. AFAIR(!), DRAM accesses are always done in 128-bit words
even if less is being read. All nicely hidden by the DRAM controller.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [tip:ras/core] x86/mce/AMD: Increase size of the bank_map type
  2016-07-08  9:09 ` [PATCH 1/6] x86/mce/AMD: Increase size of bank_map type Borislav Petkov
  2016-07-08  9:21   ` Ingo Molnar
@ 2016-07-08 12:05   ` tip-bot for Aravind Gopalakrishnan
  1 sibling, 0 replies; 18+ messages in thread
From: tip-bot for Aravind Gopalakrishnan @ 2016-07-08 12:05 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: brgerst, dvlasenk, hpa, peterz, mingo, Yazen.Ghannam,
	aravindksg.lkml, torvalds, linux-kernel, Aravind.Gopalakrishnan,
	tony.luck, linux-edac, luto, jpoimboe, tglx, bp, bp

Commit-ID:  955d1427a91b18f53e082bd7c19c40ce13b0a0f4
Gitweb:     http://git.kernel.org/tip/955d1427a91b18f53e082bd7c19c40ce13b0a0f4
Author:     Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
AuthorDate: Fri, 8 Jul 2016 11:09:38 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Fri, 8 Jul 2016 11:29:25 +0200

x86/mce/AMD: Increase size of the bank_map type

Change bank_map type from 'char' to 'int' since we now have more than eight
banks in a system.

Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Signed-off-by: Yazen Ghannam <Yazen.Ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aravind Gopalakrishnan <aravindksg.lkml@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1467968983-4874-2-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 10b0661..7b7f3be 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -93,7 +93,7 @@ const char * const amd_df_mcablock_names[] = {
 EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
 
 static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
-static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */
+static DEFINE_PER_CPU(unsigned int, bank_map);	/* see which banks are on */
 
 static void amd_threshold_interrupt(void);
 static void amd_deferred_error_interrupt(void);

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [tip:ras/core] x86/RAS/AMD: Reduce the number of IPIs when prepping error injection
  2016-07-08  9:09 ` [PATCH 2/6] x86/RAS/AMD: Reduce number of IPIs when prepping error injection Borislav Petkov
@ 2016-07-08 12:06   ` tip-bot for Yazen Ghannam
  0 siblings, 0 replies; 18+ messages in thread
From: tip-bot for Yazen Ghannam @ 2016-07-08 12:06 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: luto, brgerst, peterz, linux-edac, torvalds, tglx, hpa,
	Yazen.Ghannam, tony.luck, jpoimboe, linux-kernel, dvlasenk,
	mingo, bp, bp, aravindksg.lkml

Commit-ID:  340e983ab8afd02b59d698dd1365d7773bf136b3
Gitweb:     http://git.kernel.org/tip/340e983ab8afd02b59d698dd1365d7773bf136b3
Author:     Yazen Ghannam <Yazen.Ghannam@amd.com>
AuthorDate: Fri, 8 Jul 2016 11:09:39 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Fri, 8 Jul 2016 11:29:26 +0200

x86/RAS/AMD: Reduce the number of IPIs when prepping error injection

We currently use wrmsr_on_cpu() 4 times when prepping for an error
injection. This will generate 4 IPIs, one for each MSR write. We can
reduce the number of IPIs to 1 by grouping the MSR writes and executing
them serially on the appropriate CPU.

Suggested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Yazen Ghannam <Yazen.Ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aravind Gopalakrishnan <aravindksg.lkml@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1467968983-4874-3-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ras/mce_amd_inj.c | 58 ++++++++++++++++++++++------------------------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
index e69f470..1104515 100644
--- a/arch/x86/ras/mce_amd_inj.c
+++ b/arch/x86/ras/mce_amd_inj.c
@@ -241,6 +241,31 @@ static void toggle_nb_mca_mst_cpu(u16 nid)
 		       __func__, PCI_FUNC(F3->devfn), NBCFG);
 }
 
+static void prepare_msrs(void *info)
+{
+	struct mce i_mce = *(struct mce *)info;
+	u8 b = i_mce.bank;
+
+	wrmsrl(MSR_IA32_MCG_STATUS, i_mce.mcgstatus);
+
+	if (boot_cpu_has(X86_FEATURE_SMCA)) {
+		if (i_mce.inject_flags == DFR_INT_INJ) {
+			wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), i_mce.status);
+			wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), i_mce.addr);
+		} else {
+			wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), i_mce.status);
+			wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), i_mce.addr);
+		}
+
+		wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc);
+	} else {
+		wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status);
+		wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr);
+		wrmsrl(MSR_IA32_MCx_MISC(b), i_mce.misc);
+	}
+
+}
+
 static void do_inject(void)
 {
 	u64 mcg_status = 0;
@@ -287,36 +312,9 @@ static void do_inject(void)
 
 	toggle_hw_mce_inject(cpu, true);
 
-	wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS,
-		     (u32)mcg_status, (u32)(mcg_status >> 32));
-
-	if (boot_cpu_has(X86_FEATURE_SMCA)) {
-		if (inj_type == DFR_INT_INJ) {
-			wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DESTAT(b),
-				     (u32)i_mce.status, (u32)(i_mce.status >> 32));
-
-			wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DEADDR(b),
-				     (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
-		} else {
-			wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_STATUS(b),
-				     (u32)i_mce.status, (u32)(i_mce.status >> 32));
-
-			wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_ADDR(b),
-				     (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
-		}
-
-		wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(b),
-			     (u32)i_mce.misc, (u32)(i_mce.misc >> 32));
-	} else {
-		wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b),
-			     (u32)i_mce.status, (u32)(i_mce.status >> 32));
-
-		wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b),
-			     (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
-
-		wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b),
-			     (u32)i_mce.misc, (u32)(i_mce.misc >> 32));
-	}
+	i_mce.mcgstatus = mcg_status;
+	i_mce.inject_flags = inj_type;
+	smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
 
 	toggle_hw_mce_inject(cpu, false);
 

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [tip:ras/core] x86/mce: Fix mce_rdmsrl() warning message
  2016-07-08  9:09 ` [PATCH 4/6] x86/mce: Fix mce_rdmsrl() warning message Borislav Petkov
@ 2016-07-08 12:06   ` tip-bot for Borislav Petkov
  0 siblings, 0 replies; 18+ messages in thread
From: tip-bot for Borislav Petkov @ 2016-07-08 12:06 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: torvalds, hpa, tony.luck, bp, peterz, brgerst, tglx, bp,
	linux-kernel, dvlasenk, luto, jpoimboe, mingo

Commit-ID:  38c54ccb2ded3e93d8a353baeb7b9e12e1b77e23
Gitweb:     http://git.kernel.org/tip/38c54ccb2ded3e93d8a353baeb7b9e12e1b77e23
Author:     Borislav Petkov <bp@suse.de>
AuthorDate: Fri, 8 Jul 2016 11:09:41 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Fri, 8 Jul 2016 11:29:26 +0200

x86/mce: Fix mce_rdmsrl() warning message

The MSR address we're dumping in there should be in hex, otherwise we
get funsies like:

  [    0.016000] WARNING: CPU: 1 PID: 0 at arch/x86/kernel/cpu/mcheck/mce.c:428 mce_rdmsrl+0xd9/0xe0
  [    0.016000] mce: Unable to read msr -1073733631!
				       ^^^^^^^^^^^

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1467968983-4874-5-git-send-email-bp@alien8.de
[ Fixed capitalization of 'MSR'. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 92e5e37..58af630 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -425,7 +425,7 @@ static u64 mce_rdmsrl(u32 msr)
 	}
 
 	if (rdmsrl_safe(msr, &v)) {
-		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
+		WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
 		/*
 		 * Return zero in case the access faulted. This should
 		 * not happen normally but can happen if the CPU does

^ permalink raw reply related	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2016-07-08 12:07 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-08  9:09 [PATCH 0/6] x86/RAS queue Borislav Petkov
2016-07-08  9:09 ` [PATCH 1/6] x86/mce/AMD: Increase size of bank_map type Borislav Petkov
2016-07-08  9:21   ` Ingo Molnar
2016-07-08  9:32     ` Borislav Petkov
2016-07-08 12:05   ` [tip:ras/core] x86/mce/AMD: Increase size of the " tip-bot for Aravind Gopalakrishnan
2016-07-08  9:09 ` [PATCH 2/6] x86/RAS/AMD: Reduce number of IPIs when prepping error injection Borislav Petkov
2016-07-08 12:06   ` [tip:ras/core] x86/RAS/AMD: Reduce the " tip-bot for Yazen Ghannam
2016-07-08  9:09 ` [PATCH 3/6] x86/mce: Add support for new MCA_SYND register Borislav Petkov
2016-07-08  9:26   ` Ingo Molnar
2016-07-08  9:37     ` Borislav Petkov
2016-07-08  9:46       ` Ingo Molnar
2016-07-08 10:14         ` Borislav Petkov
2016-07-08 10:26           ` Ingo Molnar
2016-07-08 10:48             ` Borislav Petkov
2016-07-08  9:09 ` [PATCH 4/6] x86/mce: Fix mce_rdmsrl() warning message Borislav Petkov
2016-07-08 12:06   ` [tip:ras/core] " tip-bot for Borislav Petkov
2016-07-08  9:09 ` [PATCH 5/6] EDAC, mce_amd: Print syndrome register value on SMCA systems Borislav Petkov
2016-07-08  9:09 ` [PATCH 6/6] x86/RAS: Add syndrome support to mce_amd_inj Borislav Petkov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.