[PATCH] x86: show remote CPU state upon fatal NMI

* [PATCH] x86: show remote CPU state upon fatal NMI
@ 2016-06-14 14:33 Jan Beulich
  2016-06-14 15:03 ` Andrew Cooper
  0 siblings, 1 reply; 6+ messages in thread
From: Jan Beulich @ 2016-06-14 14:33 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

[-- Attachment #1: Type: text/plain, Size: 3339 bytes --]

Quite frequently the watchdog would hit an innocent CPU, e.g. one
trying to acquire a spin lock a remote CPU holds for extended periods
of time, or a random CPU in TSC calbration rendezvous. In such cases
the register and stack dump for that CPU doesn't really help in the
analysis of the problem.

To keep things reasonable on large systems, only log CS:RIP by default.
This can be overridden via a new extension to the "nmi=" command line
option such that full register/stack state will get dumped.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1155,7 +1155,7 @@ Use the MWAIT idle driver (with model sp
 of the ACPI based one.
 
 ### nmi
-> `= ignore | dom0 | fatal`
+> `= ignore | dom0 | fatal [,show-all]`
 
 > Default: `fatal` for a debug build, or `dom0` for a non-debug build
 
@@ -1163,6 +1163,9 @@ Specify what Xen should do in the event
 `ignore` discards the error; `dom0` causes Xen to report the error to
 dom0, while 'fatal' causes Xen to print diagnostics and then hang.
 
+The `show-all` modifier forces all CPUs' full state to be dumped upon
+fatal NMIs (normally a result of the watchdog kicking in).
+
 ### noapic
 
 Instruct Xen to ignore any IOAPICs that are present in the system, and
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -84,10 +84,11 @@
  *  dom0:   The NMI is virtualised to DOM0.
  *  ignore: The NMI error is cleared and ignored.
  */
+static char __read_mostly opt_nmi[16] =
 #ifdef NDEBUG
-static char __read_mostly opt_nmi[10] = "dom0";
+ "dom0";
 #else
-static char __read_mostly opt_nmi[10] = "fatal";
+ "fatal";
 #endif
 string_param("nmi", opt_nmi);
 
@@ -525,6 +526,35 @@ void vcpu_show_execution_state(struct vc
     vcpu_unpause(v);
 }
 
+static cpumask_t nmi_show_state_mask;
+static bool_t opt_nmi_show_all;
+
+static int __init get_nmi_show_all(void)
+{
+    const char *s = strchr(opt_nmi, ',');
+
+    if ( s && !strcmp(s + 1, "show-all") )
+        opt_nmi_show_all = 1;
+
+    return 0;
+}
+presmp_initcall(get_nmi_show_all);
+
+static int nmi_show_execution_state(const struct cpu_user_regs *regs, int cpu)
+{
+    if ( !cpumask_test_cpu(cpu, &nmi_show_state_mask) )
+        return 0;
+
+    if ( opt_nmi_show_all )
+        show_execution_state(regs);
+    else
+        printk(XENLOG_ERR "CPU%d @ %04x:%08lx (%pS)\n", cpu, regs->cs, regs->rip,
+               guest_mode(regs) ? _p(regs->rip) : NULL);
+    cpumask_clear_cpu(cpu, &nmi_show_state_mask);
+
+    return 1;
+}
+
 static const char *trapstr(unsigned int trapnr)
 {
     static const char * const strings[] = {
@@ -570,6 +600,15 @@ void fatal_trap(const struct cpu_user_re
             printk("Faulting linear address: %p\n", _p(cr2));
             show_page_walk(cr2);
         }
+        else if ( trapnr == TRAP_nmi )
+        {
+            cpumask_andnot(&nmi_show_state_mask, &cpu_online_map,
+                           cpumask_of(smp_processor_id()));
+            set_nmi_callback(nmi_show_execution_state);
+            smp_send_nmi_allbutself();
+            while ( !cpumask_empty(&nmi_show_state_mask) )
+                cpu_relax();
+        }
     }
 
     panic("FATAL TRAP: vector = %d (%s)\n"




[-- Attachment #2: x86-NMI-show-global-state.patch --]
[-- Type: text/plain, Size: 3378 bytes --]

x86: show remote CPU state upon fatal NMI

Quite frequently the watchdog would hit an innocent CPU, e.g. one
trying to acquire a spin lock a remote CPU holds for extended periods
of time, or a random CPU in TSC calbration rendezvous. In such cases
the register and stack dump for that CPU doesn't really help in the
analysis of the problem.

To keep things reasonable on large systems, only log CS:RIP by default.
This can be overridden via a new extension to the "nmi=" command line
option such that full register/stack state will get dumped.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1155,7 +1155,7 @@ Use the MWAIT idle driver (with model sp
 of the ACPI based one.
 
 ### nmi
-> `= ignore | dom0 | fatal`
+> `= ignore | dom0 | fatal [,show-all]`
 
 > Default: `fatal` for a debug build, or `dom0` for a non-debug build
 
@@ -1163,6 +1163,9 @@ Specify what Xen should do in the event
 `ignore` discards the error; `dom0` causes Xen to report the error to
 dom0, while 'fatal' causes Xen to print diagnostics and then hang.
 
+The `show-all` modifier forces all CPUs' full state to be dumped upon
+fatal NMIs (normally a result of the watchdog kicking in).
+
 ### noapic
 
 Instruct Xen to ignore any IOAPICs that are present in the system, and
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -84,10 +84,11 @@
  *  dom0:   The NMI is virtualised to DOM0.
  *  ignore: The NMI error is cleared and ignored.
  */
+static char __read_mostly opt_nmi[16] =
 #ifdef NDEBUG
-static char __read_mostly opt_nmi[10] = "dom0";
+ "dom0";
 #else
-static char __read_mostly opt_nmi[10] = "fatal";
+ "fatal";
 #endif
 string_param("nmi", opt_nmi);
 
@@ -525,6 +526,35 @@ void vcpu_show_execution_state(struct vc
     vcpu_unpause(v);
 }
 
+static cpumask_t nmi_show_state_mask;
+static bool_t opt_nmi_show_all;
+
+static int __init get_nmi_show_all(void)
+{
+    const char *s = strchr(opt_nmi, ',');
+
+    if ( s && !strcmp(s + 1, "show-all") )
+        opt_nmi_show_all = 1;
+
+    return 0;
+}
+presmp_initcall(get_nmi_show_all);
+
+static int nmi_show_execution_state(const struct cpu_user_regs *regs, int cpu)
+{
+    if ( !cpumask_test_cpu(cpu, &nmi_show_state_mask) )
+        return 0;
+
+    if ( opt_nmi_show_all )
+        show_execution_state(regs);
+    else
+        printk(XENLOG_ERR "CPU%d @ %04x:%08lx (%pS)\n", cpu, regs->cs, regs->rip,
+               guest_mode(regs) ? _p(regs->rip) : NULL);
+    cpumask_clear_cpu(cpu, &nmi_show_state_mask);
+
+    return 1;
+}
+
 static const char *trapstr(unsigned int trapnr)
 {
     static const char * const strings[] = {
@@ -570,6 +600,15 @@ void fatal_trap(const struct cpu_user_re
             printk("Faulting linear address: %p\n", _p(cr2));
             show_page_walk(cr2);
         }
+        else if ( trapnr == TRAP_nmi )
+        {
+            cpumask_andnot(&nmi_show_state_mask, &cpu_online_map,
+                           cpumask_of(smp_processor_id()));
+            set_nmi_callback(nmi_show_execution_state);
+            smp_send_nmi_allbutself();
+            while ( !cpumask_empty(&nmi_show_state_mask) )
+                cpu_relax();
+        }
     }
 
     panic("FATAL TRAP: vector = %d (%s)\n"

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 6+ messages in thread