All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
@ 2009-09-09  2:28 Huang Ying
  2009-09-09 12:06 ` Avi Kivity
  2009-09-16 17:59 ` Marcelo Tosatti
  0 siblings, 2 replies; 15+ messages in thread
From: Huang Ying @ 2009-09-09  2:28 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andi Kleen, Anthony Liguori, kvm

UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
where some hardware error such as some memory error can be reported
without PCC (processor context corrupted). To recover from such MCE,
the corresponding memory will be unmapped, and all processes accessing
the memory will be killed via SIGBUS.

For KVM, if QEMU/KVM is killed, all guest processes will be killed
too. So we relay SIGBUS from host OS to guest system via a UCR MCE
injection. Then guest OS can isolate corresponding memory and kill
necessary guest processes only. SIGBUS sent to main thread (not VCPU
threads) will be broadcast to all VCPU threads as UCR MCE.

v2:

- Use qemu_ram_addr_from_host instead of self made one to covert from
  host address to guest RAM address. Thanks Anthony Liguori.

Signed-off-by: Huang Ying <ying.huang@intel.com>

---
 cpu-common.h      |    1 
 exec.c            |   20 +++++--
 qemu-kvm.c        |  154 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 target-i386/cpu.h |   20 ++++++-
 4 files changed, 178 insertions(+), 17 deletions(-)

--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -27,10 +27,23 @@
 #include <sys/mman.h>
 #include <sys/ioctl.h>
 #include <signal.h>
+#include <sys/signalfd.h>
+#include <sys/prctl.h>
 
 #define false 0
 #define true 1
 
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
 #define EXPECTED_KVM_API_VERSION 12
 
 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
@@ -1507,6 +1520,37 @@ static void sig_ipi_handler(int n)
 {
 }
 
+static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
+{
+    if (siginfo->ssi_code == BUS_MCEERR_AO) {
+        uint64_t status;
+        unsigned long paddr;
+        CPUState *cenv;
+
+        /* Hope we are lucky for AO MCE */
+        if (do_qemu_ram_addr_from_host((void *)siginfo->ssi_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!: %llx\n",
+                    (unsigned long long)siginfo->ssi_addr);
+            return;
+        }
+        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+            | 0xc0;
+        kvm_inject_x86_mce(first_cpu, 9, status,
+                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
+                           (MCM_ADDR_PHYS << 6) | 0xc);
+        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
+            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0);
+        return;
+    } else if (siginfo->ssi_code == BUS_MCEERR_AR)
+        fprintf(stderr, "Hardware memory error!\n");
+    else
+        fprintf(stderr, "Internal error in QEMU!\n");
+    exit(1);
+}
+
 static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
 {
     struct qemu_work_item wi;
@@ -1649,29 +1693,102 @@ static void flush_queued_work(CPUState *
     pthread_cond_broadcast(&qemu_work_cond);
 }
 
+static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
+{
+#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
+    struct kvm_x86_mce mce = {
+            .bank = 9,
+    };
+    unsigned long paddr;
+    int r;
+
+    if (env->mcg_cap && siginfo->si_addr
+        && (siginfo->si_code == BUS_MCEERR_AR
+            || siginfo->si_code == BUS_MCEERR_AO)) {
+        if (siginfo->si_code == BUS_MCEERR_AR) {
+            /* Fake an Intel architectural Data Load SRAR UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | MCI_STATUS_AR | 0x134;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+        } else {
+            /* Fake an Intel architectural Memory scrubbing UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | 0xc0;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+        }
+        if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instaed of guest system!\n");
+            /* Hope we are lucky for AO MCE */
+            if (siginfo->si_code == BUS_MCEERR_AO)
+                return;
+            else
+                exit(1);
+        }
+        mce.addr = paddr;
+        r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce);
+        if (r < 0) {
+            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+            exit(1);
+        }
+    } else
+#endif
+    {
+        if (siginfo->si_code == BUS_MCEERR_AO)
+            return;
+        if (siginfo->si_code == BUS_MCEERR_AR)
+            fprintf(stderr, "Hardware memory error!\n");
+        else
+            fprintf(stderr, "Internal error in QEMU!\n");
+        exit(1);
+    }
+}
+
 static void kvm_main_loop_wait(CPUState *env, int timeout)
 {
     struct timespec ts;
     int r, e;
     siginfo_t siginfo;
     sigset_t waitset;
-
-    pthread_mutex_unlock(&qemu_mutex);
+    sigset_t chkset;
 
     ts.tv_sec = timeout / 1000;
     ts.tv_nsec = (timeout % 1000) * 1000000;
     sigemptyset(&waitset);
     sigaddset(&waitset, SIG_IPI);
+    sigaddset(&waitset, SIGBUS);
 
-    r = sigtimedwait(&waitset, &siginfo, &ts);
-    e = errno;
+    do {
+        pthread_mutex_unlock(&qemu_mutex);
 
-    pthread_mutex_lock(&qemu_mutex);
+        r = sigtimedwait(&waitset, &siginfo, &ts);
+        e = errno;
 
-    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
-        printf("sigtimedwait: %s\n", strerror(e));
-        exit(1);
-    }
+        pthread_mutex_lock(&qemu_mutex);
+
+        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+            printf("sigtimedwait: %s\n", strerror(e));
+            exit(1);
+        }
+
+        switch (r) {
+        case SIGBUS:
+            kvm_on_sigbus(env, &siginfo);
+            break;
+        default:
+            break;
+        }
+
+        r = sigpending(&chkset);
+        if (r == -1) {
+            printf("sigpending: %s\n", strerror(e));
+            exit(1);
+        }
+    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 
     cpu_single_env = env;
     flush_queued_work(env);
@@ -1752,6 +1869,7 @@ static void setup_kernel_sigmask(CPUStat
 
     sigprocmask(SIG_BLOCK, NULL, &set);
     sigdelset(&set, SIG_IPI);
+    sigdelset(&set, SIGBUS);
 
     kvm_set_signal_mask(env->kvm_cpu_state.vcpu_ctx, &set);
 }
@@ -1877,12 +1995,20 @@ void kvm_hpet_enable_kpit(void)
 
 int kvm_init_ap(void)
 {
+    struct sigaction action;
+
 #ifdef TARGET_I386
     kvm_tpr_opt_setup();
 #endif
     qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
 
     signal(SIG_IPI, sig_ipi_handler);
+
+    memset(&action, 0, sizeof(action));
+    action.sa_flags = SA_SIGINFO;
+    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+    sigaction(SIGBUS, &action, NULL);
+    prctl(PR_MCE_KILL, 1, 1);
     return 0;
 }
 
@@ -1943,7 +2069,10 @@ static void sigfd_handler(void *opaque)
         }
 
         sigaction(info.ssi_signo, NULL, &action);
-        if (action.sa_handler)
+        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction)
+            action.sa_sigaction(info.ssi_signo,
+                                (siginfo_t *)&info, NULL);
+	else if (action.sa_handler)
             action.sa_handler(info.ssi_signo);
 
     }
@@ -1993,6 +2122,7 @@ int kvm_main_loop(void)
     sigemptyset(&mask);
     sigaddset(&mask, SIGIO);
     sigaddset(&mask, SIGALRM);
+    sigaddset(&mask, SIGBUS);
     sigprocmask(SIG_BLOCK, &mask, NULL);
 
     sigfd = qemu_signalfd(&mask);
@@ -2518,6 +2648,10 @@ void kvm_inject_x86_mce(CPUState *cenv, 
         .mce = &mce,
     };
 
+    if (!cenv->mcg_cap) {
+        fprintf(stderr, "MCE support is not enabled!\n");
+        return;
+    }
     on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
 #endif
 }
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -250,16 +250,32 @@
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
-#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
+#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
+#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
 
-#define MCE_CAP_DEF	MCG_CTL_P
+#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
 #define MCE_BANKS_DEF	10
 
+#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
 
 #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
 #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
 #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF	0	/* segment offset */
+#define MCM_ADDR_LINEAR	1	/* linear address */
+#define MCM_ADDR_PHYS	2	/* physical address */
+#define MCM_ADDR_MEM	3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 #define MSR_IA32_TSC                    0x10
 #define MSR_IA32_APICBASE               0x1b
--- a/cpu-common.h
+++ b/cpu-common.h
@@ -34,6 +34,7 @@ void qemu_ram_free(ram_addr_t addr);
 /* This should only be used for ram local to a device.  */
 void *qemu_get_ram_ptr(ram_addr_t addr);
 /* This should not be used by devices.  */
+int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
 ram_addr_t qemu_ram_addr_from_host(void *ptr);
 
 int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
--- a/exec.c
+++ b/exec.c
@@ -2589,9 +2589,7 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
     return block->host + (addr - block->offset);
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr)
+int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
 {
     RAMBlock *prev;
     RAMBlock **prevp;
@@ -2608,11 +2606,23 @@ ram_addr_t qemu_ram_addr_from_host(void 
         prev = block;
         block = block->next;
     }
-    if (!block) {
+    if (!block)
+        return -1;
+    *ram_addr = block->offset + (host - block->host);
+    return 0;
+}
+
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+ram_addr_t qemu_ram_addr_from_host(void *ptr)
+{
+    ram_addr_t ram_addr;
+
+    if (do_qemu_ram_addr_from_host(ptr, &ram_addr)) {
         fprintf(stderr, "Bad ram pointer %p\n", ptr);
         abort();
     }
-    return block->offset + (host - block->host);
+    return ram_addr;
 }
 
 static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-09  2:28 [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest Huang Ying
@ 2009-09-09 12:06 ` Avi Kivity
  2009-09-09 12:16   ` Avi Kivity
  2009-09-10  2:40   ` Huang Ying
  2009-09-16 17:59 ` Marcelo Tosatti
  1 sibling, 2 replies; 15+ messages in thread
From: Avi Kivity @ 2009-09-09 12:06 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andi Kleen, Anthony Liguori, kvm

On 09/09/2009 05:28 AM, Huang Ying wrote:
> UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
> where some hardware error such as some memory error can be reported
> without PCC (processor context corrupted). To recover from such MCE,
> the corresponding memory will be unmapped, and all processes accessing
> the memory will be killed via SIGBUS.
>
> For KVM, if QEMU/KVM is killed, all guest processes will be killed
> too. So we relay SIGBUS from host OS to guest system via a UCR MCE
> injection. Then guest OS can isolate corresponding memory and kill
> necessary guest processes only. SIGBUS sent to main thread (not VCPU
> threads) will be broadcast to all VCPU threads as UCR MCE.
>
> v2:
>
> - Use qemu_ram_addr_from_host instead of self made one to covert from
>    host address to guest RAM address. Thanks Anthony Liguori.
>
>    

Patch looks good, but can you clarify the following:

> @@ -1877,12 +1995,20 @@ void kvm_hpet_enable_kpit(void)
>
>   int kvm_init_ap(void)
>   {
> +    struct sigaction action;
> +
>   #ifdef TARGET_I386
>       kvm_tpr_opt_setup();
>   #endif
>       qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
>
>       signal(SIG_IPI, sig_ipi_handler);
> +
> +    memset(&action, 0, sizeof(action));
> +    action.sa_flags = SA_SIGINFO;
> +    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
> +    sigaction(SIGBUS,&action, NULL);
> +    prctl(PR_MCE_KILL, 1, 1);
>       return 0;
>   }
>    

Why do we need a SIGBUS handler?  kvm vcpu threads will block and 
dequeue a SIGBUG in guest mode, so the handler will never be called, and 
we can't really handle SIGBUS in user mode.

(also, I if we can't handle guest-mode SIGBUS I think it would be nice 
to raise it again so the process terminates due to the SIGBUS).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-09 12:06 ` Avi Kivity
@ 2009-09-09 12:16   ` Avi Kivity
  2009-09-09 12:18     ` Avi Kivity
  2009-09-10  2:40   ` Huang Ying
  1 sibling, 1 reply; 15+ messages in thread
From: Avi Kivity @ 2009-09-09 12:16 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andi Kleen, Anthony Liguori, kvm

On 09/09/2009 03:06 PM, Avi Kivity wrote:
> On 09/09/2009 05:28 AM, Huang Ying wrote:
>> UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
>> where some hardware error such as some memory error can be reported
>> without PCC (processor context corrupted). To recover from such MCE,
>> the corresponding memory will be unmapped, and all processes accessing
>> the memory will be killed via SIGBUS.
>>
>> For KVM, if QEMU/KVM is killed, all guest processes will be killed
>> too. So we relay SIGBUS from host OS to guest system via a UCR MCE
>> injection. Then guest OS can isolate corresponding memory and kill
>> necessary guest processes only. SIGBUS sent to main thread (not VCPU
>> threads) will be broadcast to all VCPU threads as UCR MCE.
>>
>> v2:
>>
>> - Use qemu_ram_addr_from_host instead of self made one to covert from
>>    host address to guest RAM address. Thanks Anthony Liguori.
>>
>
> Patch looks good, but can you clarify the following:

Oh and I forgot - please make MCE injection optional.  Some 
installations may prefer to kill the guest and have the virtualization 
management system restart it rather than trust the guest to handle the 
MCE properly.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-09 12:16   ` Avi Kivity
@ 2009-09-09 12:18     ` Avi Kivity
  0 siblings, 0 replies; 15+ messages in thread
From: Avi Kivity @ 2009-09-09 12:18 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andi Kleen, Anthony Liguori, kvm

On 09/09/2009 03:16 PM, Avi Kivity wrote:
> Oh and I forgot - please make MCE injection optional.  Some 
> installations may prefer to kill the guest and have the virtualization 
> management system restart it rather than trust the guest to handle the 
> MCE properly.
>

Thinking a little more about it, they can configure the guest to 
panic/reboot on MCE rather than kill the affected process if they don't 
trust the guest service control.  So we can enable it unconditionally.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-09 12:06 ` Avi Kivity
  2009-09-09 12:16   ` Avi Kivity
@ 2009-09-10  2:40   ` Huang Ying
  2009-09-10  9:35     ` Andi Kleen
  1 sibling, 1 reply; 15+ messages in thread
From: Huang Ying @ 2009-09-10  2:40 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andi Kleen, Anthony Liguori, kvm

On Wed, 2009-09-09 at 20:06 +0800, Avi Kivity wrote:
[snip] 
> >
> >   int kvm_init_ap(void)
> >   {
> > +    struct sigaction action;
> > +
> >   #ifdef TARGET_I386
> >       kvm_tpr_opt_setup();
> >   #endif
> >       qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
> >
> >       signal(SIG_IPI, sig_ipi_handler);
> > +
> > +    memset(&action, 0, sizeof(action));
> > +    action.sa_flags = SA_SIGINFO;
> > +    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
> > +    sigaction(SIGBUS,&action, NULL);
> > +    prctl(PR_MCE_KILL, 1, 1);
> >       return 0;
> >   }
> >    
> 
> Why do we need a SIGBUS handler?  kvm vcpu threads will block and 
> dequeue a SIGBUG in guest mode, so the handler will never be called, and 
> we can't really handle SIGBUS in user mode.

There are two kinds of SIGBUS, corresponding to two kinds of UCR MCE. 

One kind of UCR MCE is caused by user space (guest mode) read/write, and
will send SIGBUS to vcpu threads. And that will be processed by vcpu
thread sigbus handler just like SIG_IPI processing.

The other kind of UCR MCE is caused by asynchronously detected errors
(e.g. patrol scrubbing), and will send SIGBUS to main thread instead of
vcpu threads.

> (also, I if we can't handle guest-mode SIGBUS I think it would be nice 
> to raise it again so the process terminates due to the SIGBUS).

For SIGBUS we can not relay to guest as MCE, we can either abort or
reset SIGBUS to SIGDFL and re-raise it. Both are OK for me. You prefer
the latter one?

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-10  2:40   ` Huang Ying
@ 2009-09-10  9:35     ` Andi Kleen
  2009-09-14  2:55       ` Huang Ying
  2009-09-14  5:10       ` Avi Kivity
  0 siblings, 2 replies; 15+ messages in thread
From: Andi Kleen @ 2009-09-10  9:35 UTC (permalink / raw)
  To: Huang Ying; +Cc: Avi Kivity, Andi Kleen, Anthony Liguori, kvm

> > (also, I if we can't handle guest-mode SIGBUS I think it would be nice 
> > to raise it again so the process terminates due to the SIGBUS).
> 
> For SIGBUS we can not relay to guest as MCE, we can either abort or
> reset SIGBUS to SIGDFL and re-raise it. Both are OK for me. You prefer
> the latter one?

I think a suitable error message and exit would be better than a plain 
signal kill. It shouldn't look like qemu crashed due to a software
bug. Ideally a error message in a way that it can be parsed by libvirt etc.
and reported in a suitable way.

However qemu getting killed itself is very unlikely, it doesn't
have much memory foot print compared to the guest and other data. 
So this should be a very rare condition.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-10  9:35     ` Andi Kleen
@ 2009-09-14  2:55       ` Huang Ying
  2009-09-14  5:10         ` Avi Kivity
  2009-09-14  5:10       ` Avi Kivity
  1 sibling, 1 reply; 15+ messages in thread
From: Huang Ying @ 2009-09-14  2:55 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andi Kleen, Anthony Liguori, kvm

Hi, Avi,

On Thu, 2009-09-10 at 17:35 +0800, Andi Kleen wrote: 
> > > (also, I if we can't handle guest-mode SIGBUS I think it would be nice 
> > > to raise it again so the process terminates due to the SIGBUS).
> > 
> > For SIGBUS we can not relay to guest as MCE, we can either abort or
> > reset SIGBUS to SIGDFL and re-raise it. Both are OK for me. You prefer
> > the latter one?
> 
> I think a suitable error message and exit would be better than a plain 
> signal kill. It shouldn't look like qemu crashed due to a software
> bug. Ideally a error message in a way that it can be parsed by libvirt etc.
> and reported in a suitable way.

Do you agree with us about SIGBUS processing?

Best Regards,
Huang Ying


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-10  9:35     ` Andi Kleen
  2009-09-14  2:55       ` Huang Ying
@ 2009-09-14  5:10       ` Avi Kivity
  1 sibling, 0 replies; 15+ messages in thread
From: Avi Kivity @ 2009-09-14  5:10 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Huang Ying, Anthony Liguori, kvm

On 09/10/2009 12:35 PM, Andi Kleen wrote:
>>> (also, I if we can't handle guest-mode SIGBUS I think it would be nice
>>> to raise it again so the process terminates due to the SIGBUS).
>>>        
>> For SIGBUS we can not relay to guest as MCE, we can either abort or
>> reset SIGBUS to SIGDFL and re-raise it. Both are OK for me. You prefer
>> the latter one?
>>      
> I think a suitable error message and exit would be better than a plain
> signal kill. It shouldn't look like qemu crashed due to a software
> bug. Ideally a error message in a way that it can be parsed by libvirt etc.
> and reported in a suitable way.
>    

libvirt etc. can/should wait() for qemu to terminate abnormally and 
report the reason why.  However it doesn't seem there is a way to get 
extended signal information from wait(), so it looks like internal 
handling by qemu is better.


> However qemu getting killed itself is very unlikely, it doesn't
> have much memory foot print compared to the guest and other data.
> So this should be a very rare condition.
>    

(I get SIGBUS quite often running qemu from nfs and compiling a new one, 
however that's a really different use case)

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-14  2:55       ` Huang Ying
@ 2009-09-14  5:10         ` Avi Kivity
  2009-09-16  1:09           ` Huang Ying
  0 siblings, 1 reply; 15+ messages in thread
From: Avi Kivity @ 2009-09-14  5:10 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andi Kleen, Anthony Liguori, kvm

On 09/14/2009 05:55 AM, Huang Ying wrote:
> Hi, Avi,
>
> On Thu, 2009-09-10 at 17:35 +0800, Andi Kleen wrote:
>    
>>>> (also, I if we can't handle guest-mode SIGBUS I think it would be nice
>>>> to raise it again so the process terminates due to the SIGBUS).
>>>>          
>>> For SIGBUS we can not relay to guest as MCE, we can either abort or
>>> reset SIGBUS to SIGDFL and re-raise it. Both are OK for me. You prefer
>>> the latter one?
>>>        
>> I think a suitable error message and exit would be better than a plain
>> signal kill. It shouldn't look like qemu crashed due to a software
>> bug. Ideally a error message in a way that it can be parsed by libvirt etc.
>> and reported in a suitable way.
>>      
> Do you agree with us about SIGBUS processing?
>    

Yes.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-14  5:10         ` Avi Kivity
@ 2009-09-16  1:09           ` Huang Ying
  2009-09-16  8:10             ` Avi Kivity
  0 siblings, 1 reply; 15+ messages in thread
From: Huang Ying @ 2009-09-16  1:09 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andi Kleen, Anthony Liguori, kvm

On Mon, 2009-09-14 at 13:10 +0800, Avi Kivity wrote: 
> On 09/14/2009 05:55 AM, Huang Ying wrote:
> > Hi, Avi,
> >
> > On Thu, 2009-09-10 at 17:35 +0800, Andi Kleen wrote:
> >    
> >>>> (also, I if we can't handle guest-mode SIGBUS I think it would be nice
> >>>> to raise it again so the process terminates due to the SIGBUS).
> >>>>          
> >>> For SIGBUS we can not relay to guest as MCE, we can either abort or
> >>> reset SIGBUS to SIGDFL and re-raise it. Both are OK for me. You prefer
> >>> the latter one?
> >>>        
> >> I think a suitable error message and exit would be better than a plain
> >> signal kill. It shouldn't look like qemu crashed due to a software
> >> bug. Ideally a error message in a way that it can be parsed by libvirt etc.
> >> and reported in a suitable way.
> >>      
> > Do you agree with us about SIGBUS processing?
> >    
> 
> Yes.

Do you have some other issue for this patch?

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-16  1:09           ` Huang Ying
@ 2009-09-16  8:10             ` Avi Kivity
  0 siblings, 0 replies; 15+ messages in thread
From: Avi Kivity @ 2009-09-16  8:10 UTC (permalink / raw)
  To: Huang Ying; +Cc: Andi Kleen, Anthony Liguori, kvm, Marcelo Tosatti

On 09/16/2009 04:09 AM, Huang Ying wrote:
> On Mon, 2009-09-14 at 13:10 +0800, Avi Kivity wrote:
>    
>> On 09/14/2009 05:55 AM, Huang Ying wrote:
>>      
>>> Hi, Avi,
>>>
>>> On Thu, 2009-09-10 at 17:35 +0800, Andi Kleen wrote:
>>>
>>>        
>>>>>> (also, I if we can't handle guest-mode SIGBUS I think it would be nice
>>>>>> to raise it again so the process terminates due to the SIGBUS).
>>>>>>
>>>>>>              
>>>>> For SIGBUS we can not relay to guest as MCE, we can either abort or
>>>>> reset SIGBUS to SIGDFL and re-raise it. Both are OK for me. You prefer
>>>>> the latter one?
>>>>>
>>>>>            
>>>> I think a suitable error message and exit would be better than a plain
>>>> signal kill. It shouldn't look like qemu crashed due to a software
>>>> bug. Ideally a error message in a way that it can be parsed by libvirt etc.
>>>> and reported in a suitable way.
>>>>
>>>>          
>>> Do you agree with us about SIGBUS processing?
>>>
>>>        
>> Yes.
>>      
> Do you have some other issue for this patch?
>
>    

No - but Marcelo is committing this week (we are alternating 
maintainership every week), maybe he didn't see this, copying him.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-09  2:28 [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest Huang Ying
  2009-09-09 12:06 ` Avi Kivity
@ 2009-09-16 17:59 ` Marcelo Tosatti
  2009-09-17  1:13   ` Huang Ying
  1 sibling, 1 reply; 15+ messages in thread
From: Marcelo Tosatti @ 2009-09-16 17:59 UTC (permalink / raw)
  To: Huang Ying; +Cc: Avi Kivity, Andi Kleen, Anthony Liguori, kvm

On Wed, Sep 09, 2009 at 10:28:02AM +0800, Huang Ying wrote:
> UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
> where some hardware error such as some memory error can be reported
> without PCC (processor context corrupted). To recover from such MCE,
> the corresponding memory will be unmapped, and all processes accessing
> the memory will be killed via SIGBUS.
> 
> For KVM, if QEMU/KVM is killed, all guest processes will be killed
> too. So we relay SIGBUS from host OS to guest system via a UCR MCE
> injection. Then guest OS can isolate corresponding memory and kill
> necessary guest processes only. SIGBUS sent to main thread (not VCPU
> threads) will be broadcast to all VCPU threads as UCR MCE.
> 
> v2:
> 
> - Use qemu_ram_addr_from_host instead of self made one to covert from
>   host address to guest RAM address. Thanks Anthony Liguori.
> 
> Signed-off-by: Huang Ying <ying.huang@intel.com>
> 
> ---
>  cpu-common.h      |    1 
>  exec.c            |   20 +++++--
>  qemu-kvm.c        |  154 ++++++++++++++++++++++++++++++++++++++++++++++++++----
>  target-i386/cpu.h |   20 ++++++-
>  4 files changed, 178 insertions(+), 17 deletions(-)
> 
> --- a/qemu-kvm.c
> +++ b/qemu-kvm.c
> @@ -27,10 +27,23 @@
>  #include <sys/mman.h>
>  #include <sys/ioctl.h>
>  #include <signal.h>
> +#include <sys/signalfd.h>
> +#include <sys/prctl.h>
>  
>  #define false 0
>  #define true 1
>  
> +#ifndef PR_MCE_KILL
> +#define PR_MCE_KILL 33
> +#endif
> +
> +#ifndef BUS_MCEERR_AR
> +#define BUS_MCEERR_AR 4
> +#endif
> +#ifndef BUS_MCEERR_AO
> +#define BUS_MCEERR_AO 5
> +#endif
> +
>  #define EXPECTED_KVM_API_VERSION 12
>  
>  #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
> @@ -1507,6 +1520,37 @@ static void sig_ipi_handler(int n)
>  {
>  }
>  
> +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
> +{
> +    if (siginfo->ssi_code == BUS_MCEERR_AO) {
> +        uint64_t status;
> +        unsigned long paddr;
> +        CPUState *cenv;
> +
> +        /* Hope we are lucky for AO MCE */
> +        if (do_qemu_ram_addr_from_host((void *)siginfo->ssi_addr, &paddr)) {
> +            fprintf(stderr, "Hardware memory error for memory used by "
> +                    "QEMU itself instead of guest system!: %llx\n",
> +                    (unsigned long long)siginfo->ssi_addr);
> +            return;

qemu-kvm should die here?

> +        }
> +        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +            | 0xc0;
> +        kvm_inject_x86_mce(first_cpu, 9, status,
> +                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
> +                           (MCM_ADDR_PHYS << 6) | 0xc);
> +        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
> +            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
> +                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0);
> +        return;

Should abort if kvm_inject_x86_mce fails?

> +    } else if (siginfo->ssi_code == BUS_MCEERR_AR)
> +        fprintf(stderr, "Hardware memory error!\n");
> +    else
> +        fprintf(stderr, "Internal error in QEMU!\n");

Can you re-raise SIGBUS so you we get a coredump on non-MCE SIGBUS as
usual?

> +    exit(1);
> +}
> +
>  static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
>  {
>      struct qemu_work_item wi;
> @@ -1649,29 +1693,102 @@ static void flush_queued_work(CPUState *
>      pthread_cond_broadcast(&qemu_work_cond);
>  }
>  
> +static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
> +{
> +#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
> +    struct kvm_x86_mce mce = {
> +            .bank = 9,
> +    };
> +    unsigned long paddr;
> +    int r;
> +
> +    if (env->mcg_cap && siginfo->si_addr
> +        && (siginfo->si_code == BUS_MCEERR_AR
> +            || siginfo->si_code == BUS_MCEERR_AO)) {
> +        if (siginfo->si_code == BUS_MCEERR_AR) {
> +            /* Fake an Intel architectural Data Load SRAR UCR */
> +            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +                | MCI_STATUS_AR | 0x134;
> +            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
> +            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
> +        } else {
> +            /* Fake an Intel architectural Memory scrubbing UCR */
> +            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +                | 0xc0;
> +            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
> +            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
> +        }
> +        if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
> +            fprintf(stderr, "Hardware memory error for memory used by "
> +                    "QEMU itself instaed of guest system!\n");
> +            /* Hope we are lucky for AO MCE */
> +            if (siginfo->si_code == BUS_MCEERR_AO)
> +                return;

Should die?

> +            else
> +                exit(1);
> +        }
> +        mce.addr = paddr;
> +        r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce);
> +        if (r < 0) {
> +            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
> +            exit(1);
> +        }
> +    } else
> +#endif
> +    {
> +        if (siginfo->si_code == BUS_MCEERR_AO)
> +            return;
> +        if (siginfo->si_code == BUS_MCEERR_AR)
> +            fprintf(stderr, "Hardware memory error!\n");
> +        else
> +            fprintf(stderr, "Internal error in QEMU!\n");
> +        exit(1);
> +    }
> +}


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-16 17:59 ` Marcelo Tosatti
@ 2009-09-17  1:13   ` Huang Ying
  2009-09-17 21:36     ` Marcelo Tosatti
  0 siblings, 1 reply; 15+ messages in thread
From: Huang Ying @ 2009-09-17  1:13 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, Andi Kleen, Anthony Liguori, kvm

On Thu, 2009-09-17 at 01:59 +0800, Marcelo Tosatti wrote: 
> On Wed, Sep 09, 2009 at 10:28:02AM +0800, Huang Ying wrote:
> > UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
> > where some hardware error such as some memory error can be reported
> > without PCC (processor context corrupted). To recover from such MCE,
> > the corresponding memory will be unmapped, and all processes accessing
> > the memory will be killed via SIGBUS.
> > 
> > For KVM, if QEMU/KVM is killed, all guest processes will be killed
> > too. So we relay SIGBUS from host OS to guest system via a UCR MCE
> > injection. Then guest OS can isolate corresponding memory and kill
> > necessary guest processes only. SIGBUS sent to main thread (not VCPU
> > threads) will be broadcast to all VCPU threads as UCR MCE.
> > 
> > v2:
> > 
> > - Use qemu_ram_addr_from_host instead of self made one to covert from
> >   host address to guest RAM address. Thanks Anthony Liguori.
> > 
> > Signed-off-by: Huang Ying <ying.huang@intel.com>
> > 
> > ---
> >  cpu-common.h      |    1 
> >  exec.c            |   20 +++++--
> >  qemu-kvm.c        |  154 ++++++++++++++++++++++++++++++++++++++++++++++++++----
> >  target-i386/cpu.h |   20 ++++++-
> >  4 files changed, 178 insertions(+), 17 deletions(-)
> > 
> > --- a/qemu-kvm.c
> > +++ b/qemu-kvm.c
> > @@ -27,10 +27,23 @@
> >  #include <sys/mman.h>
> >  #include <sys/ioctl.h>
> >  #include <signal.h>
> > +#include <sys/signalfd.h>
> > +#include <sys/prctl.h>
> >  
> >  #define false 0
> >  #define true 1
> >  
> > +#ifndef PR_MCE_KILL
> > +#define PR_MCE_KILL 33
> > +#endif
> > +
> > +#ifndef BUS_MCEERR_AR
> > +#define BUS_MCEERR_AR 4
> > +#endif
> > +#ifndef BUS_MCEERR_AO
> > +#define BUS_MCEERR_AO 5
> > +#endif
> > +
> >  #define EXPECTED_KVM_API_VERSION 12
> >  
> >  #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
> > @@ -1507,6 +1520,37 @@ static void sig_ipi_handler(int n)
> >  {
> >  }
> >  
> > +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
> > +{
> > +    if (siginfo->ssi_code == BUS_MCEERR_AO) {
> > +        uint64_t status;
> > +        unsigned long paddr;
> > +        CPUState *cenv;
> > +
> > +        /* Hope we are lucky for AO MCE */
> > +        if (do_qemu_ram_addr_from_host((void *)siginfo->ssi_addr, &paddr)) {
> > +            fprintf(stderr, "Hardware memory error for memory used by "
> > +                    "QEMU itself instead of guest system!: %llx\n",
> > +                    (unsigned long long)siginfo->ssi_addr);
> > +            return;
> 
> qemu-kvm should die here?

There are two kinds of UCR MCE. One is triggered by user space/guest
read/write, the other is triggered by asynchronously detected error
(e.g. patrol scrubbing). The latter one is reported as AO (Action
Optional) MCE, and it has nothing to do with current path. So if we are
lucky enough, we can survive. And when we finally touch the error memory
reported by AO MCE, another AR (Action Required) MCE will be triggered.
We have another chance to deal with it.

> > +        }
> > +        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> > +            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> > +            | 0xc0;
> > +        kvm_inject_x86_mce(first_cpu, 9, status,
> > +                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
> > +                           (MCM_ADDR_PHYS << 6) | 0xc);
> > +        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
> > +            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
> > +                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0);
> > +        return;
> 
> Should abort if kvm_inject_x86_mce fails?

kvm_inject_x86_mce will abort by itself.

> > +    } else if (siginfo->ssi_code == BUS_MCEERR_AR)
> > +        fprintf(stderr, "Hardware memory error!\n");
> > +    else
> > +        fprintf(stderr, "Internal error in QEMU!\n");
> 
> Can you re-raise SIGBUS so you we get a coredump on non-MCE SIGBUS as
> usual?

We discuss this before. Copied below, please comment the comments
below, :)

Avi:
(also, I if we can't handle guest-mode SIGBUS I think it would be nice 
to raise it again so the process terminates due to the SIGBUS).

Huang Ying:
For SIGBUS we can not relay to guest as MCE, we can either abort or
reset SIGBUS to SIGDFL and re-raise it. Both are OK for me. You prefer
the latter one?

Andi:
I think a suitable error message and exit would be better than a plain 
signal kill. It shouldn't look like qemu crashed due to a software
bug. Ideally a error message in a way that it can be parsed by libvirt
etc. and reported in a suitable way.

However qemu getting killed itself is very unlikely, it doesn't
have much memory foot print compared to the guest and other data. 
So this should be a very rare condition.

Avi:
libvirt etc. can/should wait() for qemu to terminate abnormally and 
report the reason why.  However it doesn't seem there is a way to get 
extended signal information from wait(), so it looks like internal 
handling by qemu is better.

> > +    exit(1);
> > +}
> > +
> >  static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
> >  {
> >      struct qemu_work_item wi;
> > @@ -1649,29 +1693,102 @@ static void flush_queued_work(CPUState *
> >      pthread_cond_broadcast(&qemu_work_cond);
> >  }
> >  
> > +static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
> > +{
> > +#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
> > +    struct kvm_x86_mce mce = {
> > +            .bank = 9,
> > +    };
> > +    unsigned long paddr;
> > +    int r;
> > +
> > +    if (env->mcg_cap && siginfo->si_addr
> > +        && (siginfo->si_code == BUS_MCEERR_AR
> > +            || siginfo->si_code == BUS_MCEERR_AO)) {
> > +        if (siginfo->si_code == BUS_MCEERR_AR) {
> > +            /* Fake an Intel architectural Data Load SRAR UCR */
> > +            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> > +                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> > +                | MCI_STATUS_AR | 0x134;
> > +            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
> > +            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
> > +        } else {
> > +            /* Fake an Intel architectural Memory scrubbing UCR */
> > +            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> > +                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> > +                | 0xc0;
> > +            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
> > +            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
> > +        }
> > +        if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
> > +            fprintf(stderr, "Hardware memory error for memory used by "
> > +                    "QEMU itself instaed of guest system!\n");
> > +            /* Hope we are lucky for AO MCE */
> > +            if (siginfo->si_code == BUS_MCEERR_AO)
> > +                return;
> 
> Should die?

Please refer to my description about AO and AR above.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-17  1:13   ` Huang Ying
@ 2009-09-17 21:36     ` Marcelo Tosatti
  2009-09-18  3:01       ` Huang Ying
  0 siblings, 1 reply; 15+ messages in thread
From: Marcelo Tosatti @ 2009-09-17 21:36 UTC (permalink / raw)
  To: Huang Ying; +Cc: Avi Kivity, Andi Kleen, Anthony Liguori, kvm

On Thu, Sep 17, 2009 at 09:13:29AM +0800, Huang Ying wrote:
> On Thu, 2009-09-17 at 01:59 +0800, Marcelo Tosatti wrote: 
> > On Wed, Sep 09, 2009 at 10:28:02AM +0800, Huang Ying wrote:
> > > UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
> > > where some hardware error such as some memory error can be reported
> > > without PCC (processor context corrupted). To recover from such MCE,
> > > the corresponding memory will be unmapped, and all processes accessing
> > > the memory will be killed via SIGBUS.
> > > 
> > > For KVM, if QEMU/KVM is killed, all guest processes will be killed
> > > too. So we relay SIGBUS from host OS to guest system via a UCR MCE
> > > injection. Then guest OS can isolate corresponding memory and kill
> > > necessary guest processes only. SIGBUS sent to main thread (not VCPU
> > > threads) will be broadcast to all VCPU threads as UCR MCE.
> > > 
> > > v2:
> > > 
> > > - Use qemu_ram_addr_from_host instead of self made one to covert from
> > >   host address to guest RAM address. Thanks Anthony Liguori.
> > > 
> > > Signed-off-by: Huang Ying <ying.huang@intel.com>
> > > 
> > > ---
> > >  cpu-common.h      |    1 
> > >  exec.c            |   20 +++++--
> > >  qemu-kvm.c        |  154 ++++++++++++++++++++++++++++++++++++++++++++++++++----
> > >  target-i386/cpu.h |   20 ++++++-
> > >  4 files changed, 178 insertions(+), 17 deletions(-)
> > > 
> > > --- a/qemu-kvm.c
> > > +++ b/qemu-kvm.c
> > > @@ -27,10 +27,23 @@
> > >  #include <sys/mman.h>
> > >  #include <sys/ioctl.h>
> > >  #include <signal.h>
> > > +#include <sys/signalfd.h>
> > > +#include <sys/prctl.h>
> > >  
> > >  #define false 0
> > >  #define true 1
> > >  
> > > +#ifndef PR_MCE_KILL
> > > +#define PR_MCE_KILL 33
> > > +#endif
> > > +
> > > +#ifndef BUS_MCEERR_AR
> > > +#define BUS_MCEERR_AR 4
> > > +#endif
> > > +#ifndef BUS_MCEERR_AO
> > > +#define BUS_MCEERR_AO 5
> > > +#endif
> > > +
> > >  #define EXPECTED_KVM_API_VERSION 12
> > >  
> > >  #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
> > > @@ -1507,6 +1520,37 @@ static void sig_ipi_handler(int n)
> > >  {
> > >  }
> > >  
> > > +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
> > > +{
> > > +    if (siginfo->ssi_code == BUS_MCEERR_AO) {
> > > +        uint64_t status;
> > > +        unsigned long paddr;
> > > +        CPUState *cenv;
> > > +
> > > +        /* Hope we are lucky for AO MCE */
> > > +        if (do_qemu_ram_addr_from_host((void *)siginfo->ssi_addr, &paddr)) {
> > > +            fprintf(stderr, "Hardware memory error for memory used by "
> > > +                    "QEMU itself instead of guest system!: %llx\n",
> > > +                    (unsigned long long)siginfo->ssi_addr);
> > > +            return;
> > 
> > qemu-kvm should die here?
> 
> There are two kinds of UCR MCE. One is triggered by user space/guest
> read/write, the other is triggered by asynchronously detected error
> (e.g. patrol scrubbing). The latter one is reported as AO (Action
> Optional) MCE, and it has nothing to do with current path. So if we are
> lucky enough, we can survive. And when we finally touch the error memory
> reported by AO MCE, another AR (Action Required) MCE will be triggered.
> We have another chance to deal with it.

OK.

> 
> > > +        }
> > > +        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> > > +            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> > > +            | 0xc0;
> > > +        kvm_inject_x86_mce(first_cpu, 9, status,
> > > +                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
> > > +                           (MCM_ADDR_PHYS << 6) | 0xc);
> > > +        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
> > > +            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
> > > +                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0);
> > > +        return;
> > 
> > Should abort if kvm_inject_x86_mce fails?
> 
> kvm_inject_x86_mce will abort by itself.

OK.

> 
> > > +    } else if (siginfo->ssi_code == BUS_MCEERR_AR)
> > > +        fprintf(stderr, "Hardware memory error!\n");
> > > +    else
> > > +        fprintf(stderr, "Internal error in QEMU!\n");
> > 
> > Can you re-raise SIGBUS so you we get a coredump on non-MCE SIGBUS as
> > usual?
> 
> We discuss this before. Copied below, please comment the comments
> below, :)
> 
> Avi:
> (also, I if we can't handle guest-mode SIGBUS I think it would be nice 
> to raise it again so the process terminates due to the SIGBUS).
> 
> Huang Ying:
> For SIGBUS we can not relay to guest as MCE, we can either abort or
> reset SIGBUS to SIGDFL and re-raise it. Both are OK for me. You prefer
> the latter one?
> 
> Andi:
> I think a suitable error message and exit would be better than a plain 
> signal kill. It shouldn't look like qemu crashed due to a software
> bug. Ideally a error message in a way that it can be parsed by libvirt
> etc. and reported in a suitable way.
> 
> However qemu getting killed itself is very unlikely, it doesn't
> have much memory foot print compared to the guest and other data. 
> So this should be a very rare condition.
> 
> Avi:
> libvirt etc. can/should wait() for qemu to terminate abnormally and 
> report the reason why.  However it doesn't seem there is a way to get 
> extended signal information from wait(), so it looks like internal 
> handling by qemu is better.

I'm not talking about SIGBUS generated by MCE.

What i mean is, for SIGBUS signals that are not due to MCE errors, the
current behaviour is to generate a core dump (which is useful
information for debugging). 

With your patch, qemu-kvm handles the signal, prints a message before
exiting.

This is annoying. It seems the discussion above is about SIGBUS
initiated by MCE errors.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest
  2009-09-17 21:36     ` Marcelo Tosatti
@ 2009-09-18  3:01       ` Huang Ying
  0 siblings, 0 replies; 15+ messages in thread
From: Huang Ying @ 2009-09-18  3:01 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, Andi Kleen, Anthony Liguori, kvm

On Fri, 2009-09-18 at 05:36 +0800, Marcelo Tosatti wrote: 
> > > > +    } else if (siginfo->ssi_code == BUS_MCEERR_AR)
> > > > +        fprintf(stderr, "Hardware memory error!\n");
> > > > +    else
> > > > +        fprintf(stderr, "Internal error in QEMU!\n");
> > > 
> > > Can you re-raise SIGBUS so you we get a coredump on non-MCE SIGBUS as
> > > usual?
> > 
> > We discuss this before. Copied below, please comment the comments
> > below, :)
> > 
> > Avi:
> > (also, I if we can't handle guest-mode SIGBUS I think it would be nice 
> > to raise it again so the process terminates due to the SIGBUS).
> > 
> > Huang Ying:
> > For SIGBUS we can not relay to guest as MCE, we can either abort or
> > reset SIGBUS to SIGDFL and re-raise it. Both are OK for me. You prefer
> > the latter one?
> > 
> > Andi:
> > I think a suitable error message and exit would be better than a plain 
> > signal kill. It shouldn't look like qemu crashed due to a software
> > bug. Ideally a error message in a way that it can be parsed by libvirt
> > etc. and reported in a suitable way.
> > 
> > However qemu getting killed itself is very unlikely, it doesn't
> > have much memory foot print compared to the guest and other data. 
> > So this should be a very rare condition.
> > 
> > Avi:
> > libvirt etc. can/should wait() for qemu to terminate abnormally and 
> > report the reason why.  However it doesn't seem there is a way to get 
> > extended signal information from wait(), so it looks like internal 
> > handling by qemu is better.
> 
> I'm not talking about SIGBUS generated by MCE.
> 
> What i mean is, for SIGBUS signals that are not due to MCE errors, the
> current behaviour is to generate a core dump (which is useful
> information for debugging). 
> 
> With your patch, qemu-kvm handles the signal, prints a message before
> exiting.
> 
> This is annoying. It seems the discussion above is about SIGBUS
> initiated by MCE errors.

OK. I will re-raise for SIGBUS not initiated by MCE.

Best Regards,
Huang Ying


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2009-09-18  3:01 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-09-09  2:28 [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest Huang Ying
2009-09-09 12:06 ` Avi Kivity
2009-09-09 12:16   ` Avi Kivity
2009-09-09 12:18     ` Avi Kivity
2009-09-10  2:40   ` Huang Ying
2009-09-10  9:35     ` Andi Kleen
2009-09-14  2:55       ` Huang Ying
2009-09-14  5:10         ` Avi Kivity
2009-09-16  1:09           ` Huang Ying
2009-09-16  8:10             ` Avi Kivity
2009-09-14  5:10       ` Avi Kivity
2009-09-16 17:59 ` Marcelo Tosatti
2009-09-17  1:13   ` Huang Ying
2009-09-17 21:36     ` Marcelo Tosatti
2009-09-18  3:01       ` Huang Ying

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.