[patch uq/master 0/8] port qemu-kvm's MCE support

All of lore.kernel.org
 help / color / mirror / Atom feed

* [patch uq/master 0/8] port qemu-kvm's MCE support
@ 2010-10-04 18:54 ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson




^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 0/8] port qemu-kvm's MCE support
@ 2010-10-04 18:54 ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Huang Ying



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 1/8] signalfd compatibility
  2010-10-04 18:54 ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-04 18:54   ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: signalfd --]
[-- Type: text/plain, Size: 5970 bytes --]

Port qemu-kvm's signalfd compat code.

commit 5a7fdd0abd7cd24dac205317a4195446ab8748b5
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Wed May 7 11:55:47 2008 -0500

    Use signalfd() in io-thread
    
    This patch reworks the IO thread to use signalfd() instead of sigtimedwait().
    This will eliminate the need to use SIGIO everywhere.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/compatfd.c
===================================================================
--- /dev/null
+++ qemu/compatfd.c
@@ -0,0 +1,155 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "compatfd.h"
+
+#include <sys/syscall.h>
+#include <pthread.h>
+
+/* Arguments for the emulation thread, which owns and frees them. */
+struct sigfd_compat_info
+{
+    sigset_t mask;  /* signals to wait for */
+    int fd;         /* write end of the notification pipe */
+};
+
+/*
+ * Emulation thread: wait for any signal in info->mask and write one
+ * zero-padded qemu_signalfd_siginfo record per signal to info->fd.  Only
+ * ssi_signo, ssi_errno and ssi_code are filled in.  The thread exits when
+ * sigwaitinfo() or write() fails for a reason other than EINTR.
+ */
+static void *sigwait_compat(void *opaque)
+{
+    struct sigfd_compat_info *info = opaque;
+    int err;
+    sigset_t all;
+
+    /* Thread-local mask: sigprocmask() is unspecified with threads. */
+    sigfillset(&all);
+    pthread_sigmask(SIG_BLOCK, &all, NULL);
+
+    do {
+        siginfo_t siginfo;
+
+        err = sigwaitinfo(&info->mask, &siginfo);
+        if (err == -1 && errno == EINTR) {
+            err = 0;
+            continue;
+        }
+
+        if (err > 0) {
+            struct qemu_signalfd_siginfo buffer;
+            size_t offset = 0;
+
+            /* Zero the record so no stale stack bytes reach the reader. */
+            memset(&buffer, 0, sizeof(buffer));
+            buffer.ssi_signo = err;
+            buffer.ssi_errno = siginfo.si_errno;
+            buffer.ssi_code = siginfo.si_code;
+
+            while (offset < sizeof(buffer)) {
+                ssize_t len;
+
+                len = write(info->fd, (char *)&buffer + offset,
+                            sizeof(buffer) - offset);
+                if (len == -1 && errno == EINTR)
+                    continue;
+
+                if (len <= 0) {
+                    err = -1;
+                    break;
+                }
+
+                offset += len;
+            }
+        }
+    } while (err >= 0);
+
+    /* The pipe stays open so the reader never sees an unexpected EOF. */
+    free(info);
+    return NULL;
+}
+
+/*
+ * Fallback for hosts without signalfd(): create a pipe and a detached
+ * thread that forwards the signals in @mask to it.
+ *
+ * Returns the read end of the pipe, or -1 with errno set on failure.
+ */
+static int qemu_signalfd_compat(const sigset_t *mask)
+{
+    pthread_attr_t attr;
+    pthread_t tid;
+    struct sigfd_compat_info *info;
+    int fds[2];
+    int ret;
+
+    info = malloc(sizeof(*info));
+    if (info == NULL) {
+        errno = ENOMEM;
+        return -1;
+    }
+
+    if (pipe(fds) == -1) {
+        free(info);
+        return -1;
+    }
+
+    qemu_set_cloexec(fds[0]);
+    qemu_set_cloexec(fds[1]);
+
+    info->mask = *mask;
+    info->fd = fds[1];
+
+    /* pthread_*() return an error number instead of setting errno. */
+    ret = pthread_attr_init(&attr);
+    if (ret == 0) {
+        pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+        ret = pthread_create(&tid, &attr, sigwait_compat, info);
+        pthread_attr_destroy(&attr);
+    }
+
+    if (ret != 0) {
+        /* No thread took ownership of info or the pipe: undo everything. */
+        close(fds[0]);
+        close(fds[1]);
+        free(info);
+        errno = ret;
+        return -1;
+    }
+
+    return fds[0];
+}
+
+/*
+ * Return a descriptor from which the signals in @mask can be read as
+ * qemu_signalfd_siginfo records, or -1 on failure.  Tries the signalfd
+ * system call when CONFIG_SIGNALFD is set and otherwise, or if the call
+ * fails, falls back to the pipe/thread emulation.
+ */
+int qemu_signalfd(const sigset_t *mask)
+{
+#if defined(CONFIG_SIGNALFD)
+    int ret;
+
+    ret = syscall(SYS_signalfd, -1, mask, _NSIG / 8);
+    if (ret != -1) {
+        qemu_set_cloexec(ret);
+        return ret;
+    }
+#endif
+
+    return qemu_signalfd_compat(mask);
+}
Index: qemu/compatfd.h
===================================================================
--- /dev/null
+++ qemu/compatfd.h
@@ -0,0 +1,50 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_COMPATFD_H
+#define QEMU_COMPATFD_H
+
+#include <signal.h>
+#include <stdint.h>
+
+/* Record read from a qemu_signalfd() descriptor; padded to 128 bytes. */
+struct qemu_signalfd_siginfo {
+    uint32_t ssi_signo;   /* Signal number */
+    int32_t  ssi_errno;   /* Error number (unused) */
+    int32_t  ssi_code;    /* Signal code */
+    uint32_t ssi_pid;     /* PID of sender */
+    uint32_t ssi_uid;     /* Real UID of sender */
+    int32_t  ssi_fd;      /* File descriptor (SIGIO) */
+    uint32_t ssi_tid;     /* Kernel timer ID (POSIX timers) */
+    uint32_t ssi_band;    /* Band event (SIGIO) */
+    uint32_t ssi_overrun; /* POSIX timer overrun count */
+    uint32_t ssi_trapno;  /* Trap number that caused signal */
+    int32_t  ssi_status;  /* Exit status or signal (SIGCHLD) */
+    int32_t  ssi_int;     /* Integer sent by sigqueue(2) */
+    uint64_t ssi_ptr;     /* Pointer sent by sigqueue(2) */
+    uint64_t ssi_utime;   /* User CPU time consumed (SIGCHLD) */
+    uint64_t ssi_stime;   /* System CPU time consumed (SIGCHLD) */
+    uint64_t ssi_addr;    /* Address that generated signal
+                             (for hardware-generated signals) */
+    uint8_t  pad[48];     /* Pad size to 128 bytes (allow for
+                             additional fields in the future) */
+};
+
+/*
+ * Create a descriptor from which the signals in @mask can be read, one
+ * struct qemu_signalfd_siginfo per signal.
+ * Returns the descriptor, or -1 on failure.
+ */
+int qemu_signalfd(const sigset_t *mask);
+
+#endif
Index: qemu/Makefile.objs
===================================================================
--- qemu.orig/Makefile.objs
+++ qemu/Makefile.objs
@@ -121,6 +121,7 @@ common-obj-y += $(addprefix ui/, $(ui-ob
 
 common-obj-y += iov.o acl.o
 common-obj-$(CONFIG_THREAD) += qemu-thread.o
+common-obj-$(CONFIG_IOTHREAD) += compatfd.o
 common-obj-y += notify.o event_notifier.o
 common-obj-y += qemu-timer.o
 
Index: qemu/configure
===================================================================
--- qemu.orig/configure
+++ qemu/configure
@@ -1936,6 +1936,21 @@ if compile_prog "" "" ; then
   splice=yes
 fi
 
+##########################################
+# signalfd probe
+signalfd="no"
+cat > $TMPC << EOF
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <signal.h>
+int main(void) { return syscall(SYS_signalfd, -1, NULL, _NSIG / 8); }
+EOF
+
+if compile_prog "" "" ; then
+  signalfd=yes
+fi
+
 # check if eventfd is supported
 eventfd=no
 cat > $TMPC << EOF
@@ -2509,6 +2524,9 @@ fi
 if test "$fdt" = "yes" ; then
   echo "CONFIG_FDT=y" >> $config_host_mak
 fi
+if test "$signalfd" = "yes" ; then
+  echo "CONFIG_SIGNALFD=y" >> $config_host_mak
+fi
 if test "$need_offsetof" = "yes" ; then
   echo "CONFIG_NEED_OFFSETOF=y" >> $config_host_mak
 fi



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 1/8] signalfd compatibility
@ 2010-10-04 18:54   ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: signalfd --]
[-- Type: text/plain, Size: 5968 bytes --]

Port qemu-kvm's signalfd compat code.

commit 5a7fdd0abd7cd24dac205317a4195446ab8748b5
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Wed May 7 11:55:47 2008 -0500

    Use signalfd() in io-thread
    
    This patch reworks the IO thread to use signalfd() instead of sigtimedwait().
    This will eliminate the need to use SIGIO everywhere.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/compatfd.c
===================================================================
--- /dev/null
+++ qemu/compatfd.c
@@ -0,0 +1,155 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "compatfd.h"
+
+#include <sys/syscall.h>
+#include <pthread.h>
+
+/* Arguments for the emulation thread, which owns and frees them. */
+struct sigfd_compat_info
+{
+    sigset_t mask;  /* signals to wait for */
+    int fd;         /* write end of the notification pipe */
+};
+
+/*
+ * Emulation thread: wait for any signal in info->mask and write one
+ * zero-padded qemu_signalfd_siginfo record per signal to info->fd.  Only
+ * ssi_signo, ssi_errno and ssi_code are filled in.  The thread exits when
+ * sigwaitinfo() or write() fails for a reason other than EINTR.
+ */
+static void *sigwait_compat(void *opaque)
+{
+    struct sigfd_compat_info *info = opaque;
+    int err;
+    sigset_t all;
+
+    /* Thread-local mask: sigprocmask() is unspecified with threads. */
+    sigfillset(&all);
+    pthread_sigmask(SIG_BLOCK, &all, NULL);
+
+    do {
+        siginfo_t siginfo;
+
+        err = sigwaitinfo(&info->mask, &siginfo);
+        if (err == -1 && errno == EINTR) {
+            err = 0;
+            continue;
+        }
+
+        if (err > 0) {
+            struct qemu_signalfd_siginfo buffer;
+            size_t offset = 0;
+
+            /* Zero the record so no stale stack bytes reach the reader. */
+            memset(&buffer, 0, sizeof(buffer));
+            buffer.ssi_signo = err;
+            buffer.ssi_errno = siginfo.si_errno;
+            buffer.ssi_code = siginfo.si_code;
+
+            while (offset < sizeof(buffer)) {
+                ssize_t len;
+
+                len = write(info->fd, (char *)&buffer + offset,
+                            sizeof(buffer) - offset);
+                if (len == -1 && errno == EINTR)
+                    continue;
+
+                if (len <= 0) {
+                    err = -1;
+                    break;
+                }
+
+                offset += len;
+            }
+        }
+    } while (err >= 0);
+
+    /* The pipe stays open so the reader never sees an unexpected EOF. */
+    free(info);
+    return NULL;
+}
+
+/*
+ * Fallback for hosts without signalfd(): create a pipe and a detached
+ * thread that forwards the signals in @mask to it.
+ *
+ * Returns the read end of the pipe, or -1 with errno set on failure.
+ */
+static int qemu_signalfd_compat(const sigset_t *mask)
+{
+    pthread_attr_t attr;
+    pthread_t tid;
+    struct sigfd_compat_info *info;
+    int fds[2];
+    int ret;
+
+    info = malloc(sizeof(*info));
+    if (info == NULL) {
+        errno = ENOMEM;
+        return -1;
+    }
+
+    if (pipe(fds) == -1) {
+        free(info);
+        return -1;
+    }
+
+    qemu_set_cloexec(fds[0]);
+    qemu_set_cloexec(fds[1]);
+
+    info->mask = *mask;
+    info->fd = fds[1];
+
+    /* pthread_*() return an error number instead of setting errno. */
+    ret = pthread_attr_init(&attr);
+    if (ret == 0) {
+        pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+        ret = pthread_create(&tid, &attr, sigwait_compat, info);
+        pthread_attr_destroy(&attr);
+    }
+
+    if (ret != 0) {
+        /* No thread took ownership of info or the pipe: undo everything. */
+        close(fds[0]);
+        close(fds[1]);
+        free(info);
+        errno = ret;
+        return -1;
+    }
+
+    return fds[0];
+}
+
+/*
+ * Return a descriptor from which the signals in @mask can be read as
+ * qemu_signalfd_siginfo records, or -1 on failure.  Tries the signalfd
+ * system call when CONFIG_SIGNALFD is set and otherwise, or if the call
+ * fails, falls back to the pipe/thread emulation.
+ */
+int qemu_signalfd(const sigset_t *mask)
+{
+#if defined(CONFIG_SIGNALFD)
+    int ret;
+
+    ret = syscall(SYS_signalfd, -1, mask, _NSIG / 8);
+    if (ret != -1) {
+        qemu_set_cloexec(ret);
+        return ret;
+    }
+#endif
+
+    return qemu_signalfd_compat(mask);
+}
Index: qemu/compatfd.h
===================================================================
--- /dev/null
+++ qemu/compatfd.h
@@ -0,0 +1,50 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_COMPATFD_H
+#define QEMU_COMPATFD_H
+
+#include <signal.h>
+#include <stdint.h>
+
+/* Record read from a qemu_signalfd() descriptor; padded to 128 bytes. */
+struct qemu_signalfd_siginfo {
+    uint32_t ssi_signo;   /* Signal number */
+    int32_t  ssi_errno;   /* Error number (unused) */
+    int32_t  ssi_code;    /* Signal code */
+    uint32_t ssi_pid;     /* PID of sender */
+    uint32_t ssi_uid;     /* Real UID of sender */
+    int32_t  ssi_fd;      /* File descriptor (SIGIO) */
+    uint32_t ssi_tid;     /* Kernel timer ID (POSIX timers) */
+    uint32_t ssi_band;    /* Band event (SIGIO) */
+    uint32_t ssi_overrun; /* POSIX timer overrun count */
+    uint32_t ssi_trapno;  /* Trap number that caused signal */
+    int32_t  ssi_status;  /* Exit status or signal (SIGCHLD) */
+    int32_t  ssi_int;     /* Integer sent by sigqueue(2) */
+    uint64_t ssi_ptr;     /* Pointer sent by sigqueue(2) */
+    uint64_t ssi_utime;   /* User CPU time consumed (SIGCHLD) */
+    uint64_t ssi_stime;   /* System CPU time consumed (SIGCHLD) */
+    uint64_t ssi_addr;    /* Address that generated signal
+                             (for hardware-generated signals) */
+    uint8_t  pad[48];     /* Pad size to 128 bytes (allow for
+                             additional fields in the future) */
+};
+
+/*
+ * Create a descriptor from which the signals in @mask can be read, one
+ * struct qemu_signalfd_siginfo per signal.
+ * Returns the descriptor, or -1 on failure.
+ */
+int qemu_signalfd(const sigset_t *mask);
+
+#endif
Index: qemu/Makefile.objs
===================================================================
--- qemu.orig/Makefile.objs
+++ qemu/Makefile.objs
@@ -121,6 +121,7 @@ common-obj-y += $(addprefix ui/, $(ui-ob
 
 common-obj-y += iov.o acl.o
 common-obj-$(CONFIG_THREAD) += qemu-thread.o
+common-obj-$(CONFIG_IOTHREAD) += compatfd.o
 common-obj-y += notify.o event_notifier.o
 common-obj-y += qemu-timer.o
 
Index: qemu/configure
===================================================================
--- qemu.orig/configure
+++ qemu/configure
@@ -1936,6 +1936,21 @@ if compile_prog "" "" ; then
   splice=yes
 fi
 
+##########################################
+# signalfd probe
+signalfd="no"
+cat > $TMPC << EOF
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <signal.h>
+int main(void) { return syscall(SYS_signalfd, -1, NULL, _NSIG / 8); }
+EOF
+
+if compile_prog "" "" ; then
+  signalfd=yes
+fi
+
 # check if eventfd is supported
 eventfd=no
 cat > $TMPC << EOF
@@ -2509,6 +2524,9 @@ fi
 if test "$fdt" = "yes" ; then
   echo "CONFIG_FDT=y" >> $config_host_mak
 fi
+if test "$signalfd" = "yes" ; then
+  echo "CONFIG_SIGNALFD=y" >> $config_host_mak
+fi
 if test "$need_offsetof" = "yes" ; then
   echo "CONFIG_NEED_OFFSETOF=y" >> $config_host_mak
 fi

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 2/8] iothread: use signalfd
  2010-10-04 18:54 ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-04 18:54   ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: block-io-signals-in-iothread --]
[-- Type: text/plain, Size: 3272 bytes --]

Block SIGALRM, SIGIO and consume them via signalfd.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -33,6 +33,7 @@
 #include "exec-all.h"
 
 #include "cpus.h"
+#include "compatfd.h"
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -329,14 +330,75 @@ static QemuCond qemu_work_cond;
 
 static void tcg_init_ipi(void);
 static void kvm_init_ipi(CPUState *env);
-static void unblock_io_signals(void);
+static sigset_t block_io_signals(void);
+
+/* Drain the signalfd given in @opaque and call the currently installed
+ * handler for each signal read.  NOTE(review): SA_SIGINFO handlers get the
+ * signalfd record cast to siginfo_t; the layouts differ -- confirm users.
+ */
+static void sigfd_handler(void *opaque)
+{
+    int fd = (unsigned long) opaque;
+    struct qemu_signalfd_siginfo info;
+    struct sigaction action;
+    ssize_t len;
+
+    while (1) {
+        do {
+            len = read(fd, &info, sizeof(info));
+        } while (len == -1 && errno == EINTR);
+
+        if (len == -1 && errno == EAGAIN) {
+            break;                  /* nothing left to read */
+        }
+
+        if (len != sizeof(info)) {  /* error, EOF or short record */
+            fprintf(stderr, "read from sigfd returned %zd: %m\n", len);
+            return;
+        }
+
+        sigaction(info.ssi_signo, NULL, &action);
+        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
+            action.sa_sigaction(info.ssi_signo, (siginfo_t *)&info, NULL);
+        } else if (action.sa_handler != SIG_DFL &&
+                   action.sa_handler != SIG_IGN) { /* sentinels, not code */
+            action.sa_handler(info.ssi_signo);
+        }
+    }
+}
+
+/* Route the signals in @mask to sigfd_handler(); 0 or -errno on failure. */
+static int qemu_signalfd_init(sigset_t mask)
+{
+    int sigfd = qemu_signalfd(&mask);
+
+    if (sigfd == -1) {
+        int err = errno;    /* fprintf() below may overwrite errno */
+
+        fprintf(stderr, "failed to create signalfd\n");
+        return -err;
+    }
+
+    fcntl_setfl(sigfd, O_NONBLOCK);  /* sigfd_handler() reads until EAGAIN */
+    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
+                         (void *)(unsigned long) sigfd);
+    return 0;
+}
 
 int qemu_init_main_loop(void)
 {
     int ret;
+    sigset_t blocked_signals;
 
     cpu_set_debug_excp_handler(cpu_debug_handler);
 
+    blocked_signals = block_io_signals();
+
+    ret = qemu_signalfd_init(blocked_signals);
+    if (ret)
+        return ret;
+
+    /* Note eventfd must be drained before signalfd handlers run */
     ret = qemu_event_init();
     if (ret)
         return ret;
@@ -347,7 +409,6 @@ int qemu_init_main_loop(void)
     qemu_mutex_init(&qemu_global_mutex);
     qemu_mutex_lock(&qemu_global_mutex);
 
-    unblock_io_signals();
     qemu_thread_self(&io_thread);
 
     return 0;
@@ -586,19 +647,22 @@ static void kvm_init_ipi(CPUState *env)
     }
 }
 
-static void unblock_io_signals(void)
+static sigset_t block_io_signals(void)  /* returns the set it blocked */
 {
     sigset_t set;
 
+    /* SIGUSR2 (used by posix-aio-compat.c) is unblocked in this thread */
     sigemptyset(&set);
     sigaddset(&set, SIGUSR2);
-    sigaddset(&set, SIGIO);
-    sigaddset(&set, SIGALRM);
     pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 
     sigemptyset(&set);
+    sigaddset(&set, SIGIO);    /* now blocked instead of unblocked */
+    sigaddset(&set, SIGALRM);  /* now blocked instead of unblocked */
     sigaddset(&set, SIG_IPI);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+    return set;    /* SIGIO, SIGALRM and SIG_IPI */
 }
 
 void qemu_mutex_lock_iothread(void)



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 2/8] iothread: use signalfd
@ 2010-10-04 18:54   ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: block-io-signals-in-iothread --]
[-- Type: text/plain, Size: 3270 bytes --]

Block SIGALRM, SIGIO and consume them via signalfd.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -33,6 +33,7 @@
 #include "exec-all.h"
 
 #include "cpus.h"
+#include "compatfd.h"
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -329,14 +330,75 @@ static QemuCond qemu_work_cond;
 
 static void tcg_init_ipi(void);
 static void kvm_init_ipi(CPUState *env);
-static void unblock_io_signals(void);
+static sigset_t block_io_signals(void);
+
+/* Drain the signalfd given in @opaque and call the currently installed
+ * handler for each signal read.  NOTE(review): SA_SIGINFO handlers get the
+ * signalfd record cast to siginfo_t; the layouts differ -- confirm users.
+ */
+static void sigfd_handler(void *opaque)
+{
+    int fd = (unsigned long) opaque;
+    struct qemu_signalfd_siginfo info;
+    struct sigaction action;
+    ssize_t len;
+
+    while (1) {
+        do {
+            len = read(fd, &info, sizeof(info));
+        } while (len == -1 && errno == EINTR);
+
+        if (len == -1 && errno == EAGAIN) {
+            break;                  /* nothing left to read */
+        }
+
+        if (len != sizeof(info)) {  /* error, EOF or short record */
+            fprintf(stderr, "read from sigfd returned %zd: %m\n", len);
+            return;
+        }
+
+        sigaction(info.ssi_signo, NULL, &action);
+        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
+            action.sa_sigaction(info.ssi_signo, (siginfo_t *)&info, NULL);
+        } else if (action.sa_handler != SIG_DFL &&
+                   action.sa_handler != SIG_IGN) { /* sentinels, not code */
+            action.sa_handler(info.ssi_signo);
+        }
+    }
+}
+
+/* Route the signals in @mask to sigfd_handler(); 0 or -errno on failure. */
+static int qemu_signalfd_init(sigset_t mask)
+{
+    int sigfd = qemu_signalfd(&mask);
+
+    if (sigfd == -1) {
+        int err = errno;    /* fprintf() below may overwrite errno */
+
+        fprintf(stderr, "failed to create signalfd\n");
+        return -err;
+    }
+
+    fcntl_setfl(sigfd, O_NONBLOCK);  /* sigfd_handler() reads until EAGAIN */
+    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
+                         (void *)(unsigned long) sigfd);
+    return 0;
+}
 
 int qemu_init_main_loop(void)
 {
     int ret;
+    sigset_t blocked_signals;
 
     cpu_set_debug_excp_handler(cpu_debug_handler);
 
+    blocked_signals = block_io_signals();
+
+    ret = qemu_signalfd_init(blocked_signals);
+    if (ret)
+        return ret;
+
+    /* Note eventfd must be drained before signalfd handlers run */
     ret = qemu_event_init();
     if (ret)
         return ret;
@@ -347,7 +409,6 @@ int qemu_init_main_loop(void)
     qemu_mutex_init(&qemu_global_mutex);
     qemu_mutex_lock(&qemu_global_mutex);
 
-    unblock_io_signals();
     qemu_thread_self(&io_thread);
 
     return 0;
@@ -586,19 +647,22 @@ static void kvm_init_ipi(CPUState *env)
     }
 }
 
-static void unblock_io_signals(void)
+static sigset_t block_io_signals(void)  /* returns the set it blocked */
 {
     sigset_t set;
 
+    /* SIGUSR2 (used by posix-aio-compat.c) is unblocked in this thread */
     sigemptyset(&set);
     sigaddset(&set, SIGUSR2);
-    sigaddset(&set, SIGIO);
-    sigaddset(&set, SIGALRM);
     pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 
     sigemptyset(&set);
+    sigaddset(&set, SIGIO);    /* now blocked instead of unblocked */
+    sigaddset(&set, SIGALRM);  /* now blocked instead of unblocked */
     sigaddset(&set, SIG_IPI);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+    return set;    /* SIGIO, SIGALRM and SIG_IPI */
 }
 
 void qemu_mutex_lock_iothread(void)

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 3/8] Expose thread_id in info cpus
  2010-10-04 18:54 ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-04 18:54   ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: thread-id --]
[-- Type: text/plain, Size: 3812 bytes --]

commit ce6325ff1af34dbaee91c8d28e792277e43f1227
Author: Glauber Costa <gcosta@redhat.com>
Date:   Wed Mar 5 17:01:10 2008 -0300

    Augment info cpus
    
    This patch exposes the thread id associated with each
    cpu through the already well known 'info cpus' interface.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpu-defs.h
===================================================================
--- qemu.orig/cpu-defs.h
+++ qemu/cpu-defs.h
@@ -197,6 +197,7 @@ typedef struct CPUWatchpoint {
     int nr_cores;  /* number of cores within this CPU package */        \
     int nr_threads;/* number of threads within this CPU */              \
     int running; /* Nonzero if cpu is currently running(usermode).  */  \
+    int thread_id;                                                      \
     /* user data */                                                     \
     void *opaque;                                                       \
                                                                         \
Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -539,6 +539,7 @@ static void *kvm_cpu_thread_fn(void *arg
 
     qemu_mutex_lock(&qemu_global_mutex);
     qemu_thread_self(env->thread);
+    env->thread_id = get_thread_id();
     if (kvm_enabled())
         kvm_init_vcpu(env);
 
@@ -578,6 +579,10 @@ static void *tcg_cpu_thread_fn(void *arg
     while (!qemu_system_ready)
         qemu_cond_timedwait(&qemu_system_cond, &qemu_global_mutex, 100);
 
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        env->thread_id = get_thread_id();
+    }
+
     while (1) {
         cpu_exec_all();
         qemu_tcg_wait_io_event();
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -637,6 +637,7 @@ void cpu_exec_init(CPUState *env)
     env->numa_node = 0;
     QTAILQ_INIT(&env->breakpoints);
     QTAILQ_INIT(&env->watchpoints);
+    env->thread_id = get_thread_id();
     *penv = env;
 #if defined(CONFIG_USER_ONLY)
     cpu_list_unlock();
Index: qemu/osdep.c
===================================================================
--- qemu.orig/osdep.c
+++ qemu/osdep.c
@@ -44,6 +44,10 @@
 extern int madvise(caddr_t, size_t, int);
 #endif
 
+#ifdef CONFIG_LINUX
+#include <sys/syscall.h>
+#endif
+
 #ifdef CONFIG_EVENTFD
 #include <sys/eventfd.h>
 #endif
@@ -200,6 +204,17 @@ int qemu_create_pidfile(const char *file
     return 0;
 }
 
+int get_thread_id(void)    /* OS-level identifier of the calling thread */
+{
+#if defined (_WIN32)
+    return GetCurrentThreadId();
+#elif defined (__linux__)
+    return syscall(SYS_gettid);    /* raw gettid system call */
+#else
+    return getpid();    /* other hosts: fall back to the process id */
+#endif
+}
+
 #ifdef _WIN32
 
 /* mingw32 needs ffs for compilations without optimization. */
Index: qemu/osdep.h
===================================================================
--- qemu.orig/osdep.h
+++ qemu/osdep.h
@@ -126,6 +126,7 @@ void qemu_vfree(void *ptr);
 int qemu_madvise(void *addr, size_t len, int advice);
 
 int qemu_create_pidfile(const char *filename);
+int get_thread_id(void);
 
 #ifdef _WIN32
 int ffs(int i);
Index: qemu/monitor.c
===================================================================
--- qemu.orig/monitor.c
+++ qemu/monitor.c
@@ -878,6 +878,9 @@ static void print_cpu_iter(QObject *obj,
         monitor_printf(mon, " (halted)");
     }
 
+    monitor_printf(mon, " thread_id=%" PRId64 " ",
+					qdict_get_int(cpu, "thread_id"));
+
     monitor_printf(mon, "\n");
 }
 
@@ -922,6 +925,7 @@ static void do_info_cpus(Monitor *mon, Q
 #elif defined(TARGET_MIPS)
         qdict_put(cpu, "PC", qint_from_int(env->active_tc.PC));
 #endif
+        qdict_put(cpu, "thread_id", qint_from_int(env->thread_id));
 
         qlist_append(cpu_list, cpu);
     }



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 3/8] Expose thread_id in info cpus
@ 2010-10-04 18:54   ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: thread-id --]
[-- Type: text/plain, Size: 3810 bytes --]

commit ce6325ff1af34dbaee91c8d28e792277e43f1227
Author: Glauber Costa <gcosta@redhat.com>
Date:   Wed Mar 5 17:01:10 2008 -0300

    Augment info cpus
    
    This patch exposes the thread id associated with each
    cpu through the already well known 'info cpus' interface.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpu-defs.h
===================================================================
--- qemu.orig/cpu-defs.h
+++ qemu/cpu-defs.h
@@ -197,6 +197,7 @@ typedef struct CPUWatchpoint {
     int nr_cores;  /* number of cores within this CPU package */        \
     int nr_threads;/* number of threads within this CPU */              \
     int running; /* Nonzero if cpu is currently running(usermode).  */  \
+    int thread_id;                                                      \
     /* user data */                                                     \
     void *opaque;                                                       \
                                                                         \
Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -539,6 +539,7 @@ static void *kvm_cpu_thread_fn(void *arg
 
     qemu_mutex_lock(&qemu_global_mutex);
     qemu_thread_self(env->thread);
+    env->thread_id = get_thread_id();
     if (kvm_enabled())
         kvm_init_vcpu(env);
 
@@ -578,6 +579,10 @@ static void *tcg_cpu_thread_fn(void *arg
     while (!qemu_system_ready)
         qemu_cond_timedwait(&qemu_system_cond, &qemu_global_mutex, 100);
 
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        env->thread_id = get_thread_id();
+    }
+
     while (1) {
         cpu_exec_all();
         qemu_tcg_wait_io_event();
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -637,6 +637,7 @@ void cpu_exec_init(CPUState *env)
     env->numa_node = 0;
     QTAILQ_INIT(&env->breakpoints);
     QTAILQ_INIT(&env->watchpoints);
+    env->thread_id = get_thread_id();
     *penv = env;
 #if defined(CONFIG_USER_ONLY)
     cpu_list_unlock();
Index: qemu/osdep.c
===================================================================
--- qemu.orig/osdep.c
+++ qemu/osdep.c
@@ -44,6 +44,10 @@
 extern int madvise(caddr_t, size_t, int);
 #endif
 
+#ifdef CONFIG_LINUX
+#include <sys/syscall.h>
+#endif
+
 #ifdef CONFIG_EVENTFD
 #include <sys/eventfd.h>
 #endif
@@ -200,6 +204,17 @@ int qemu_create_pidfile(const char *file
     return 0;
 }
 
+int get_thread_id(void)    /* OS-level identifier of the calling thread */
+{
+#if defined (_WIN32)
+    return GetCurrentThreadId();
+#elif defined (__linux__)
+    return syscall(SYS_gettid);    /* raw gettid system call */
+#else
+    return getpid();    /* other hosts: fall back to the process id */
+#endif
+}
+
 #ifdef _WIN32
 
 /* mingw32 needs ffs for compilations without optimization. */
Index: qemu/osdep.h
===================================================================
--- qemu.orig/osdep.h
+++ qemu/osdep.h
@@ -126,6 +126,7 @@ void qemu_vfree(void *ptr);
 int qemu_madvise(void *addr, size_t len, int advice);
 
 int qemu_create_pidfile(const char *filename);
+int get_thread_id(void);
 
 #ifdef _WIN32
 int ffs(int i);
Index: qemu/monitor.c
===================================================================
--- qemu.orig/monitor.c
+++ qemu/monitor.c
@@ -878,6 +878,9 @@ static void print_cpu_iter(QObject *obj,
         monitor_printf(mon, " (halted)");
     }
 
+    monitor_printf(mon, " thread_id=%" PRId64 " ",
+					qdict_get_int(cpu, "thread_id"));
+
     monitor_printf(mon, "\n");
 }
 
@@ -922,6 +925,7 @@ static void do_info_cpus(Monitor *mon, Q
 #elif defined(TARGET_MIPS)
         qdict_put(cpu, "PC", qint_from_int(env->active_tc.PC));
 #endif
+        qdict_put(cpu, "thread_id", qint_from_int(env->thread_id));
 
         qlist_append(cpu_list, cpu);
     }

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 4/8] kvm: x86: add mce support
  2010-10-04 18:54 ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-04 18:54   ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: mce --]
[-- Type: text/plain, Size: 4542 bytes --]

Port qemu-kvm's MCE support

commit c68b2374c9048812f488e00ffb95db66c0bc07a7
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Jul 20 10:00:53 2009 +0800

    Add MCE simulation support to qemu/kvm
    
    KVM ioctls are used to initialize MCE simulation and inject MCE. The
    real MCE simulation is implemented in Linux kernel. The Kernel part
    has been merged.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/target-i386/helper.c
===================================================================
--- qemu.orig/target-i386/helper.c
+++ qemu/target-i386/helper.c
@@ -27,6 +27,7 @@
 #include "exec-all.h"
 #include "qemu-common.h"
 #include "kvm.h"
+#include "kvm_x86.h"
 
 //#define DEBUG_MMU
 
@@ -1030,6 +1031,11 @@ void cpu_inject_x86_mce(CPUState *cenv, 
     if (bank >= bank_num || !(status & MCI_STATUS_VAL))
         return;
 
+    if (kvm_enabled()) {
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        return;
+    }
+
     /*
      * if MSR_MCG_CTL is not all 1s, the uncorrected error
      * reporting is disabled
Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -27,6 +27,7 @@
 #include "hw/pc.h"
 #include "hw/apic.h"
 #include "ioport.h"
+#include "kvm_x86.h"
 
 #ifdef CONFIG_KVM_PARA
 #include <linux/kvm_para.h>
@@ -167,6 +168,67 @@ static int get_para_features(CPUState *e
 }
 #endif
 
+#ifdef KVM_CAP_MCE
+static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
+                                     int *max_banks)
+{
+    int r;
+
+    r = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_MCE);
+    if (r > 0) {
+        *max_banks = r;
+        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
+    }
+    return -ENOSYS;
+}
+
+static int kvm_setup_mce(CPUState *env, uint64_t *mcg_cap)
+{
+    return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
+}
+
+static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
+{
+    return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
+}
+
+struct kvm_x86_mce_data
+{
+    CPUState *env;
+    struct kvm_x86_mce *mce;
+};
+
+static void kvm_do_inject_x86_mce(void *_data)
+{
+    struct kvm_x86_mce_data *data = _data;
+    int r;
+
+    r = kvm_set_mce(data->env, data->mce);
+    if (r < 0)
+        perror("kvm_set_mce FAILED");
+}
+#endif
+
+void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+{
+#ifdef KVM_CAP_MCE
+    struct kvm_x86_mce mce = {
+        .bank = bank,
+        .status = status,
+        .mcg_status = mcg_status,
+        .addr = addr,
+        .misc = misc,
+    };
+    struct kvm_x86_mce_data data = {
+            .env = cenv,
+            .mce = &mce,
+    };
+
+    run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#endif
+}
+
 int kvm_arch_init_vcpu(CPUState *env)
 {
     struct {
@@ -274,6 +336,28 @@ int kvm_arch_init_vcpu(CPUState *env)
 
     cpuid_data.cpuid.nent = cpuid_i;
 
+#ifdef KVM_CAP_MCE
+    if (((env->cpuid_version >> 8)&0xF) >= 6
+        && (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
+        && kvm_check_extension(env->kvm_state, KVM_CAP_MCE) > 0) {
+        uint64_t mcg_cap;
+        int banks;
+
+        if (kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks))
+            perror("kvm_get_mce_cap_supported FAILED");
+        else {
+            if (banks > MCE_BANKS_DEF)
+                banks = MCE_BANKS_DEF;
+            mcg_cap &= MCE_CAP_DEF;
+            mcg_cap |= banks;
+            if (kvm_setup_mce(env, &mcg_cap))
+                perror("kvm_setup_mce FAILED");
+            else
+                env->mcg_cap = mcg_cap;
+        }
+    }
+#endif
+
     return kvm_vcpu_ioctl(env, KVM_SET_CPUID2, &cpuid_data);
 }
 
Index: qemu/target-i386/kvm_x86.h
===================================================================
--- /dev/null
+++ qemu/target-i386/kvm_x86.h
@@ -0,0 +1,21 @@
+/*
+ * QEMU KVM support
+ *
+ * Copyright (C) 2009 Red Hat Inc.
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __KVM_X86_H__
+#define __KVM_X86_H__
+
+void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+
+#endif



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 4/8] kvm: x86: add mce support
@ 2010-10-04 18:54   ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: mce --]
[-- Type: text/plain, Size: 4540 bytes --]

Port qemu-kvm's MCE support

commit c68b2374c9048812f488e00ffb95db66c0bc07a7
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Jul 20 10:00:53 2009 +0800

    Add MCE simulation support to qemu/kvm
    
    KVM ioctls are used to initialize MCE simulation and inject MCE. The
    real MCE simulation is implemented in Linux kernel. The Kernel part
    has been merged.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/target-i386/helper.c
===================================================================
--- qemu.orig/target-i386/helper.c
+++ qemu/target-i386/helper.c
@@ -27,6 +27,7 @@
 #include "exec-all.h"
 #include "qemu-common.h"
 #include "kvm.h"
+#include "kvm_x86.h"
 
 //#define DEBUG_MMU
 
@@ -1030,6 +1031,11 @@ void cpu_inject_x86_mce(CPUState *cenv, 
     if (bank >= bank_num || !(status & MCI_STATUS_VAL))
         return;
 
+    if (kvm_enabled()) {
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        return;
+    }
+
     /*
      * if MSR_MCG_CTL is not all 1s, the uncorrected error
      * reporting is disabled
Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -27,6 +27,7 @@
 #include "hw/pc.h"
 #include "hw/apic.h"
 #include "ioport.h"
+#include "kvm_x86.h"
 
 #ifdef CONFIG_KVM_PARA
 #include <linux/kvm_para.h>
@@ -167,6 +168,67 @@ static int get_para_features(CPUState *e
 }
 #endif
 
+#ifdef KVM_CAP_MCE
+static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
+                                     int *max_banks)
+{
+    int r;
+
+    r = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_MCE);
+    if (r > 0) {
+        *max_banks = r;
+        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
+    }
+    return -ENOSYS;
+}
+
+static int kvm_setup_mce(CPUState *env, uint64_t *mcg_cap)
+{
+    return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
+}
+
+static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
+{
+    return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
+}
+
+struct kvm_x86_mce_data
+{
+    CPUState *env;
+    struct kvm_x86_mce *mce;
+};
+
+static void kvm_do_inject_x86_mce(void *_data)
+{
+    struct kvm_x86_mce_data *data = _data;
+    int r;
+
+    r = kvm_set_mce(data->env, data->mce);
+    if (r < 0)
+        perror("kvm_set_mce FAILED");
+}
+#endif
+
+void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+{
+#ifdef KVM_CAP_MCE
+    struct kvm_x86_mce mce = {
+        .bank = bank,
+        .status = status,
+        .mcg_status = mcg_status,
+        .addr = addr,
+        .misc = misc,
+    };
+    struct kvm_x86_mce_data data = {
+            .env = cenv,
+            .mce = &mce,
+    };
+
+    run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#endif
+}
+
 int kvm_arch_init_vcpu(CPUState *env)
 {
     struct {
@@ -274,6 +336,28 @@ int kvm_arch_init_vcpu(CPUState *env)
 
     cpuid_data.cpuid.nent = cpuid_i;
 
+#ifdef KVM_CAP_MCE
+    if (((env->cpuid_version >> 8)&0xF) >= 6
+        && (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
+        && kvm_check_extension(env->kvm_state, KVM_CAP_MCE) > 0) {
+        uint64_t mcg_cap;
+        int banks;
+
+        if (kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks))
+            perror("kvm_get_mce_cap_supported FAILED");
+        else {
+            if (banks > MCE_BANKS_DEF)
+                banks = MCE_BANKS_DEF;
+            mcg_cap &= MCE_CAP_DEF;
+            mcg_cap |= banks;
+            if (kvm_setup_mce(env, &mcg_cap))
+                perror("kvm_setup_mce FAILED");
+            else
+                env->mcg_cap = mcg_cap;
+        }
+    }
+#endif
+
     return kvm_vcpu_ioctl(env, KVM_SET_CPUID2, &cpuid_data);
 }
 
Index: qemu/target-i386/kvm_x86.h
===================================================================
--- /dev/null
+++ qemu/target-i386/kvm_x86.h
@@ -0,0 +1,21 @@
+/*
+ * QEMU KVM support
+ *
+ * Copyright (C) 2009 Red Hat Inc.
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __KVM_X86_H__
+#define __KVM_X86_H__
+
+void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+
+#endif

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 5/8] Export qemu_ram_addr_from_host
  2010-10-04 18:54 ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-04 18:54   ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: do_qemu_ram_addr_from_host --]
[-- Type: text/plain, Size: 1918 bytes --]

To be used by next patches.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpu-common.h
===================================================================
--- qemu.orig/cpu-common.h
+++ qemu/cpu-common.h
@@ -47,6 +47,7 @@ void qemu_ram_free(ram_addr_t addr);
 /* This should only be used for ram local to a device.  */
 void *qemu_get_ram_ptr(ram_addr_t addr);
 /* This should not be used by devices.  */
+int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
 ram_addr_t qemu_ram_addr_from_host(void *ptr);
 
 int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -2938,23 +2938,31 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
     return NULL;
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr)
+int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
 {
     RAMBlock *block;
     uint8_t *host = ptr;
 
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (host - block->host < block->length) {
-            return block->offset + (host - block->host);
+            *ram_addr = block->offset + (host - block->host);
+            return 0;
         }
     }
+    return -1;
+}
 
-    fprintf(stderr, "Bad ram pointer %p\n", ptr);
-    abort();
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+ram_addr_t qemu_ram_addr_from_host(void *ptr)
+{
+    ram_addr_t ram_addr;
 
-    return 0;
+    if (do_qemu_ram_addr_from_host(ptr, &ram_addr)) {
+        fprintf(stderr, "Bad ram pointer %p\n", ptr);
+        abort();
+    }
+    return ram_addr;
 }
 
 static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 5/8] Export qemu_ram_addr_from_host
@ 2010-10-04 18:54   ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: do_qemu_ram_addr_from_host --]
[-- Type: text/plain, Size: 1916 bytes --]

To be used by next patches.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpu-common.h
===================================================================
--- qemu.orig/cpu-common.h
+++ qemu/cpu-common.h
@@ -47,6 +47,7 @@ void qemu_ram_free(ram_addr_t addr);
 /* This should only be used for ram local to a device.  */
 void *qemu_get_ram_ptr(ram_addr_t addr);
 /* This should not be used by devices.  */
+int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
 ram_addr_t qemu_ram_addr_from_host(void *ptr);
 
 int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -2938,23 +2938,31 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
     return NULL;
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr)
+int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
 {
     RAMBlock *block;
     uint8_t *host = ptr;
 
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (host - block->host < block->length) {
-            return block->offset + (host - block->host);
+            *ram_addr = block->offset + (host - block->host);
+            return 0;
         }
     }
+    return -1;
+}
 
-    fprintf(stderr, "Bad ram pointer %p\n", ptr);
-    abort();
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+ram_addr_t qemu_ram_addr_from_host(void *ptr)
+{
+    ram_addr_t ram_addr;
 
-    return 0;
+    if (do_qemu_ram_addr_from_host(ptr, &ram_addr)) {
+        fprintf(stderr, "Bad ram pointer %p\n", ptr);
+        abort();
+    }
+    return ram_addr;
 }
 
 static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 6/8] Add RAM -> physical addr mapping in MCE simulation
  2010-10-04 18:54 ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-04 18:54   ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: kvm_physical_memory_addr_from_ram --]
[-- Type: text/plain, Size: 1710 bytes --]

From: Huang Ying <ying.huang@intel.com>

In QEMU-KVM, the physical address is not the same as the RAM address,
and MCE simulation needs the physical address rather than the RAM
address. So kvm_physical_memory_addr_from_ram() is implemented to do
the conversion, and it is invoked before the address is filled into the
IA32_MCi_ADDR MSR.

Reported-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/kvm-all.c
===================================================================
--- qemu.orig/kvm-all.c
+++ qemu/kvm-all.c
@@ -137,6 +137,24 @@ static KVMSlot *kvm_lookup_overlapping_s
     return found;
 }
 
+int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+                                      target_phys_addr_t *phys_addr)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
+        KVMSlot *mem = &s->slots[i];
+
+        if (ram_addr >= mem->phys_offset &&
+            ram_addr < mem->phys_offset + mem->memory_size) {
+            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
 static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
 {
     struct kvm_userspace_memory_region mem;
Index: qemu/kvm.h
===================================================================
--- qemu.orig/kvm.h
+++ qemu/kvm.h
@@ -174,6 +174,9 @@ static inline void cpu_synchronize_post_
     }
 }
 
+int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+                                      target_phys_addr_t *phys_addr);
+
 #endif
 int kvm_set_ioeventfd_mmio_long(int fd, uint32_t adr, uint32_t val, bool assign);
 



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 6/8] Add RAM -> physical addr mapping in MCE simulation
@ 2010-10-04 18:54   ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: kvm_physical_memory_addr_from_ram --]
[-- Type: text/plain, Size: 1708 bytes --]

From: Huang Ying <ying.huang@intel.com>

In QEMU-KVM, the physical address is not the same as the RAM address,
and MCE simulation needs the physical address rather than the RAM
address. So kvm_physical_memory_addr_from_ram() is implemented to do
the conversion, and it is invoked before the address is filled into the
IA32_MCi_ADDR MSR.

Reported-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/kvm-all.c
===================================================================
--- qemu.orig/kvm-all.c
+++ qemu/kvm-all.c
@@ -137,6 +137,24 @@ static KVMSlot *kvm_lookup_overlapping_s
     return found;
 }
 
+int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+                                      target_phys_addr_t *phys_addr)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
+        KVMSlot *mem = &s->slots[i];
+
+        if (ram_addr >= mem->phys_offset &&
+            ram_addr < mem->phys_offset + mem->memory_size) {
+            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
 static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
 {
     struct kvm_userspace_memory_region mem;
Index: qemu/kvm.h
===================================================================
--- qemu.orig/kvm.h
+++ qemu/kvm.h
@@ -174,6 +174,9 @@ static inline void cpu_synchronize_post_
     }
 }
 
+int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+                                      target_phys_addr_t *phys_addr);
+
 #endif
 int kvm_set_ioeventfd_mmio_long(int fd, uint32_t adr, uint32_t val, bool assign);
 

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-04 18:54 ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-04 18:54   ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: kvm-mce-sigbus --]
[-- Type: text/plain, Size: 14890 bytes --]

Port qemu-kvm's

commit 4b62fff1101a7ad77553147717a8bd3bf79df7ef
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Sep 21 10:43:25 2009 +0800

    MCE: Relay UCR MCE to guest
    
    UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
    where some hardware error such as some memory error can be reported
    without PCC (processor context corrupted). To recover from such MCE,
    the corresponding memory will be unmapped, and all processes accessing
    the memory will be killed via SIGBUS.
    
    For KVM, if QEMU/KVM is killed, all guest processes will be killed
    too. So we relay SIGBUS from host OS to guest system via a UCR MCE
    injection. Then guest OS can isolate corresponding memory and kill
    necessary guest processes only. SIGBUS sent to main thread (not VCPU
    threads) will be broadcast to all VCPU threads as UCR MCE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -34,6 +34,10 @@
 
 #include "cpus.h"
 #include "compatfd.h"
+#ifdef CONFIG_LINUX
+#include <sys/prctl.h>
+#include <sys/signalfd.h>
+#endif
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -41,6 +45,10 @@
 #define SIG_IPI SIGUSR1
 #endif
 
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
 static CPUState *next_cpu;
 
 /***********************************************************/
@@ -498,28 +506,77 @@ static void qemu_tcg_wait_io_event(void)
     }
 }
 
+static void sigbus_reraise(void)
+{
+    sigset_t set;
+    struct sigaction action;
+
+    memset(&action, 0, sizeof(action));
+    action.sa_handler = SIG_DFL;
+    if (!sigaction(SIGBUS, &action, NULL)) {
+        raise(SIGBUS);
+        sigemptyset(&set);
+        sigaddset(&set, SIGBUS);
+        sigprocmask(SIG_UNBLOCK, &set, NULL);
+    }
+    perror("Failed to re-raise SIGBUS!\n");
+    abort();
+}
+
+static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
+                           void *ctx)
+{
+#if defined(TARGET_I386)
+    if (kvm_on_sigbus_vcpu(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
+#endif
+        sigbus_reraise();
+}
+
 static void qemu_kvm_eat_signal(CPUState *env, int timeout)
 {
     struct timespec ts;
     int r, e;
     siginfo_t siginfo;
     sigset_t waitset;
+    sigset_t chkset;
 
     ts.tv_sec = timeout / 1000;
     ts.tv_nsec = (timeout % 1000) * 1000000;
 
     sigemptyset(&waitset);
     sigaddset(&waitset, SIG_IPI);
+    sigaddset(&waitset, SIGBUS);
 
-    qemu_mutex_unlock(&qemu_global_mutex);
-    r = sigtimedwait(&waitset, &siginfo, &ts);
-    e = errno;
-    qemu_mutex_lock(&qemu_global_mutex);
+    do {
+        qemu_mutex_unlock(&qemu_global_mutex);
 
-    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
-        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
-        exit(1);
-    }
+        r = sigtimedwait(&waitset, &siginfo, &ts);
+        e = errno;
+
+        qemu_mutex_lock(&qemu_global_mutex);
+
+        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
+            exit(1);
+        }
+
+        switch (r) {
+        case SIGBUS:
+#ifdef TARGET_I386
+            if (kvm_on_sigbus(env, siginfo.si_code, siginfo.si_addr))
+#endif
+                sigbus_reraise();
+            break;
+        default:
+            break;
+        }
+
+        r = sigpending(&chkset);
+        if (r == -1) {
+            fprintf(stderr, "sigpending: %s\n", strerror(e));
+            exit(1);
+        }
+    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 }
 
 static void qemu_kvm_wait_io_event(CPUState *env)
@@ -645,6 +702,7 @@ static void kvm_init_ipi(CPUState *env)
 
     pthread_sigmask(SIG_BLOCK, NULL, &set);
     sigdelset(&set, SIG_IPI);
+    sigdelset(&set, SIGBUS);
     r = kvm_set_signal_mask(env, &set);
     if (r) {
         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(r));
@@ -655,6 +713,7 @@ static void kvm_init_ipi(CPUState *env)
 static sigset_t block_io_signals(void)
 {
     sigset_t set;
+    struct sigaction action;
 
     /* SIGUSR2 used by posix-aio-compat.c */
     sigemptyset(&set);
@@ -665,8 +724,15 @@ static sigset_t block_io_signals(void)
     sigaddset(&set, SIGIO);
     sigaddset(&set, SIGALRM);
     sigaddset(&set, SIG_IPI);
+    sigaddset(&set, SIGBUS);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
 
+    memset(&action, 0, sizeof(action));
+    action.sa_flags = SA_SIGINFO;
+    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+    sigaction(SIGBUS, &action, NULL);
+    prctl(PR_MCE_KILL, 1, 1, 0, 0);
+
     return set;
 }
 
Index: qemu/kvm.h
===================================================================
--- qemu.orig/kvm.h
+++ qemu/kvm.h
@@ -110,6 +110,9 @@ int kvm_arch_init_vcpu(CPUState *env);
 
 void kvm_arch_reset_vcpu(CPUState *env);
 
+int kvm_on_sigbus(CPUState *env, int code, void *addr);
+int kvm_on_sigbus_vcpu(int code, void *addr);
+
 struct kvm_guest_debug;
 struct kvm_debug_exit_arch;
 
Index: qemu/target-i386/cpu.h
===================================================================
--- qemu.orig/target-i386/cpu.h
+++ qemu/target-i386/cpu.h
@@ -250,16 +250,32 @@
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
-#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
+#define MCG_CTL_P	(1ULL<<8)   /* MCG_CTL register available */
+#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
 
-#define MCE_CAP_DEF	MCG_CTL_P
+#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
 #define MCE_BANKS_DEF	10
 
+#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
 
 #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
 #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
 #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF	0	/* segment offset */
+#define MCM_ADDR_LINEAR	1	/* linear address */
+#define MCM_ADDR_PHYS	2	/* physical address */
+#define MCM_ADDR_MEM	3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 #define MSR_IA32_TSC                    0x10
 #define MSR_IA32_APICBASE               0x1b
Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -46,6 +46,13 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
 #ifdef KVM_CAP_EXT_CPUID
 
 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
@@ -192,10 +199,39 @@ static int kvm_set_mce(CPUState *env, st
     return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
 }
 
+static int kvm_get_msr(CPUState *env, struct kvm_msr_entry *msrs, int n)
+{
+    struct kvm_msrs *kmsrs = qemu_malloc(sizeof *kmsrs + n * sizeof *msrs);
+    int r;
+
+    kmsrs->nmsrs = n;
+    memcpy(kmsrs->entries, msrs, n * sizeof *msrs);
+    r = kvm_vcpu_ioctl(env, KVM_GET_MSRS, kmsrs);
+    memcpy(msrs, kmsrs->entries, n * sizeof *msrs);
+    free(kmsrs);
+    return r;
+}
+
+/* FIXME: kill this and kvm_get_msr, use env->mcg_status instead */
+static int kvm_mce_in_exception(CPUState *env)
+{
+    struct kvm_msr_entry msr_mcg_status = {
+        .index = MSR_MCG_STATUS,
+    };
+    int r;
+
+    r = kvm_get_msr(env, &msr_mcg_status, 1);
+    if (r == -1 || r == 0) {
+        return -1;
+    }
+    return !!(msr_mcg_status.data & MCG_STATUS_MCIP);
+}
+
 struct kvm_x86_mce_data
 {
     CPUState *env;
     struct kvm_x86_mce *mce;
+    int abort_on_error;
 };
 
 static void kvm_do_inject_x86_mce(void *_data)
@@ -203,14 +239,26 @@ static void kvm_do_inject_x86_mce(void *
     struct kvm_x86_mce_data *data = _data;
     int r;
 
+    /* If there is an MCE exception being processed, ignore this SRAO MCE */
+    r = kvm_mce_in_exception(data->env);
+    if (r == -1)
+        fprintf(stderr, "Failed to get MCE status\n");
+    else if (r && !(data->mce->status & MCI_STATUS_AR))
+        return;
+
     r = kvm_set_mce(data->env, data->mce);
-    if (r < 0)
+    if (r < 0) {
         perror("kvm_set_mce FAILED");
+        if (data->abort_on_error) {
+            abort();
+        }
+    }
 }
 #endif
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error)
 {
 #ifdef KVM_CAP_MCE
     struct kvm_x86_mce mce = {
@@ -225,7 +273,15 @@ void kvm_inject_x86_mce(CPUState *cenv, 
             .mce = &mce,
     };
 
+    if (!cenv->mcg_cap) {
+        fprintf(stderr, "MCE support is not enabled!\n");
+        return;
+    }
+
     run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#else
+    if (abort_on_error)
+        abort();
 #endif
 }
 
@@ -1525,3 +1581,122 @@ bool kvm_arch_stop_on_emulation_error(CP
               ((env->segs[R_CS].selector  & 3) != 3);
 }
 
+static void hardware_memory_error(void)
+{
+    fprintf(stderr, "Hardware memory error!\n");
+    exit(1);
+}
+
+int kvm_on_sigbus(CPUState *env, int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    struct kvm_x86_mce mce = {
+            .bank = 9,
+    };
+    void *vaddr;
+    ram_addr_t ram_addr;
+    unsigned long paddr;
+    int r;
+
+    if (env->mcg_cap && addr
+        && (code == BUS_MCEERR_AR
+            || code == BUS_MCEERR_AO)) {
+        if (code == BUS_MCEERR_AR) {
+            /* Fake an Intel architectural Data Load SRAR UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | MCI_STATUS_AR | 0x134;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+        } else {
+            /*
+             * If there is an MCE exception being processed, ignore
+             * this SRAO MCE
+             */
+            r = kvm_mce_in_exception(env);
+            if (r == -1) {
+                fprintf(stderr, "Failed to get MCE status\n");
+            } else if (r) {
+                return 0;
+            }
+            /* Fake an Intel architectural Memory scrubbing UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | 0xc0;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+        }
+        vaddr = (void *)addr;
+        if (do_qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instaed of guest system!\n");
+            /* Hope we are lucky for AO MCE */
+            if (code == BUS_MCEERR_AO) {
+                return 0;
+            } else {
+                hardware_memory_error();
+            }
+        }
+        mce.addr = paddr;
+        r = kvm_set_mce(env, &mce);
+        if (r < 0) {
+            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+            abort();
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int kvm_on_sigbus_vcpu(int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    if (first_cpu->mcg_cap && addr && code == BUS_MCEERR_AO) {
+        uint64_t status;
+        void *vaddr;
+        ram_addr_t ram_addr;
+        unsigned long paddr;
+        CPUState *cenv;
+
+        /* Hope we are lucky for AO MCE */
+        vaddr = addr;
+        if (do_qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!: %p\n", addr);
+            return 0;
+        }
+        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+            | 0xc0;
+        kvm_inject_x86_mce(first_cpu, 9, status,
+                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
+                           (MCM_ADDR_PHYS << 6) | 0xc, 1);
+        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) {
+            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
Index: qemu/target-i386/helper.c
===================================================================
--- qemu.orig/target-i386/helper.c
+++ qemu/target-i386/helper.c
@@ -1032,7 +1032,7 @@ void cpu_inject_x86_mce(CPUState *cenv, 
         return;
 
     if (kvm_enabled()) {
-        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
         return;
     }
 
Index: qemu/target-i386/kvm_x86.h
===================================================================
--- qemu.orig/target-i386/kvm_x86.h
+++ qemu/target-i386/kvm_x86.h
@@ -16,6 +16,7 @@
 #define __KVM_X86_H__
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error);
 
 #endif
Index: qemu/kvm-stub.c
===================================================================
--- qemu.orig/kvm-stub.c
+++ qemu/kvm-stub.c
@@ -141,3 +141,9 @@ int kvm_set_ioeventfd_mmio_long(int fd, 
 {
     return -ENOSYS;
 }
+
+int kvm_on_sigbus_vcpu(int code, void *addr)
+{
+    return 1;
+}
+



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-04 18:54   ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: kvm-mce-sigbus --]
[-- Type: text/plain, Size: 14888 bytes --]

Port qemu-kvm's

commit 4b62fff1101a7ad77553147717a8bd3bf79df7ef
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Sep 21 10:43:25 2009 +0800

    MCE: Relay UCR MCE to guest
    
    UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
    where some hardware error such as some memory error can be reported
    without PCC (processor context corrupted). To recover from such MCE,
    the corresponding memory will be unmapped, and all processes accessing
    the memory will be killed via SIGBUS.
    
    For KVM, if QEMU/KVM is killed, all guest processes will be killed
    too. So we relay SIGBUS from host OS to guest system via a UCR MCE
    injection. Then guest OS can isolate corresponding memory and kill
    necessary guest processes only. SIGBUS sent to main thread (not VCPU
    threads) will be broadcast to all VCPU threads as UCR MCE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -34,6 +34,10 @@
 
 #include "cpus.h"
 #include "compatfd.h"
+#ifdef CONFIG_LINUX
+#include <sys/prctl.h>
+#include <sys/signalfd.h>
+#endif
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -41,6 +45,10 @@
 #define SIG_IPI SIGUSR1
 #endif
 
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
 static CPUState *next_cpu;
 
 /***********************************************************/
@@ -498,28 +506,77 @@ static void qemu_tcg_wait_io_event(void)
     }
 }
 
+static void sigbus_reraise(void)
+{
+    sigset_t set;
+    struct sigaction action;
+
+    memset(&action, 0, sizeof(action));
+    action.sa_handler = SIG_DFL;
+    if (!sigaction(SIGBUS, &action, NULL)) {
+        raise(SIGBUS);
+        sigemptyset(&set);
+        sigaddset(&set, SIGBUS);
+        sigprocmask(SIG_UNBLOCK, &set, NULL);
+    }
+    perror("Failed to re-raise SIGBUS!\n");
+    abort();
+}
+
+static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
+                           void *ctx)
+{
+#if defined(TARGET_I386)
+    if (kvm_on_sigbus_vcpu(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
+#endif
+        sigbus_reraise();
+}
+
 static void qemu_kvm_eat_signal(CPUState *env, int timeout)
 {
     struct timespec ts;
     int r, e;
     siginfo_t siginfo;
     sigset_t waitset;
+    sigset_t chkset;
 
     ts.tv_sec = timeout / 1000;
     ts.tv_nsec = (timeout % 1000) * 1000000;
 
     sigemptyset(&waitset);
     sigaddset(&waitset, SIG_IPI);
+    sigaddset(&waitset, SIGBUS);
 
-    qemu_mutex_unlock(&qemu_global_mutex);
-    r = sigtimedwait(&waitset, &siginfo, &ts);
-    e = errno;
-    qemu_mutex_lock(&qemu_global_mutex);
+    do {
+        qemu_mutex_unlock(&qemu_global_mutex);
 
-    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
-        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
-        exit(1);
-    }
+        r = sigtimedwait(&waitset, &siginfo, &ts);
+        e = errno;
+
+        qemu_mutex_lock(&qemu_global_mutex);
+
+        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
+            exit(1);
+        }
+
+        switch (r) {
+        case SIGBUS:
+#ifdef TARGET_I386
+            if (kvm_on_sigbus(env, siginfo.si_code, siginfo.si_addr))
+#endif
+                sigbus_reraise();
+            break;
+        default:
+            break;
+        }
+
+        r = sigpending(&chkset);
+        if (r == -1) {
+            fprintf(stderr, "sigpending: %s\n", strerror(e));
+            exit(1);
+        }
+    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 }
 
 static void qemu_kvm_wait_io_event(CPUState *env)
@@ -645,6 +702,7 @@ static void kvm_init_ipi(CPUState *env)
 
     pthread_sigmask(SIG_BLOCK, NULL, &set);
     sigdelset(&set, SIG_IPI);
+    sigdelset(&set, SIGBUS);
     r = kvm_set_signal_mask(env, &set);
     if (r) {
         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(r));
@@ -655,6 +713,7 @@ static void kvm_init_ipi(CPUState *env)
 static sigset_t block_io_signals(void)
 {
     sigset_t set;
+    struct sigaction action;
 
     /* SIGUSR2 used by posix-aio-compat.c */
     sigemptyset(&set);
@@ -665,8 +724,15 @@ static sigset_t block_io_signals(void)
     sigaddset(&set, SIGIO);
     sigaddset(&set, SIGALRM);
     sigaddset(&set, SIG_IPI);
+    sigaddset(&set, SIGBUS);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
 
+    memset(&action, 0, sizeof(action));
+    action.sa_flags = SA_SIGINFO;
+    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+    sigaction(SIGBUS, &action, NULL);
+    prctl(PR_MCE_KILL, 1, 1, 0, 0);
+
     return set;
 }
 
Index: qemu/kvm.h
===================================================================
--- qemu.orig/kvm.h
+++ qemu/kvm.h
@@ -110,6 +110,9 @@ int kvm_arch_init_vcpu(CPUState *env);
 
 void kvm_arch_reset_vcpu(CPUState *env);
 
+int kvm_on_sigbus(CPUState *env, int code, void *addr);
+int kvm_on_sigbus_vcpu(int code, void *addr);
+
 struct kvm_guest_debug;
 struct kvm_debug_exit_arch;
 
Index: qemu/target-i386/cpu.h
===================================================================
--- qemu.orig/target-i386/cpu.h
+++ qemu/target-i386/cpu.h
@@ -250,16 +250,32 @@
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
-#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
+#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
+#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
 
-#define MCE_CAP_DEF	MCG_CTL_P
+#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
 #define MCE_BANKS_DEF	10
 
+#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
 
 #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
 #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
 #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF	0	/* segment offset */
+#define MCM_ADDR_LINEAR	1	/* linear address */
+#define MCM_ADDR_PHYS	2	/* physical address */
+#define MCM_ADDR_MEM	3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 #define MSR_IA32_TSC                    0x10
 #define MSR_IA32_APICBASE               0x1b
Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -46,6 +46,13 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
 #ifdef KVM_CAP_EXT_CPUID
 
 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
@@ -192,10 +199,39 @@ static int kvm_set_mce(CPUState *env, st
     return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
 }
 
+static int kvm_get_msr(CPUState *env, struct kvm_msr_entry *msrs, int n)
+{
+    struct kvm_msrs *kmsrs = qemu_malloc(sizeof *kmsrs + n * sizeof *msrs);
+    int r;
+
+    kmsrs->nmsrs = n;
+    memcpy(kmsrs->entries, msrs, n * sizeof *msrs);
+    r = kvm_vcpu_ioctl(env, KVM_GET_MSRS, kmsrs);
+    memcpy(msrs, kmsrs->entries, n * sizeof *msrs);
+    free(kmsrs);
+    return r;
+}
+
+/* FIXME: kill this and kvm_get_msr, use env->mcg_status instead */
+static int kvm_mce_in_exception(CPUState *env)
+{
+    struct kvm_msr_entry msr_mcg_status = {
+        .index = MSR_MCG_STATUS,
+    };
+    int r;
+
+    r = kvm_get_msr(env, &msr_mcg_status, 1);
+    if (r == -1 || r == 0) {
+        return -1;
+    }
+    return !!(msr_mcg_status.data & MCG_STATUS_MCIP);
+}
+
 struct kvm_x86_mce_data
 {
     CPUState *env;
     struct kvm_x86_mce *mce;
+    int abort_on_error;
 };
 
 static void kvm_do_inject_x86_mce(void *_data)
@@ -203,14 +239,26 @@ static void kvm_do_inject_x86_mce(void *
     struct kvm_x86_mce_data *data = _data;
     int r;
 
+    /* If there is an MCE excpetion being processed, ignore this SRAO MCE */
+    r = kvm_mce_in_exception(data->env);
+    if (r == -1)
+        fprintf(stderr, "Failed to get MCE status\n");
+    else if (r && !(data->mce->status & MCI_STATUS_AR))
+        return;
+
     r = kvm_set_mce(data->env, data->mce);
-    if (r < 0)
+    if (r < 0) {
         perror("kvm_set_mce FAILED");
+        if (data->abort_on_error) {
+            abort();
+        }
+    }
 }
 #endif
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error)
 {
 #ifdef KVM_CAP_MCE
     struct kvm_x86_mce mce = {
@@ -225,7 +273,15 @@ void kvm_inject_x86_mce(CPUState *cenv, 
             .mce = &mce,
     };
 
+    if (!cenv->mcg_cap) {
+        fprintf(stderr, "MCE support is not enabled!\n");
+        return;
+    }
+
     run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#else
+    if (abort_on_error)
+        abort();
 #endif
 }
 
@@ -1525,3 +1581,122 @@ bool kvm_arch_stop_on_emulation_error(CP
               ((env->segs[R_CS].selector  & 3) != 3);
 }
 
+static void hardware_memory_error(void)
+{
+    fprintf(stderr, "Hardware memory error!\n");
+    exit(1);
+}
+
+int kvm_on_sigbus(CPUState *env, int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    struct kvm_x86_mce mce = {
+            .bank = 9,
+    };
+    void *vaddr;
+    ram_addr_t ram_addr;
+    unsigned long paddr;
+    int r;
+
+    if (env->mcg_cap && addr
+        && (code == BUS_MCEERR_AR
+            || code == BUS_MCEERR_AO)) {
+        if (code == BUS_MCEERR_AR) {
+            /* Fake an Intel architectural Data Load SRAR UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | MCI_STATUS_AR | 0x134;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+        } else {
+            /*
+             * If there is an MCE excpetion being processed, ignore
+             * this SRAO MCE
+             */
+            r = kvm_mce_in_exception(env);
+            if (r == -1) {
+                fprintf(stderr, "Failed to get MCE status\n");
+            } else if (r) {
+                return 0;
+            }
+            /* Fake an Intel architectural Memory scrubbing UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | 0xc0;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+        }
+        vaddr = (void *)addr;
+        if (do_qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instaed of guest system!\n");
+            /* Hope we are lucky for AO MCE */
+            if (code == BUS_MCEERR_AO) {
+                return 0;
+            } else {
+                hardware_memory_error();
+            }
+        }
+        mce.addr = paddr;
+        r = kvm_set_mce(env, &mce);
+        if (r < 0) {
+            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+            abort();
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int kvm_on_sigbus_vcpu(int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    if (first_cpu->mcg_cap && addr && code == BUS_MCEERR_AO) {
+        uint64_t status;
+        void *vaddr;
+        ram_addr_t ram_addr;
+        unsigned long paddr;
+        CPUState *cenv;
+
+        /* Hope we are lucky for AO MCE */
+        vaddr = addr;
+        if (do_qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!: %p\n", addr);
+            return 0;
+        }
+        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+            | 0xc0;
+        kvm_inject_x86_mce(first_cpu, 9, status,
+                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
+                           (MCM_ADDR_PHYS << 6) | 0xc, 1);
+        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) {
+            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
Index: qemu/target-i386/helper.c
===================================================================
--- qemu.orig/target-i386/helper.c
+++ qemu/target-i386/helper.c
@@ -1032,7 +1032,7 @@ void cpu_inject_x86_mce(CPUState *cenv, 
         return;
 
     if (kvm_enabled()) {
-        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
         return;
     }
 
Index: qemu/target-i386/kvm_x86.h
===================================================================
--- qemu.orig/target-i386/kvm_x86.h
+++ qemu/target-i386/kvm_x86.h
@@ -16,6 +16,7 @@
 #define __KVM_X86_H__
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error);
 
 #endif
Index: qemu/kvm-stub.c
===================================================================
--- qemu.orig/kvm-stub.c
+++ qemu/kvm-stub.c
@@ -141,3 +141,9 @@ int kvm_set_ioeventfd_mmio_long(int fd, 
 {
     return -ENOSYS;
 }
+
+int kvm_on_sigbus_vcpu(int code, void *addr)
+{
+    return 1;
+}
+

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 8/8] Add savevm/loadvm support for MCE
  2010-10-04 18:54 ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-04 18:54   ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: mce-save-restore --]
[-- Type: text/plain, Size: 2909 bytes --]

Port qemu-kvm's

commit 1bab5d11545d8de5facf46c28630085a2f9651ae
Author: Huang Ying <ying.huang@intel.com>
Date:   Wed Mar 3 16:52:46 2010 +0800

    Add savevm/loadvm support for MCE
    
    MCE registers are saved/load into/from CPUState in
    kvm_arch_save/load_regs. To simulate the MCG_STATUS clearing upon
    reset, MSR_MCG_STATUS is set to 0 for KVM_PUT_RESET_STATE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -774,7 +774,7 @@ static int kvm_put_msrs(CPUState *env, i
         struct kvm_msr_entry entries[100];
     } msr_data;
     struct kvm_msr_entry *msrs = msr_data.entries;
-    int n = 0;
+    int i, n = 0;
 
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
@@ -794,6 +794,18 @@ static int kvm_put_msrs(CPUState *env, i
                           env->system_time_msr);
         kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
     }
+#ifdef KVM_CAP_MCE
+    if (env->mcg_cap) {
+        if (level == KVM_PUT_RESET_STATE)
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
+        else if (level == KVM_PUT_FULL_STATE) {
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
+            for (i = 0; i < (env->mcg_cap & 0xff); i++)
+                kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
+        }
+    }
+#endif
 
     msr_data.info.nmsrs = n;
 
@@ -1001,6 +1013,15 @@ static int kvm_get_msrs(CPUState *env)
     msrs[n++].index = MSR_KVM_SYSTEM_TIME;
     msrs[n++].index = MSR_KVM_WALL_CLOCK;
 
+#ifdef KVM_CAP_MCE
+    if (env->mcg_cap) {
+        msrs[n++].index = MSR_MCG_STATUS;
+        msrs[n++].index = MSR_MCG_CTL;
+        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++)
+            msrs[n++].index = MSR_MC0_CTL + i;
+    }
+#endif
+
     msr_data.info.nmsrs = n;
     ret = kvm_vcpu_ioctl(env, KVM_GET_MSRS, &msr_data);
     if (ret < 0)
@@ -1043,6 +1064,22 @@ static int kvm_get_msrs(CPUState *env)
         case MSR_KVM_WALL_CLOCK:
             env->wall_clock_msr = msrs[i].data;
             break;
+#ifdef KVM_CAP_MCE
+        case MSR_MCG_STATUS:
+            env->mcg_status = msrs[i].data;
+            break;
+        case MSR_MCG_CTL:
+            env->mcg_ctl = msrs[i].data;
+            break;
+#endif
+        default:
+#ifdef KVM_CAP_MCE
+            if (msrs[i].index >= MSR_MC0_CTL &&
+                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
+                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
+                break;
+            }
+#endif
         }
     }
 



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 8/8] Add savevm/loadvm support for MCE
@ 2010-10-04 18:54   ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-04 18:54 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: mce-save-restore --]
[-- Type: text/plain, Size: 2907 bytes --]

Port qemu-kvm's

commit 1bab5d11545d8de5facf46c28630085a2f9651ae
Author: Huang Ying <ying.huang@intel.com>
Date:   Wed Mar 3 16:52:46 2010 +0800

    Add savevm/loadvm support for MCE
    
    MCE registers are saved/load into/from CPUState in
    kvm_arch_save/load_regs. To simulate the MCG_STATUS clearing upon
    reset, MSR_MCG_STATUS is set to 0 for KVM_PUT_RESET_STATE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -774,7 +774,7 @@ static int kvm_put_msrs(CPUState *env, i
         struct kvm_msr_entry entries[100];
     } msr_data;
     struct kvm_msr_entry *msrs = msr_data.entries;
-    int n = 0;
+    int i, n = 0;
 
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
@@ -794,6 +794,18 @@ static int kvm_put_msrs(CPUState *env, i
                           env->system_time_msr);
         kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
     }
+#ifdef KVM_CAP_MCE
+    if (env->mcg_cap) {
+        if (level == KVM_PUT_RESET_STATE)
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
+        else if (level == KVM_PUT_FULL_STATE) {
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
+            for (i = 0; i < (env->mcg_cap & 0xff); i++)
+                kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
+        }
+    }
+#endif
 
     msr_data.info.nmsrs = n;
 
@@ -1001,6 +1013,15 @@ static int kvm_get_msrs(CPUState *env)
     msrs[n++].index = MSR_KVM_SYSTEM_TIME;
     msrs[n++].index = MSR_KVM_WALL_CLOCK;
 
+#ifdef KVM_CAP_MCE
+    if (env->mcg_cap) {
+        msrs[n++].index = MSR_MCG_STATUS;
+        msrs[n++].index = MSR_MCG_CTL;
+        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++)
+            msrs[n++].index = MSR_MC0_CTL + i;
+    }
+#endif
+
     msr_data.info.nmsrs = n;
     ret = kvm_vcpu_ioctl(env, KVM_GET_MSRS, &msr_data);
     if (ret < 0)
@@ -1043,6 +1064,22 @@ static int kvm_get_msrs(CPUState *env)
         case MSR_KVM_WALL_CLOCK:
             env->wall_clock_msr = msrs[i].data;
             break;
+#ifdef KVM_CAP_MCE
+        case MSR_MCG_STATUS:
+            env->mcg_status = msrs[i].data;
+            break;
+        case MSR_MCG_CTL:
+            env->mcg_ctl = msrs[i].data;
+            break;
+#endif
+        default:
+#ifdef KVM_CAP_MCE
+            if (msrs[i].index >= MSR_MC0_CTL &&
+                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
+                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
+                break;
+            }
+#endif
         }
     }
 

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 5/8] Export qemu_ram_addr_from_host
  2010-10-04 18:54   ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-05 12:57     ` Anthony Liguori
  -1 siblings, 0 replies; 93+ messages in thread
From: Anthony Liguori @ 2010-10-05 12:57 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, qemu-devel, Huang Ying, Dean Nelson

On 10/04/2010 01:54 PM, Marcelo Tosatti wrote:
> To be used by next patches.
>
> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>
> Index: qemu/cpu-common.h
> ===================================================================
> --- qemu.orig/cpu-common.h
> +++ qemu/cpu-common.h
> @@ -47,6 +47,7 @@ void qemu_ram_free(ram_addr_t addr);
>   /* This should only be used for ram local to a device.  */
>   void *qemu_get_ram_ptr(ram_addr_t addr);
>   /* This should not be used by devices.  */
> +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
>    

This is not a great name for a function.  A better way to do this would 
be to make the existing qemu_ram_addr_from_host() -> 
qemu_ram_addr_from_host_nofail().

Regards,

Anthony Liguori

>   ram_addr_t qemu_ram_addr_from_host(void *ptr);
>
>   int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
> Index: qemu/exec.c
> ===================================================================
> --- qemu.orig/exec.c
> +++ qemu/exec.c
> @@ -2938,23 +2938,31 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
>       return NULL;
>   }
>
> -/* Some of the softmmu routines need to translate from a host pointer
> -   (typically a TLB entry) back to a ram offset.  */
> -ram_addr_t qemu_ram_addr_from_host(void *ptr)
> +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
>   {
>       RAMBlock *block;
>       uint8_t *host = ptr;
>
>       QLIST_FOREACH(block,&ram_list.blocks, next) {
>           if (host - block->host<  block->length) {
> -            return block->offset + (host - block->host);
> +            *ram_addr = block->offset + (host - block->host);
> +            return 0;
>           }
>       }
> +    return -1;
> +}
>
> -    fprintf(stderr, "Bad ram pointer %p\n", ptr);
> -    abort();
> +/* Some of the softmmu routines need to translate from a host pointer
> +   (typically a TLB entry) back to a ram offset.  */
> +ram_addr_t qemu_ram_addr_from_host(void *ptr)
> +{
> +    ram_addr_t ram_addr;
>
> -    return 0;
> +    if (do_qemu_ram_addr_from_host(ptr,&ram_addr)) {
> +        fprintf(stderr, "Bad ram pointer %p\n", ptr);
> +        abort();
> +    }
> +    return ram_addr;
>   }
>
>   static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>    


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 5/8] Export qemu_ram_addr_from_host
@ 2010-10-05 12:57     ` Anthony Liguori
  0 siblings, 0 replies; 93+ messages in thread
From: Anthony Liguori @ 2010-10-05 12:57 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Dean Nelson, qemu-devel, kvm, Huang Ying

On 10/04/2010 01:54 PM, Marcelo Tosatti wrote:
> To be used by next patches.
>
> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>
> Index: qemu/cpu-common.h
> ===================================================================
> --- qemu.orig/cpu-common.h
> +++ qemu/cpu-common.h
> @@ -47,6 +47,7 @@ void qemu_ram_free(ram_addr_t addr);
>   /* This should only be used for ram local to a device.  */
>   void *qemu_get_ram_ptr(ram_addr_t addr);
>   /* This should not be used by devices.  */
> +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
>    

This is not a great name for a function.  A better way to do this would 
be to make the existing qemu_ram_addr_from_host() -> 
qemu_ram_addr_from_host_nofail().

Regards,

Anthony Liguori

>   ram_addr_t qemu_ram_addr_from_host(void *ptr);
>
>   int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
> Index: qemu/exec.c
> ===================================================================
> --- qemu.orig/exec.c
> +++ qemu/exec.c
> @@ -2938,23 +2938,31 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
>       return NULL;
>   }
>
> -/* Some of the softmmu routines need to translate from a host pointer
> -   (typically a TLB entry) back to a ram offset.  */
> -ram_addr_t qemu_ram_addr_from_host(void *ptr)
> +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
>   {
>       RAMBlock *block;
>       uint8_t *host = ptr;
>
>       QLIST_FOREACH(block,&ram_list.blocks, next) {
>           if (host - block->host<  block->length) {
> -            return block->offset + (host - block->host);
> +            *ram_addr = block->offset + (host - block->host);
> +            return 0;
>           }
>       }
> +    return -1;
> +}
>
> -    fprintf(stderr, "Bad ram pointer %p\n", ptr);
> -    abort();
> +/* Some of the softmmu routines need to translate from a host pointer
> +   (typically a TLB entry) back to a ram offset.  */
> +ram_addr_t qemu_ram_addr_from_host(void *ptr)
> +{
> +    ram_addr_t ram_addr;
>
> -    return 0;
> +    if (do_qemu_ram_addr_from_host(ptr,&ram_addr)) {
> +        fprintf(stderr, "Bad ram pointer %p\n", ptr);
> +        abort();
> +    }
> +    return ram_addr;
>   }
>
>   static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>    

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [Qemu-devel] [patch uq/master 0/8] port qemu-kvm's MCE support
  2010-10-04 18:54 ` [Qemu-devel] " Marcelo Tosatti
                   ` (8 preceding siblings ...)
  (?)
@ 2010-10-05 16:31 ` Andreas Färber
  2010-10-05 18:58   ` Chris Wright
  -1 siblings, 1 reply; 93+ messages in thread
From: Andreas Färber @ 2010-10-05 16:31 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: QEMU Developers, kvm

Marcelo,

Am 04.10.2010 um 20:54 schrieb Marcelo Tosatti:

>
>
>

I assume something went wrong with your cover letter here. It would've  
been nice to see MCE spelled out or summarized for those of us that  
don't speak x86.

Cheers,
Andreas

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [Qemu-devel] [patch uq/master 0/8] port qemu-kvm's MCE support
  2010-10-05 16:31 ` [Qemu-devel] [patch uq/master 0/8] port qemu-kvm's MCE support Andreas Färber
@ 2010-10-05 18:58   ` Chris Wright
  2010-10-05 20:24     ` Marcelo Tosatti
  0 siblings, 1 reply; 93+ messages in thread
From: Chris Wright @ 2010-10-05 18:58 UTC (permalink / raw)
  To: Andreas Färber; +Cc: Marcelo Tosatti, QEMU Developers, kvm

* Andreas Färber (andreas.faerber@web.de) wrote:
> Am 04.10.2010 um 20:54 schrieb Marcelo Tosatti:
> 
> I assume something went wrong with your cover letter here. It
> would've been nice to see MCE spelled out or summarized for those of
> us that don't speak x86.

It would help.  The acronym is Machine Check Exception.  The patchset
should allow (on newer Intel x86 hw with a newer linux kernel) a class of
memory errors delivered to the host OS as MCEs to be propagated to the
guest OS.  Without the patchset, the qemu process associated with the
memory where the error took place would be killed.  With the patchset,
qemu can propagate the error into the guest and allow the guest to kill
only the process within the guest that is associated with the memory error.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 5/8] Export qemu_ram_addr_from_host
  2010-10-05 12:57     ` [Qemu-devel] " Anthony Liguori
@ 2010-10-05 20:13       ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-05 20:13 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm, qemu-devel, Huang Ying, Dean Nelson

On Tue, Oct 05, 2010 at 07:57:14AM -0500, Anthony Liguori wrote:
> On 10/04/2010 01:54 PM, Marcelo Tosatti wrote:
> >To be used by next patches.
> >
> >Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
> >
> >Index: qemu/cpu-common.h
> >===================================================================
> >--- qemu.orig/cpu-common.h
> >+++ qemu/cpu-common.h
> >@@ -47,6 +47,7 @@ void qemu_ram_free(ram_addr_t addr);
> >  /* This should only be used for ram local to a device.  */
> >  void *qemu_get_ram_ptr(ram_addr_t addr);
> >  /* This should not be used by devices.  */
> >+int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
> 
> This is not a great name for a function.  A better way to do this
> would be to make the existing qemu_ram_addr_from_host() ->
> qemu_ram_addr_from_host_nofail().

It should fail for all callers in tree now, where address from
qemu_get_ram_ptr() is saved somewhere. MCE handler is an exception to
that.

Are you OK with this:


Index: qemu/cpu-common.h
===================================================================
--- qemu.orig/cpu-common.h
+++ qemu/cpu-common.h
@@ -47,6 +47,7 @@ void qemu_ram_free(ram_addr_t addr);
 /* This should only be used for ram local to a device.  */
 void *qemu_get_ram_ptr(ram_addr_t addr);
 /* This should not be used by devices.  */
+int qemu_ram_addr_from_host_nofail(void *ptr, ram_addr_t *ram_addr);
 ram_addr_t qemu_ram_addr_from_host(void *ptr);
 
 int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -2938,23 +2938,31 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
     return NULL;
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr)
+int qemu_ram_addr_from_host_nofail(void *ptr, ram_addr_t *ram_addr)
 {
     RAMBlock *block;
     uint8_t *host = ptr;
 
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (host - block->host < block->length) {
-            return block->offset + (host - block->host);
+            *ram_addr = block->offset + (host - block->host);
+            return 0;
         }
     }
+    return -1;
+}
 
-    fprintf(stderr, "Bad ram pointer %p\n", ptr);
-    abort();
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+ram_addr_t qemu_ram_addr_from_host(void *ptr)
+{
+    ram_addr_t ram_addr;
 
-    return 0;
+    if (qemu_ram_addr_from_host_nofail(ptr, &ram_addr)) {
+        fprintf(stderr, "Bad ram pointer %p\n", ptr);
+        abort();
+    }
+    return ram_addr;
 }
 
 static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 5/8] Export qemu_ram_addr_from_host
@ 2010-10-05 20:13       ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-05 20:13 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Dean Nelson, qemu-devel, kvm, Huang Ying

On Tue, Oct 05, 2010 at 07:57:14AM -0500, Anthony Liguori wrote:
> On 10/04/2010 01:54 PM, Marcelo Tosatti wrote:
> >To be used by next patches.
> >
> >Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
> >
> >Index: qemu/cpu-common.h
> >===================================================================
> >--- qemu.orig/cpu-common.h
> >+++ qemu/cpu-common.h
> >@@ -47,6 +47,7 @@ void qemu_ram_free(ram_addr_t addr);
> >  /* This should only be used for ram local to a device.  */
> >  void *qemu_get_ram_ptr(ram_addr_t addr);
> >  /* This should not be used by devices.  */
> >+int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
> 
> This is not a great name for a function.  A better way to do this
> would be to make the existing qemu_ram_addr_from_host() ->
> qemu_ram_addr_from_host_nofail().

It should fail for all callers in tree now, where address from
qemu_get_ram_ptr() is saved somewhere. MCE handler is an exception to
that.

Are you OK with this:


Index: qemu/cpu-common.h
===================================================================
--- qemu.orig/cpu-common.h
+++ qemu/cpu-common.h
@@ -47,6 +47,7 @@ void qemu_ram_free(ram_addr_t addr);
 /* This should only be used for ram local to a device.  */
 void *qemu_get_ram_ptr(ram_addr_t addr);
 /* This should not be used by devices.  */
+int qemu_ram_addr_from_host_nofail(void *ptr, ram_addr_t *ram_addr);
 ram_addr_t qemu_ram_addr_from_host(void *ptr);
 
 int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -2938,23 +2938,31 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
     return NULL;
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr)
+int qemu_ram_addr_from_host_nofail(void *ptr, ram_addr_t *ram_addr)
 {
     RAMBlock *block;
     uint8_t *host = ptr;
 
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (host - block->host < block->length) {
-            return block->offset + (host - block->host);
+            *ram_addr = block->offset + (host - block->host);
+            return 0;
         }
     }
+    return -1;
+}
 
-    fprintf(stderr, "Bad ram pointer %p\n", ptr);
-    abort();
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+ram_addr_t qemu_ram_addr_from_host(void *ptr)
+{
+    ram_addr_t ram_addr;
 
-    return 0;
+    if (qemu_ram_addr_from_host_nofail(ptr, &ram_addr)) {
+        fprintf(stderr, "Bad ram pointer %p\n", ptr);
+        abort();
+    }
+    return ram_addr;
 }
 
 static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [Qemu-devel] [patch uq/master 0/8] port qemu-kvm's MCE support
  2010-10-05 18:58   ` Chris Wright
@ 2010-10-05 20:24     ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-05 20:24 UTC (permalink / raw)
  To: Chris Wright; +Cc: Andreas Färber, QEMU Developers, kvm

On Tue, Oct 05, 2010 at 11:58:13AM -0700, Chris Wright wrote:
> * Andreas Färber (andreas.faerber@web.de) wrote:
> > Am 04.10.2010 um 20:54 schrieb Marcelo Tosatti:
> > 
> > I assume something went wrong with your cover letter here. It
> > would've been nice to see MCE spelled out or summarized for those of
> > us that don't speak x86.

Sorry about that. Will improve on next submission.

> It would help.  The acronym is Machine Check Exception.  The patchset
> should allow (on newer Intel x86 hw with a newer linux kernel) a class of
> memory errors delivered to the host OS as MCEs to be propagated to the
> guest OS.  Without the patchset, the qemu process associated with the
> memory where the error took place would be killed.  With the patchset,
> qemu can propagate the error into the guest and allow the guest to kill
> only the process within the guest that is associated with the memory error.
> --

Thanks Chris.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 5/8] Export qemu_ram_addr_from_host
  2010-10-05 20:13       ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-05 20:48         ` Anthony Liguori
  -1 siblings, 0 replies; 93+ messages in thread
From: Anthony Liguori @ 2010-10-05 20:48 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, qemu-devel, Huang Ying, Dean Nelson

On 10/05/2010 03:13 PM, Marcelo Tosatti wrote:
> On Tue, Oct 05, 2010 at 07:57:14AM -0500, Anthony Liguori wrote:
>    
>> On 10/04/2010 01:54 PM, Marcelo Tosatti wrote:
>>      
>>> To be used by next patches.
>>>
>>> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>>>
>>> Index: qemu/cpu-common.h
>>> ===================================================================
>>> --- qemu.orig/cpu-common.h
>>> +++ qemu/cpu-common.h
>>> @@ -47,6 +47,7 @@ void qemu_ram_free(ram_addr_t addr);
>>>   /* This should only be used for ram local to a device.  */
>>>   void *qemu_get_ram_ptr(ram_addr_t addr);
>>>   /* This should not be used by devices.  */
>>> +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
>>>        
>> This is not a great name for a function.  A better way to do this
>> would be to make the existing qemu_ram_addr_from_host() ->
>> qemu_ram_addr_from_host_nofail().
>>      
> It should fail for all callers in tree now, where address from
> qemu_get_ram_ptr() is saved somewhere. MCE handler is an exception to
> that.
>
> Are you OK with this:
>    

I meant the inverse of naming.  nofail means something can never fail 
(because if it does, it aborts).  That happens to be the way we 
currently use that naming convention.

An example is qdev_init() vs. qdev_init_nofail().

Regards,

Anthony Liguori

> Index: qemu/cpu-common.h
> ===================================================================
> --- qemu.orig/cpu-common.h
> +++ qemu/cpu-common.h
> @@ -47,6 +47,7 @@ void qemu_ram_free(ram_addr_t addr);
>   /* This should only be used for ram local to a device.  */
>   void *qemu_get_ram_ptr(ram_addr_t addr);
>   /* This should not be used by devices.  */
> +int qemu_ram_addr_from_host_nofail(void *ptr, ram_addr_t *ram_addr);
>   ram_addr_t qemu_ram_addr_from_host(void *ptr);
>
>   int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
> Index: qemu/exec.c
> ===================================================================
> --- qemu.orig/exec.c
> +++ qemu/exec.c
> @@ -2938,23 +2938,31 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
>       return NULL;
>   }
>
> -/* Some of the softmmu routines need to translate from a host pointer
> -   (typically a TLB entry) back to a ram offset.  */
> -ram_addr_t qemu_ram_addr_from_host(void *ptr)
> +int qemu_ram_addr_from_host_nofail(void *ptr, ram_addr_t *ram_addr)
>   {
>       RAMBlock *block;
>       uint8_t *host = ptr;
>
>       QLIST_FOREACH(block,&ram_list.blocks, next) {
>           if (host - block->host<  block->length) {
> -            return block->offset + (host - block->host);
> +            *ram_addr = block->offset + (host - block->host);
> +            return 0;
>           }
>       }
> +    return -1;
> +}
>
> -    fprintf(stderr, "Bad ram pointer %p\n", ptr);
> -    abort();
> +/* Some of the softmmu routines need to translate from a host pointer
> +   (typically a TLB entry) back to a ram offset.  */
> +ram_addr_t qemu_ram_addr_from_host(void *ptr)
> +{
> +    ram_addr_t ram_addr;
>
> -    return 0;
> +    if (qemu_ram_addr_from_host_nofail(ptr,&ram_addr)) {
> +        fprintf(stderr, "Bad ram pointer %p\n", ptr);
> +        abort();
> +    }
> +    return ram_addr;
>   }
>
>   static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)
>    


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 5/8] Export qemu_ram_addr_from_host
@ 2010-10-05 20:48         ` Anthony Liguori
  0 siblings, 0 replies; 93+ messages in thread
From: Anthony Liguori @ 2010-10-05 20:48 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Dean Nelson, qemu-devel, kvm, Huang Ying

On 10/05/2010 03:13 PM, Marcelo Tosatti wrote:
> On Tue, Oct 05, 2010 at 07:57:14AM -0500, Anthony Liguori wrote:
>    
>> On 10/04/2010 01:54 PM, Marcelo Tosatti wrote:
>>      
>>> To be used by next patches.
>>>
>>> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>>>
>>> Index: qemu/cpu-common.h
>>> ===================================================================
>>> --- qemu.orig/cpu-common.h
>>> +++ qemu/cpu-common.h
>>> @@ -47,6 +47,7 @@ void qemu_ram_free(ram_addr_t addr);
>>>   /* This should only be used for ram local to a device.  */
>>>   void *qemu_get_ram_ptr(ram_addr_t addr);
>>>   /* This should not be used by devices.  */
>>> +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
>>>        
>> This is not a great name for a function.  A better way to do this
>> would be to make the existing qemu_ram_addr_from_host() ->
>> qemu_ram_addr_from_host_nofail().
>>      
> It should fail for all callers in tree now, where address from
> qemu_get_ram_ptr() is saved somewhere. MCE handler is an exception to
> that.
>
> Are you OK with this:
>    

I meant the inverse of naming.  nofail means something can never fail 
(because if it does, it aborts).  That happens to be the way we 
currently use that naming convention.

An example is qdev_init() vs. qdev_init_nofail().

Regards,

Anthony Liguori

> Index: qemu/cpu-common.h
> ===================================================================
> --- qemu.orig/cpu-common.h
> +++ qemu/cpu-common.h
> @@ -47,6 +47,7 @@ void qemu_ram_free(ram_addr_t addr);
>   /* This should only be used for ram local to a device.  */
>   void *qemu_get_ram_ptr(ram_addr_t addr);
>   /* This should not be used by devices.  */
> +int qemu_ram_addr_from_host_nofail(void *ptr, ram_addr_t *ram_addr);
>   ram_addr_t qemu_ram_addr_from_host(void *ptr);
>
>   int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
> Index: qemu/exec.c
> ===================================================================
> --- qemu.orig/exec.c
> +++ qemu/exec.c
> @@ -2938,23 +2938,31 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
>       return NULL;
>   }
>
> -/* Some of the softmmu routines need to translate from a host pointer
> -   (typically a TLB entry) back to a ram offset.  */
> -ram_addr_t qemu_ram_addr_from_host(void *ptr)
> +int qemu_ram_addr_from_host_nofail(void *ptr, ram_addr_t *ram_addr)
>   {
>       RAMBlock *block;
>       uint8_t *host = ptr;
>
>       QLIST_FOREACH(block,&ram_list.blocks, next) {
>           if (host - block->host<  block->length) {
> -            return block->offset + (host - block->host);
> +            *ram_addr = block->offset + (host - block->host);
> +            return 0;
>           }
>       }
> +    return -1;
> +}
>
> -    fprintf(stderr, "Bad ram pointer %p\n", ptr);
> -    abort();
> +/* Some of the softmmu routines need to translate from a host pointer
> +   (typically a TLB entry) back to a ram offset.  */
> +ram_addr_t qemu_ram_addr_from_host(void *ptr)
> +{
> +    ram_addr_t ram_addr;
>
> -    return 0;
> +    if (qemu_ram_addr_from_host_nofail(ptr,&ram_addr)) {
> +        fprintf(stderr, "Bad ram pointer %p\n", ptr);
> +        abort();
> +    }
> +    return ram_addr;
>   }
>
>   static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)
>    

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-04 18:54   ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06  1:10     ` Hidetoshi Seto
  -1 siblings, 0 replies; 93+ messages in thread
From: Hidetoshi Seto @ 2010-10-06  1:10 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, qemu-devel, Huang Ying, Dean Nelson

(2010/10/05 3:54), Marcelo Tosatti wrote:
> Port qemu-kvm's
> 
> commit 4b62fff1101a7ad77553147717a8bd3bf79df7ef
> Author: Huang Ying <ying.huang@intel.com>
> Date:   Mon Sep 21 10:43:25 2009 +0800
> 
>     MCE: Relay UCR MCE to guest
>     
>     UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
>     where some hardware error such as some memory error can be reported
>     without PCC (processor context corrupted). To recover from such MCE,
>     the corresponding memory will be unmapped, and all processes accessing
>     the memory will be killed via SIGBUS.
>     
>     For KVM, if QEMU/KVM is killed, all guest processes will be killed
>     too. So we relay SIGBUS from host OS to guest system via a UCR MCE
>     injection. Then guest OS can isolate corresponding memory and kill
>     necessary guest processes only. SIGBUS sent to main thread (not VCPU
>     threads) will be broadcast to all VCPU threads as UCR MCE.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 

(snip)

> +static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
> +                           void *ctx)
> +{
> +#if defined(TARGET_I386)
> +    if (kvm_on_sigbus_vcpu(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
> +#endif
> +        sigbus_reraise();
> +}
> +
>  static void qemu_kvm_eat_signal(CPUState *env, int timeout)
>  {
>      struct timespec ts;
>      int r, e;
>      siginfo_t siginfo;
>      sigset_t waitset;
> +    sigset_t chkset;
>  
>      ts.tv_sec = timeout / 1000;
>      ts.tv_nsec = (timeout % 1000) * 1000000;
>  
>      sigemptyset(&waitset);
>      sigaddset(&waitset, SIG_IPI);
> +    sigaddset(&waitset, SIGBUS);
>  
> -    qemu_mutex_unlock(&qemu_global_mutex);
> -    r = sigtimedwait(&waitset, &siginfo, &ts);
> -    e = errno;
> -    qemu_mutex_lock(&qemu_global_mutex);
> +    do {
> +        qemu_mutex_unlock(&qemu_global_mutex);
>  
> -    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
> -        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
> -        exit(1);
> -    }
> +        r = sigtimedwait(&waitset, &siginfo, &ts);
> +        e = errno;
> +
> +        qemu_mutex_lock(&qemu_global_mutex);
> +
> +        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
> +            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
> +            exit(1);
> +        }
> +
> +        switch (r) {
> +        case SIGBUS:
> +#ifdef TARGET_I386
> +            if (kvm_on_sigbus(env, siginfo.si_code, siginfo.si_addr))
> +#endif
> +                sigbus_reraise();
> +            break;
> +        default:
> +            break;
> +        }
> +
> +        r = sigpending(&chkset);
> +        if (r == -1) {
> +            fprintf(stderr, "sigpending: %s\n", strerror(e));
> +            exit(1);
> +        }
> +    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
>  }
>  
>  static void qemu_kvm_wait_io_event(CPUState *env)

(snip)

> Index: qemu/kvm.h
> ===================================================================
> --- qemu.orig/kvm.h
> +++ qemu/kvm.h
> @@ -110,6 +110,9 @@ int kvm_arch_init_vcpu(CPUState *env);
>  
>  void kvm_arch_reset_vcpu(CPUState *env);
>  
> +int kvm_on_sigbus(CPUState *env, int code, void *addr);
> +int kvm_on_sigbus_vcpu(int code, void *addr);
> +
>  struct kvm_guest_debug;
>  struct kvm_debug_exit_arch;
>  

So kvm_on_sigbus() is called from qemu_kvm_eat_signal(), which runs on
the vcpu thread, while kvm_on_sigbus_vcpu() is called via sigbus_handler(),
which is invoked on the iothread using signalfd.

... Inverse naming?


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-06  1:10     ` Hidetoshi Seto
  0 siblings, 0 replies; 93+ messages in thread
From: Hidetoshi Seto @ 2010-10-06  1:10 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Dean Nelson, qemu-devel, kvm, Huang Ying

(2010/10/05 3:54), Marcelo Tosatti wrote:
> Port qemu-kvm's
> 
> commit 4b62fff1101a7ad77553147717a8bd3bf79df7ef
> Author: Huang Ying <ying.huang@intel.com>
> Date:   Mon Sep 21 10:43:25 2009 +0800
> 
>     MCE: Relay UCR MCE to guest
>     
>     UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
>     where some hardware error such as some memory error can be reported
>     without PCC (processor context corrupted). To recover from such MCE,
>     the corresponding memory will be unmapped, and all processes accessing
>     the memory will be killed via SIGBUS.
>     
>     For KVM, if QEMU/KVM is killed, all guest processes will be killed
>     too. So we relay SIGBUS from host OS to guest system via a UCR MCE
>     injection. Then guest OS can isolate corresponding memory and kill
>     necessary guest processes only. SIGBUS sent to main thread (not VCPU
>     threads) will be broadcast to all VCPU threads as UCR MCE.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 

(snip)

> +static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
> +                           void *ctx)
> +{
> +#if defined(TARGET_I386)
> +    if (kvm_on_sigbus_vcpu(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
> +#endif
> +        sigbus_reraise();
> +}
> +
>  static void qemu_kvm_eat_signal(CPUState *env, int timeout)
>  {
>      struct timespec ts;
>      int r, e;
>      siginfo_t siginfo;
>      sigset_t waitset;
> +    sigset_t chkset;
>  
>      ts.tv_sec = timeout / 1000;
>      ts.tv_nsec = (timeout % 1000) * 1000000;
>  
>      sigemptyset(&waitset);
>      sigaddset(&waitset, SIG_IPI);
> +    sigaddset(&waitset, SIGBUS);
>  
> -    qemu_mutex_unlock(&qemu_global_mutex);
> -    r = sigtimedwait(&waitset, &siginfo, &ts);
> -    e = errno;
> -    qemu_mutex_lock(&qemu_global_mutex);
> +    do {
> +        qemu_mutex_unlock(&qemu_global_mutex);
>  
> -    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
> -        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
> -        exit(1);
> -    }
> +        r = sigtimedwait(&waitset, &siginfo, &ts);
> +        e = errno;
> +
> +        qemu_mutex_lock(&qemu_global_mutex);
> +
> +        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
> +            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
> +            exit(1);
> +        }
> +
> +        switch (r) {
> +        case SIGBUS:
> +#ifdef TARGET_I386
> +            if (kvm_on_sigbus(env, siginfo.si_code, siginfo.si_addr))
> +#endif
> +                sigbus_reraise();
> +            break;
> +        default:
> +            break;
> +        }
> +
> +        r = sigpending(&chkset);
> +        if (r == -1) {
> +            fprintf(stderr, "sigpending: %s\n", strerror(e));
> +            exit(1);
> +        }
> +    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
>  }
>  
>  static void qemu_kvm_wait_io_event(CPUState *env)

(snip)

> Index: qemu/kvm.h
> ===================================================================
> --- qemu.orig/kvm.h
> +++ qemu/kvm.h
> @@ -110,6 +110,9 @@ int kvm_arch_init_vcpu(CPUState *env);
>  
>  void kvm_arch_reset_vcpu(CPUState *env);
>  
> +int kvm_on_sigbus(CPUState *env, int code, void *addr);
> +int kvm_on_sigbus_vcpu(int code, void *addr);
> +
>  struct kvm_guest_debug;
>  struct kvm_debug_exit_arch;
>  

So kvm_on_sigbus() is called from qemu_kvm_eat_signal(), which runs on
the vcpu thread, while kvm_on_sigbus_vcpu() is called via sigbus_handler(),
which is invoked on the iothread using signalfd.

... Inverse naming?


Thanks,
H.Seto

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-04 18:54   ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06  1:58     ` Hidetoshi Seto
  -1 siblings, 0 replies; 93+ messages in thread
From: Hidetoshi Seto @ 2010-10-06  1:58 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, qemu-devel, Huang Ying, Dean Nelson

I have some more questions:

(2010/10/05 3:54), Marcelo Tosatti wrote:
> Index: qemu/target-i386/cpu.h
> ===================================================================
> --- qemu.orig/target-i386/cpu.h
> +++ qemu/target-i386/cpu.h
> @@ -250,16 +250,32 @@
>  #define PG_ERROR_RSVD_MASK 0x08
>  #define PG_ERROR_I_D_MASK  0x10
>  
> -#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
> +#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
> +#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
>  
> -#define MCE_CAP_DEF	MCG_CTL_P
> +#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
>  #define MCE_BANKS_DEF	10
>  

It seems that current kvm doesn't support SER_P, so injecting SRAO
into the guest will mean that the guest receives a VAL|UC|!PCC and RIPV
event from a virtual processor that doesn't have SER_P.

I think most OSes don't expect to receive an MCE with !PCC
on a traditional x86 processor without SER_P.

Q1: Is it safe to expect that guests can handle such a !PCC event?
Q2: What is the expected behavior on the guest?
Q3: What happens if the guest reboots itself in response to the MCE?

Thanks,
H.Seto

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-06  1:58     ` Hidetoshi Seto
  0 siblings, 0 replies; 93+ messages in thread
From: Hidetoshi Seto @ 2010-10-06  1:58 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Dean Nelson, qemu-devel, kvm, Huang Ying

I have some more questions:

(2010/10/05 3:54), Marcelo Tosatti wrote:
> Index: qemu/target-i386/cpu.h
> ===================================================================
> --- qemu.orig/target-i386/cpu.h
> +++ qemu/target-i386/cpu.h
> @@ -250,16 +250,32 @@
>  #define PG_ERROR_RSVD_MASK 0x08
>  #define PG_ERROR_I_D_MASK  0x10
>  
> -#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
> +#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
> +#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
>  
> -#define MCE_CAP_DEF	MCG_CTL_P
> +#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
>  #define MCE_BANKS_DEF	10
>  

It seems that current kvm doesn't support SER_P, so injecting SRAO
into the guest will mean that the guest receives a VAL|UC|!PCC and RIPV
event from a virtual processor that doesn't have SER_P.

I think most OSes don't expect to receive an MCE with !PCC
on a traditional x86 processor without SER_P.

Q1: Is it safe to expect that guests can handle such a !PCC event?
Q2: What is the expected behavior on the guest?
Q3: What happens if the guest reboots itself in response to the MCE?

Thanks,
H.Seto

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-06  1:10     ` [Qemu-devel] " Hidetoshi Seto
@ 2010-10-06 16:02       ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 16:02 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: kvm, qemu-devel, Huang Ying, Dean Nelson

On Wed, Oct 06, 2010 at 10:10:51AM +0900, Hidetoshi Seto wrote:
> 
> (snip)
> 
> > Index: qemu/kvm.h
> > ===================================================================
> > --- qemu.orig/kvm.h
> > +++ qemu/kvm.h
> > @@ -110,6 +110,9 @@ int kvm_arch_init_vcpu(CPUState *env);
> >  
> >  void kvm_arch_reset_vcpu(CPUState *env);
> >  
> > +int kvm_on_sigbus(CPUState *env, int code, void *addr);
> > +int kvm_on_sigbus_vcpu(int code, void *addr);
> > +
> >  struct kvm_guest_debug;
> >  struct kvm_debug_exit_arch;
> >  
> 
> So kvm_on_sigbus() is called from qemu_kvm_eat_signal() that is
> called on vcpu thread, while kvm_on_sigbus_vcpu() is called via
> sigbus_handler that invoked on iothread using signalfd.
> 
> ... Inverse naming?

Yes, fixed.


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-06 16:02       ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 16:02 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Dean Nelson, qemu-devel, kvm, Huang Ying

On Wed, Oct 06, 2010 at 10:10:51AM +0900, Hidetoshi Seto wrote:
> 
> (snip)
> 
> > Index: qemu/kvm.h
> > ===================================================================
> > --- qemu.orig/kvm.h
> > +++ qemu/kvm.h
> > @@ -110,6 +110,9 @@ int kvm_arch_init_vcpu(CPUState *env);
> >  
> >  void kvm_arch_reset_vcpu(CPUState *env);
> >  
> > +int kvm_on_sigbus(CPUState *env, int code, void *addr);
> > +int kvm_on_sigbus_vcpu(int code, void *addr);
> > +
> >  struct kvm_guest_debug;
> >  struct kvm_debug_exit_arch;
> >  
> 
> So kvm_on_sigbus() is called from qemu_kvm_eat_signal() that is
> called on vcpu thread, while kvm_on_sigbus_vcpu() is called via
> sigbus_handler that invoked on iothread using signalfd.
> 
> ... Inverse naming?

Yes, fixed.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-06  1:58     ` [Qemu-devel] " Hidetoshi Seto
@ 2010-10-06 16:05       ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 16:05 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: kvm, qemu-devel, Huang Ying, Dean Nelson

On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
> I got some more question:
> 
> (2010/10/05 3:54), Marcelo Tosatti wrote:
> > Index: qemu/target-i386/cpu.h
> > ===================================================================
> > --- qemu.orig/target-i386/cpu.h
> > +++ qemu/target-i386/cpu.h
> > @@ -250,16 +250,32 @@
> >  #define PG_ERROR_RSVD_MASK 0x08
> >  #define PG_ERROR_I_D_MASK  0x10
> >  
> > -#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
> > +#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
> > +#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
> >  
> > -#define MCE_CAP_DEF	MCG_CTL_P
> > +#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
> >  #define MCE_BANKS_DEF	10
> >  
> 
> It seems that current kvm doesn't support SER_P, so injecting SRAO
> to guest will mean that guest receives VAL|UC|!PCC and RIPV event
> from virtual processor that doesn't have SER_P.

Dean also noted this. I don't think it was a deliberate choice to not
expose SER_P. Huang?

> I think most OSes don't expect that it can receives MCE with !PCC
> on traditional x86 processor without SER_P.
> 
> Q1: Is it safe to expect that guests can handle such !PCC event?
> Q2: What is the expected behavior on the guest?
> Q3: What happen if guest reboots itself in response to the MCE?
> 
> 
> Thanks,
> H.Seto

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-06 16:05       ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 16:05 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Dean Nelson, qemu-devel, kvm, Huang Ying

On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
> I got some more question:
> 
> (2010/10/05 3:54), Marcelo Tosatti wrote:
> > Index: qemu/target-i386/cpu.h
> > ===================================================================
> > --- qemu.orig/target-i386/cpu.h
> > +++ qemu/target-i386/cpu.h
> > @@ -250,16 +250,32 @@
> >  #define PG_ERROR_RSVD_MASK 0x08
> >  #define PG_ERROR_I_D_MASK  0x10
> >  
> > -#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
> > +#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
> > +#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
> >  
> > -#define MCE_CAP_DEF	MCG_CTL_P
> > +#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
> >  #define MCE_BANKS_DEF	10
> >  
> 
> It seems that current kvm doesn't support SER_P, so injecting SRAO
> to guest will mean that guest receives VAL|UC|!PCC and RIPV event
> from virtual processor that doesn't have SER_P.

Dean also noted this. I don't think it was a deliberate choice to not
expose SER_P. Huang?

> I think most OSes don't expect that it can receives MCE with !PCC
> on traditional x86 processor without SER_P.
> 
> Q1: Is it safe to expect that guests can handle such !PCC event?
> Q2: What is the expected behavior on the guest?
> Q3: What happen if guest reboots itself in response to the MCE?
> 
> 
> Thanks,
> H.Seto

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 0/8] port qemu-kvm's MCE support (v2)
  2010-10-04 18:54 ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06 17:34   ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson

Port qemu-kvm's KVM MCE (Machine Check Exception) handling to qemu. It
allows qemu to propagate MCEs to the guest.

v2:
- rename do_qemu_ram_addr_from_host.
- fix kvm_on_sigbus/kvm_on_sigbus_vcpu naming.
- fix bank register restoration (Dean Nelson).




^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 0/8] port qemu-kvm's MCE support (v2)
@ 2010-10-06 17:34   ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Huang Ying

Port qemu-kvm's KVM MCE (Machine Check Exception) handling to qemu. It
allows qemu to propagate MCEs to the guest.

v2:
- rename do_qemu_ram_addr_from_host.
- fix kvm_on_sigbus/kvm_on_sigbus_vcpu naming.
- fix bank register restoration (Dean Nelson).

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 1/8] signalfd compatibility
  2010-10-06 17:34   ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06 17:34     ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: signalfd --]
[-- Type: text/plain, Size: 5970 bytes --]

Port qemu-kvm's signalfd compat code.

commit 5a7fdd0abd7cd24dac205317a4195446ab8748b5
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Wed May 7 11:55:47 2008 -0500

    Use signalfd() in io-thread
    
    This patch reworks the IO thread to use signalfd() instead of sigtimedwait()
    This will eliminate the need to use SIGIO everywhere.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/compatfd.c
===================================================================
--- /dev/null
+++ qemu/compatfd.c
@@ -0,0 +1,117 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "compatfd.h"
+
+#include <sys/syscall.h>
+#include <pthread.h>
+
+struct sigfd_compat_info
+{
+    sigset_t mask;
+    int fd;
+};
+
+static void *sigwait_compat(void *opaque)
+{
+    struct sigfd_compat_info *info = opaque;
+    int err;
+    sigset_t all;
+
+    sigfillset(&all);
+    sigprocmask(SIG_BLOCK, &all, NULL);
+
+    do {
+        siginfo_t siginfo;
+
+        err = sigwaitinfo(&info->mask, &siginfo);
+        if (err == -1 && errno == EINTR) {
+            err = 0;
+            continue;
+        }
+
+        if (err > 0) {
+            char buffer[128];
+            size_t offset = 0;
+
+            memcpy(buffer, &err, sizeof(err));
+            while (offset < sizeof(buffer)) {
+                ssize_t len;
+
+                len = write(info->fd, buffer + offset,
+                            sizeof(buffer) - offset);
+                if (len == -1 && errno == EINTR)
+                    continue;
+
+                if (len <= 0) {
+                    err = -1;
+                    break;
+                }
+
+                offset += len;
+            }
+        }
+    } while (err >= 0);
+
+    return NULL;
+}
+
+static int qemu_signalfd_compat(const sigset_t *mask)
+{
+    pthread_attr_t attr;
+    pthread_t tid;
+    struct sigfd_compat_info *info;
+    int fds[2];
+
+    info = malloc(sizeof(*info));
+    if (info == NULL) {
+        errno = ENOMEM;
+        return -1;
+    }
+
+    if (pipe(fds) == -1) {
+        free(info);
+        return -1;
+    }
+
+    qemu_set_cloexec(fds[0]);
+    qemu_set_cloexec(fds[1]);
+
+    memcpy(&info->mask, mask, sizeof(*mask));
+    info->fd = fds[1];
+
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+    pthread_create(&tid, &attr, sigwait_compat, info);
+
+    pthread_attr_destroy(&attr);
+
+    return fds[0];
+}
+
+int qemu_signalfd(const sigset_t *mask)
+{
+#if defined(CONFIG_SIGNALFD)
+    int ret;
+
+    ret = syscall(SYS_signalfd, -1, mask, _NSIG / 8);
+    if (ret != -1) {
+        qemu_set_cloexec(ret);
+        return ret;
+    }
+#endif
+
+    return qemu_signalfd_compat(mask);
+}
Index: qemu/compatfd.h
===================================================================
--- /dev/null
+++ qemu/compatfd.h
@@ -0,0 +1,43 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_COMPATFD_H
+#define QEMU_COMPATFD_H
+
+#include <signal.h>
+
+struct qemu_signalfd_siginfo {
+    uint32_t ssi_signo;   /* Signal number */
+    int32_t  ssi_errno;   /* Error number (unused) */
+    int32_t  ssi_code;    /* Signal code */
+    uint32_t ssi_pid;     /* PID of sender */
+    uint32_t ssi_uid;     /* Real UID of sender */
+    int32_t  ssi_fd;      /* File descriptor (SIGIO) */
+    uint32_t ssi_tid;     /* Kernel timer ID (POSIX timers) */
+    uint32_t ssi_band;    /* Band event (SIGIO) */
+    uint32_t ssi_overrun; /* POSIX timer overrun count */
+    uint32_t ssi_trapno;  /* Trap number that caused signal */
+    int32_t  ssi_status;  /* Exit status or signal (SIGCHLD) */
+    int32_t  ssi_int;     /* Integer sent by sigqueue(2) */
+    uint64_t ssi_ptr;     /* Pointer sent by sigqueue(2) */
+    uint64_t ssi_utime;   /* User CPU time consumed (SIGCHLD) */
+    uint64_t ssi_stime;   /* System CPU time consumed (SIGCHLD) */
+    uint64_t ssi_addr;    /* Address that generated signal
+                             (for hardware-generated signals) */
+    uint8_t  pad[48];     /* Pad size to 128 bytes (allow for
+                             additional fields in the future) */
+};
+
+int qemu_signalfd(const sigset_t *mask);
+
+#endif
Index: qemu/Makefile.objs
===================================================================
--- qemu.orig/Makefile.objs
+++ qemu/Makefile.objs
@@ -121,6 +121,7 @@ common-obj-y += $(addprefix ui/, $(ui-ob
 
 common-obj-y += iov.o acl.o
 common-obj-$(CONFIG_THREAD) += qemu-thread.o
+common-obj-$(CONFIG_IOTHREAD) += compatfd.o
 common-obj-y += notify.o event_notifier.o
 common-obj-y += qemu-timer.o
 
Index: qemu/configure
===================================================================
--- qemu.orig/configure
+++ qemu/configure
@@ -1936,6 +1936,21 @@ if compile_prog "" "" ; then
   splice=yes
 fi
 
+##########################################
+# signalfd probe
+signalfd="no"
+cat > $TMPC << EOF
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <signal.h>
+int main(void) { return syscall(SYS_signalfd, -1, NULL, _NSIG / 8); }
+EOF
+
+if compile_prog "" "" ; then
+  signalfd=yes
+fi
+
 # check if eventfd is supported
 eventfd=no
 cat > $TMPC << EOF
@@ -2509,6 +2524,9 @@ fi
 if test "$fdt" = "yes" ; then
   echo "CONFIG_FDT=y" >> $config_host_mak
 fi
+if test "$signalfd" = "yes" ; then
+  echo "CONFIG_SIGNALFD=y" >> $config_host_mak
+fi
 if test "$need_offsetof" = "yes" ; then
   echo "CONFIG_NEED_OFFSETOF=y" >> $config_host_mak
 fi



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 1/8] signalfd compatibility
@ 2010-10-06 17:34     ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: signalfd --]
[-- Type: text/plain, Size: 5968 bytes --]

Port qemu-kvm's signalfd compat code.

commit 5a7fdd0abd7cd24dac205317a4195446ab8748b5
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Wed May 7 11:55:47 2008 -0500

    Use signalfd() in io-thread
    
    This patch reworks the IO thread to use signalfd() instead of sigtimedwait()
    This will eliminate the need to use SIGIO everywhere.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/compatfd.c
===================================================================
--- /dev/null
+++ qemu/compatfd.c
@@ -0,0 +1,117 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "compatfd.h"
+
+#include <sys/syscall.h>
+#include <pthread.h>
+
+struct sigfd_compat_info
+{
+    sigset_t mask;
+    int fd;
+};
+
+static void *sigwait_compat(void *opaque)
+{
+    struct sigfd_compat_info *info = opaque;
+    int err;
+    sigset_t all;
+
+    sigfillset(&all);
+    sigprocmask(SIG_BLOCK, &all, NULL);
+
+    do {
+        siginfo_t siginfo;
+
+        err = sigwaitinfo(&info->mask, &siginfo);
+        if (err == -1 && errno == EINTR) {
+            err = 0;
+            continue;
+        }
+
+        if (err > 0) {
+            char buffer[128];
+            size_t offset = 0;
+
+            memcpy(buffer, &err, sizeof(err));
+            while (offset < sizeof(buffer)) {
+                ssize_t len;
+
+                len = write(info->fd, buffer + offset,
+                            sizeof(buffer) - offset);
+                if (len == -1 && errno == EINTR)
+                    continue;
+
+                if (len <= 0) {
+                    err = -1;
+                    break;
+                }
+
+                offset += len;
+            }
+        }
+    } while (err >= 0);
+
+    return NULL;
+}
+
+static int qemu_signalfd_compat(const sigset_t *mask)
+{
+    pthread_attr_t attr;
+    pthread_t tid;
+    struct sigfd_compat_info *info;
+    int fds[2];
+
+    info = malloc(sizeof(*info));
+    if (info == NULL) {
+        errno = ENOMEM;
+        return -1;
+    }
+
+    if (pipe(fds) == -1) {
+        free(info);
+        return -1;
+    }
+
+    qemu_set_cloexec(fds[0]);
+    qemu_set_cloexec(fds[1]);
+
+    memcpy(&info->mask, mask, sizeof(*mask));
+    info->fd = fds[1];
+
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+    pthread_create(&tid, &attr, sigwait_compat, info);
+
+    pthread_attr_destroy(&attr);
+
+    return fds[0];
+}
+
+int qemu_signalfd(const sigset_t *mask)
+{
+#if defined(CONFIG_SIGNALFD)
+    int ret;
+
+    ret = syscall(SYS_signalfd, -1, mask, _NSIG / 8);
+    if (ret != -1) {
+        qemu_set_cloexec(ret);
+        return ret;
+    }
+#endif
+
+    return qemu_signalfd_compat(mask);
+}
Index: qemu/compatfd.h
===================================================================
--- /dev/null
+++ qemu/compatfd.h
@@ -0,0 +1,43 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_COMPATFD_H
+#define QEMU_COMPATFD_H
+
+#include <signal.h>
+
+struct qemu_signalfd_siginfo {
+    uint32_t ssi_signo;   /* Signal number */
+    int32_t  ssi_errno;   /* Error number (unused) */
+    int32_t  ssi_code;    /* Signal code */
+    uint32_t ssi_pid;     /* PID of sender */
+    uint32_t ssi_uid;     /* Real UID of sender */
+    int32_t  ssi_fd;      /* File descriptor (SIGIO) */
+    uint32_t ssi_tid;     /* Kernel timer ID (POSIX timers) */
+    uint32_t ssi_band;    /* Band event (SIGIO) */
+    uint32_t ssi_overrun; /* POSIX timer overrun count */
+    uint32_t ssi_trapno;  /* Trap number that caused signal */
+    int32_t  ssi_status;  /* Exit status or signal (SIGCHLD) */
+    int32_t  ssi_int;     /* Integer sent by sigqueue(2) */
+    uint64_t ssi_ptr;     /* Pointer sent by sigqueue(2) */
+    uint64_t ssi_utime;   /* User CPU time consumed (SIGCHLD) */
+    uint64_t ssi_stime;   /* System CPU time consumed (SIGCHLD) */
+    uint64_t ssi_addr;    /* Address that generated signal
+                             (for hardware-generated signals) */
+    uint8_t  pad[48];     /* Pad size to 128 bytes (allow for
+                             additional fields in the future) */
+};
+
+int qemu_signalfd(const sigset_t *mask);
+
+#endif
Index: qemu/Makefile.objs
===================================================================
--- qemu.orig/Makefile.objs
+++ qemu/Makefile.objs
@@ -121,6 +121,7 @@ common-obj-y += $(addprefix ui/, $(ui-ob
 
 common-obj-y += iov.o acl.o
 common-obj-$(CONFIG_THREAD) += qemu-thread.o
+common-obj-$(CONFIG_IOTHREAD) += compatfd.o
 common-obj-y += notify.o event_notifier.o
 common-obj-y += qemu-timer.o
 
Index: qemu/configure
===================================================================
--- qemu.orig/configure
+++ qemu/configure
@@ -1936,6 +1936,21 @@ if compile_prog "" "" ; then
   splice=yes
 fi
 
+##########################################
+# signalfd probe
+signalfd="no"
+cat > $TMPC << EOF
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <signal.h>
+int main(void) { return syscall(SYS_signalfd, -1, NULL, _NSIG / 8); }
+EOF
+
+if compile_prog "" "" ; then
+  signalfd=yes
+fi
+
 # check if eventfd is supported
 eventfd=no
 cat > $TMPC << EOF
@@ -2509,6 +2524,9 @@ fi
 if test "$fdt" = "yes" ; then
   echo "CONFIG_FDT=y" >> $config_host_mak
 fi
+if test "$signalfd" = "yes" ; then
+  echo "CONFIG_SIGNALFD=y" >> $config_host_mak
+fi
 if test "$need_offsetof" = "yes" ; then
   echo "CONFIG_NEED_OFFSETOF=y" >> $config_host_mak
 fi

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 2/8] iothread: use signalfd
  2010-10-06 17:34   ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06 17:34     ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: block-io-signals-in-iothread --]
[-- Type: text/plain, Size: 3272 bytes --]

Block SIGALRM, SIGIO and consume them via signalfd.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -33,6 +33,7 @@
 #include "exec-all.h"
 
 #include "cpus.h"
+#include "compatfd.h"
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -329,14 +330,75 @@ static QemuCond qemu_work_cond;
 
 static void tcg_init_ipi(void);
 static void kvm_init_ipi(CPUState *env);
-static void unblock_io_signals(void);
+static sigset_t block_io_signals(void);
+
+/* If we have signalfd, we mask out the signals we want to handle and then
+ * use signalfd to listen for them.  We rely on whatever the current signal
+ * handler is to dispatch the signals when we receive them.
+ */
+static void sigfd_handler(void *opaque)
+{
+    int fd = (unsigned long) opaque;
+    struct qemu_signalfd_siginfo info;
+    struct sigaction action;
+    ssize_t len;
+
+    while (1) {
+        do {
+            len = read(fd, &info, sizeof(info));
+        } while (len == -1 && errno == EINTR);
+
+        if (len == -1 && errno == EAGAIN) {
+            break;
+        }
+
+        if (len != sizeof(info)) {
+            printf("read from sigfd returned %zd: %m\n", len);
+            return;
+        }
+
+        sigaction(info.ssi_signo, NULL, &action);
+        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
+            action.sa_sigaction(info.ssi_signo,
+                                (siginfo_t *)&info, NULL);
+        } else if (action.sa_handler) {
+            action.sa_handler(info.ssi_signo);
+        }
+    }
+}
+
+static int qemu_signalfd_init(sigset_t mask)
+{
+    int sigfd;
+
+    sigfd = qemu_signalfd(&mask);
+    if (sigfd == -1) {
+        fprintf(stderr, "failed to create signalfd\n");
+        return -errno;
+    }
+
+    fcntl_setfl(sigfd, O_NONBLOCK);
+
+    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
+                         (void *)(unsigned long) sigfd);
+
+    return 0;
+}
 
 int qemu_init_main_loop(void)
 {
     int ret;
+    sigset_t blocked_signals;
 
     cpu_set_debug_excp_handler(cpu_debug_handler);
 
+    blocked_signals = block_io_signals();
+
+    ret = qemu_signalfd_init(blocked_signals);
+    if (ret)
+        return ret;
+
+    /* Note eventfd must be drained before signalfd handlers run */
     ret = qemu_event_init();
     if (ret)
         return ret;
@@ -347,7 +409,6 @@ int qemu_init_main_loop(void)
     qemu_mutex_init(&qemu_global_mutex);
     qemu_mutex_lock(&qemu_global_mutex);
 
-    unblock_io_signals();
     qemu_thread_self(&io_thread);
 
     return 0;
@@ -586,19 +647,22 @@ static void kvm_init_ipi(CPUState *env)
     }
 }
 
-static void unblock_io_signals(void)
+static sigset_t block_io_signals(void)
 {
     sigset_t set;
 
+    /* SIGUSR2 used by posix-aio-compat.c */
     sigemptyset(&set);
     sigaddset(&set, SIGUSR2);
-    sigaddset(&set, SIGIO);
-    sigaddset(&set, SIGALRM);
     pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 
     sigemptyset(&set);
+    sigaddset(&set, SIGIO);
+    sigaddset(&set, SIGALRM);
     sigaddset(&set, SIG_IPI);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+    return set;
 }
 
 void qemu_mutex_lock_iothread(void)



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 2/8] iothread: use signalfd
@ 2010-10-06 17:34     ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: block-io-signals-in-iothread --]
[-- Type: text/plain, Size: 3270 bytes --]

Block SIGALRM, SIGIO and consume them via signalfd.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -33,6 +33,7 @@
 #include "exec-all.h"
 
 #include "cpus.h"
+#include "compatfd.h"
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -329,14 +330,75 @@ static QemuCond qemu_work_cond;
 
 static void tcg_init_ipi(void);
 static void kvm_init_ipi(CPUState *env);
-static void unblock_io_signals(void);
+static sigset_t block_io_signals(void);
+
+/* If we have signalfd, we mask out the signals we want to handle and then
+ * use signalfd to listen for them.  We rely on whatever the current signal
+ * handler is to dispatch the signals when we receive them.
+ */
+static void sigfd_handler(void *opaque)
+{
+    int fd = (unsigned long) opaque;
+    struct qemu_signalfd_siginfo info;
+    struct sigaction action;
+    ssize_t len;
+
+    while (1) {
+        do {
+            len = read(fd, &info, sizeof(info));
+        } while (len == -1 && errno == EINTR);
+
+        if (len == -1 && errno == EAGAIN) {
+            break;
+        }
+
+        if (len != sizeof(info)) {
+            printf("read from sigfd returned %zd: %m\n", len);
+            return;
+        }
+
+        sigaction(info.ssi_signo, NULL, &action);
+        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
+            action.sa_sigaction(info.ssi_signo,
+                                (siginfo_t *)&info, NULL);
+        } else if (action.sa_handler) {
+            action.sa_handler(info.ssi_signo);
+        }
+    }
+}
+
+static int qemu_signalfd_init(sigset_t mask)
+{
+    int sigfd;
+
+    sigfd = qemu_signalfd(&mask);
+    if (sigfd == -1) {
+        fprintf(stderr, "failed to create signalfd\n");
+        return -errno;
+    }
+
+    fcntl_setfl(sigfd, O_NONBLOCK);
+
+    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
+                         (void *)(unsigned long) sigfd);
+
+    return 0;
+}
 
 int qemu_init_main_loop(void)
 {
     int ret;
+    sigset_t blocked_signals;
 
     cpu_set_debug_excp_handler(cpu_debug_handler);
 
+    blocked_signals = block_io_signals();
+
+    ret = qemu_signalfd_init(blocked_signals);
+    if (ret)
+        return ret;
+
+    /* Note eventfd must be drained before signalfd handlers run */
     ret = qemu_event_init();
     if (ret)
         return ret;
@@ -347,7 +409,6 @@ int qemu_init_main_loop(void)
     qemu_mutex_init(&qemu_global_mutex);
     qemu_mutex_lock(&qemu_global_mutex);
 
-    unblock_io_signals();
     qemu_thread_self(&io_thread);
 
     return 0;
@@ -586,19 +647,22 @@ static void kvm_init_ipi(CPUState *env)
     }
 }
 
-static void unblock_io_signals(void)
+static sigset_t block_io_signals(void)
 {
     sigset_t set;
 
+    /* SIGUSR2 used by posix-aio-compat.c */
     sigemptyset(&set);
     sigaddset(&set, SIGUSR2);
-    sigaddset(&set, SIGIO);
-    sigaddset(&set, SIGALRM);
     pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 
     sigemptyset(&set);
+    sigaddset(&set, SIGIO);
+    sigaddset(&set, SIGALRM);
     sigaddset(&set, SIG_IPI);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+    return set;
 }
 
 void qemu_mutex_lock_iothread(void)

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 3/8] Expose thread_id in info cpus
  2010-10-06 17:34   ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06 17:34     ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: thread-id --]
[-- Type: text/plain, Size: 3812 bytes --]

commit ce6325ff1af34dbaee91c8d28e792277e43f1227
Author: Glauber Costa <gcosta@redhat.com>
Date:   Wed Mar 5 17:01:10 2008 -0300

    Augment info cpus
    
    This patch exposes the thread id associated with each
    cpu through the already well known 'info cpus' interface.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpu-defs.h
===================================================================
--- qemu.orig/cpu-defs.h
+++ qemu/cpu-defs.h
@@ -197,6 +197,7 @@ typedef struct CPUWatchpoint {
     int nr_cores;  /* number of cores within this CPU package */        \
     int nr_threads;/* number of threads within this CPU */              \
     int running; /* Nonzero if cpu is currently running(usermode).  */  \
+    int thread_id;                                                      \
     /* user data */                                                     \
     void *opaque;                                                       \
                                                                         \
Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -539,6 +539,7 @@ static void *kvm_cpu_thread_fn(void *arg
 
     qemu_mutex_lock(&qemu_global_mutex);
     qemu_thread_self(env->thread);
+    env->thread_id = get_thread_id();
     if (kvm_enabled())
         kvm_init_vcpu(env);
 
@@ -578,6 +579,10 @@ static void *tcg_cpu_thread_fn(void *arg
     while (!qemu_system_ready)
         qemu_cond_timedwait(&qemu_system_cond, &qemu_global_mutex, 100);
 
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        env->thread_id = get_thread_id();
+    }
+
     while (1) {
         cpu_exec_all();
         qemu_tcg_wait_io_event();
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -637,6 +637,7 @@ void cpu_exec_init(CPUState *env)
     env->numa_node = 0;
     QTAILQ_INIT(&env->breakpoints);
     QTAILQ_INIT(&env->watchpoints);
+    env->thread_id = get_thread_id();
     *penv = env;
 #if defined(CONFIG_USER_ONLY)
     cpu_list_unlock();
Index: qemu/osdep.c
===================================================================
--- qemu.orig/osdep.c
+++ qemu/osdep.c
@@ -44,6 +44,10 @@
 extern int madvise(caddr_t, size_t, int);
 #endif
 
+#ifdef CONFIG_LINUX
+#include <sys/syscall.h>
+#endif
+
 #ifdef CONFIG_EVENTFD
 #include <sys/eventfd.h>
 #endif
@@ -200,6 +204,17 @@ int qemu_create_pidfile(const char *file
     return 0;
 }
 
+int get_thread_id(void)
+{
+#if defined (_WIN32)
+    return GetCurrentThreadId();
+#elif defined (__linux__)
+    return syscall(SYS_gettid);
+#else
+    return getpid();
+#endif
+}
+
 #ifdef _WIN32
 
 /* mingw32 needs ffs for compilations without optimization. */
Index: qemu/osdep.h
===================================================================
--- qemu.orig/osdep.h
+++ qemu/osdep.h
@@ -126,6 +126,7 @@ void qemu_vfree(void *ptr);
 int qemu_madvise(void *addr, size_t len, int advice);
 
 int qemu_create_pidfile(const char *filename);
+int get_thread_id(void);
 
 #ifdef _WIN32
 int ffs(int i);
Index: qemu/monitor.c
===================================================================
--- qemu.orig/monitor.c
+++ qemu/monitor.c
@@ -878,6 +878,9 @@ static void print_cpu_iter(QObject *obj,
         monitor_printf(mon, " (halted)");
     }
 
+    monitor_printf(mon, " thread_id=%" PRId64 " ",
+					qdict_get_int(cpu, "thread_id"));
+
     monitor_printf(mon, "\n");
 }
 
@@ -922,6 +925,7 @@ static void do_info_cpus(Monitor *mon, Q
 #elif defined(TARGET_MIPS)
         qdict_put(cpu, "PC", qint_from_int(env->active_tc.PC));
 #endif
+        qdict_put(cpu, "thread_id", qint_from_int(env->thread_id));
 
         qlist_append(cpu_list, cpu);
     }



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 3/8] Expose thread_id in info cpus
@ 2010-10-06 17:34     ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: thread-id --]
[-- Type: text/plain, Size: 3810 bytes --]

commit ce6325ff1af34dbaee91c8d28e792277e43f1227
Author: Glauber Costa <gcosta@redhat.com>
Date:   Wed Mar 5 17:01:10 2008 -0300

    Augment info cpus
    
    This patch exposes the thread id associated with each
    cpu through the already well known 'info cpus' interface.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpu-defs.h
===================================================================
--- qemu.orig/cpu-defs.h
+++ qemu/cpu-defs.h
@@ -197,6 +197,7 @@ typedef struct CPUWatchpoint {
     int nr_cores;  /* number of cores within this CPU package */        \
     int nr_threads;/* number of threads within this CPU */              \
     int running; /* Nonzero if cpu is currently running(usermode).  */  \
+    int thread_id;                                                      \
     /* user data */                                                     \
     void *opaque;                                                       \
                                                                         \
Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -539,6 +539,7 @@ static void *kvm_cpu_thread_fn(void *arg
 
     qemu_mutex_lock(&qemu_global_mutex);
     qemu_thread_self(env->thread);
+    env->thread_id = get_thread_id();
     if (kvm_enabled())
         kvm_init_vcpu(env);
 
@@ -578,6 +579,10 @@ static void *tcg_cpu_thread_fn(void *arg
     while (!qemu_system_ready)
         qemu_cond_timedwait(&qemu_system_cond, &qemu_global_mutex, 100);
 
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        env->thread_id = get_thread_id();
+    }
+
     while (1) {
         cpu_exec_all();
         qemu_tcg_wait_io_event();
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -637,6 +637,7 @@ void cpu_exec_init(CPUState *env)
     env->numa_node = 0;
     QTAILQ_INIT(&env->breakpoints);
     QTAILQ_INIT(&env->watchpoints);
+    env->thread_id = get_thread_id();
     *penv = env;
 #if defined(CONFIG_USER_ONLY)
     cpu_list_unlock();
Index: qemu/osdep.c
===================================================================
--- qemu.orig/osdep.c
+++ qemu/osdep.c
@@ -44,6 +44,10 @@
 extern int madvise(caddr_t, size_t, int);
 #endif
 
+#ifdef CONFIG_LINUX
+#include <sys/syscall.h>
+#endif
+
 #ifdef CONFIG_EVENTFD
 #include <sys/eventfd.h>
 #endif
@@ -200,6 +204,17 @@ int qemu_create_pidfile(const char *file
     return 0;
 }
 
+int get_thread_id(void)
+{
+#if defined (_WIN32)
+    return GetCurrentThreadId();
+#elif defined (__linux__)
+    return syscall(SYS_gettid);
+#else
+    return getpid();
+#endif
+}
+
 #ifdef _WIN32
 
 /* mingw32 needs ffs for compilations without optimization. */
Index: qemu/osdep.h
===================================================================
--- qemu.orig/osdep.h
+++ qemu/osdep.h
@@ -126,6 +126,7 @@ void qemu_vfree(void *ptr);
 int qemu_madvise(void *addr, size_t len, int advice);
 
 int qemu_create_pidfile(const char *filename);
+int get_thread_id(void);
 
 #ifdef _WIN32
 int ffs(int i);
Index: qemu/monitor.c
===================================================================
--- qemu.orig/monitor.c
+++ qemu/monitor.c
@@ -878,6 +878,9 @@ static void print_cpu_iter(QObject *obj,
         monitor_printf(mon, " (halted)");
     }
 
+    monitor_printf(mon, " thread_id=%" PRId64 " ",
+					qdict_get_int(cpu, "thread_id"));
+
     monitor_printf(mon, "\n");
 }
 
@@ -922,6 +925,7 @@ static void do_info_cpus(Monitor *mon, Q
 #elif defined(TARGET_MIPS)
         qdict_put(cpu, "PC", qint_from_int(env->active_tc.PC));
 #endif
+        qdict_put(cpu, "thread_id", qint_from_int(env->thread_id));
 
         qlist_append(cpu_list, cpu);
     }

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 4/8] kvm: x86: add mce support
  2010-10-06 17:34   ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06 17:34     ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: mce --]
[-- Type: text/plain, Size: 4542 bytes --]

Port qemu-kvm's MCE support

commit c68b2374c9048812f488e00ffb95db66c0bc07a7
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Jul 20 10:00:53 2009 +0800

    Add MCE simulation support to qemu/kvm
    
    KVM ioctls are used to initialize MCE simulation and inject MCE. The
    real MCE simulation is implemented in Linux kernel. The Kernel part
    has been merged.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/target-i386/helper.c
===================================================================
--- qemu.orig/target-i386/helper.c
+++ qemu/target-i386/helper.c
@@ -27,6 +27,7 @@
 #include "exec-all.h"
 #include "qemu-common.h"
 #include "kvm.h"
+#include "kvm_x86.h"
 
 //#define DEBUG_MMU
 
@@ -1030,6 +1031,11 @@ void cpu_inject_x86_mce(CPUState *cenv, 
     if (bank >= bank_num || !(status & MCI_STATUS_VAL))
         return;
 
+    if (kvm_enabled()) {
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        return;
+    }
+
     /*
      * if MSR_MCG_CTL is not all 1s, the uncorrected error
      * reporting is disabled
Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -27,6 +27,7 @@
 #include "hw/pc.h"
 #include "hw/apic.h"
 #include "ioport.h"
+#include "kvm_x86.h"
 
 #ifdef CONFIG_KVM_PARA
 #include <linux/kvm_para.h>
@@ -167,6 +168,67 @@ static int get_para_features(CPUState *e
 }
 #endif
 
+#ifdef KVM_CAP_MCE
+static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
+                                     int *max_banks)
+{
+    int r;
+
+    r = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_MCE);
+    if (r > 0) {
+        *max_banks = r;
+        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
+    }
+    return -ENOSYS;
+}
+
+static int kvm_setup_mce(CPUState *env, uint64_t *mcg_cap)
+{
+    return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
+}
+
+static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
+{
+    return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
+}
+
+struct kvm_x86_mce_data
+{
+    CPUState *env;
+    struct kvm_x86_mce *mce;
+};
+
+static void kvm_do_inject_x86_mce(void *_data)
+{
+    struct kvm_x86_mce_data *data = _data;
+    int r;
+
+    r = kvm_set_mce(data->env, data->mce);
+    if (r < 0)
+        perror("kvm_set_mce FAILED");
+}
+#endif
+
+void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+{
+#ifdef KVM_CAP_MCE
+    struct kvm_x86_mce mce = {
+        .bank = bank,
+        .status = status,
+        .mcg_status = mcg_status,
+        .addr = addr,
+        .misc = misc,
+    };
+    struct kvm_x86_mce_data data = {
+            .env = cenv,
+            .mce = &mce,
+    };
+
+    run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#endif
+}
+
 int kvm_arch_init_vcpu(CPUState *env)
 {
     struct {
@@ -274,6 +336,28 @@ int kvm_arch_init_vcpu(CPUState *env)
 
     cpuid_data.cpuid.nent = cpuid_i;
 
+#ifdef KVM_CAP_MCE
+    if (((env->cpuid_version >> 8)&0xF) >= 6
+        && (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
+        && kvm_check_extension(env->kvm_state, KVM_CAP_MCE) > 0) {
+        uint64_t mcg_cap;
+        int banks;
+
+        if (kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks))
+            perror("kvm_get_mce_cap_supported FAILED");
+        else {
+            if (banks > MCE_BANKS_DEF)
+                banks = MCE_BANKS_DEF;
+            mcg_cap &= MCE_CAP_DEF;
+            mcg_cap |= banks;
+            if (kvm_setup_mce(env, &mcg_cap))
+                perror("kvm_setup_mce FAILED");
+            else
+                env->mcg_cap = mcg_cap;
+        }
+    }
+#endif
+
     return kvm_vcpu_ioctl(env, KVM_SET_CPUID2, &cpuid_data);
 }
 
Index: qemu/target-i386/kvm_x86.h
===================================================================
--- /dev/null
+++ qemu/target-i386/kvm_x86.h
@@ -0,0 +1,21 @@
+/*
+ * QEMU KVM support
+ *
+ * Copyright (C) 2009 Red Hat Inc.
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __KVM_X86_H__
+#define __KVM_X86_H__
+
+void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+
+#endif



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 4/8] kvm: x86: add mce support
@ 2010-10-06 17:34     ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: mce --]
[-- Type: text/plain, Size: 4540 bytes --]

Port qemu-kvm's MCE support

commit c68b2374c9048812f488e00ffb95db66c0bc07a7
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Jul 20 10:00:53 2009 +0800

    Add MCE simulation support to qemu/kvm
    
    KVM ioctls are used to initialize MCE simulation and inject MCE. The
    real MCE simulation is implemented in Linux kernel. The Kernel part
    has been merged.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/target-i386/helper.c
===================================================================
--- qemu.orig/target-i386/helper.c
+++ qemu/target-i386/helper.c
@@ -27,6 +27,7 @@
 #include "exec-all.h"
 #include "qemu-common.h"
 #include "kvm.h"
+#include "kvm_x86.h"
 
 //#define DEBUG_MMU
 
@@ -1030,6 +1031,11 @@ void cpu_inject_x86_mce(CPUState *cenv, 
     if (bank >= bank_num || !(status & MCI_STATUS_VAL))
         return;
 
+    if (kvm_enabled()) {
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        return;
+    }
+
     /*
      * if MSR_MCG_CTL is not all 1s, the uncorrected error
      * reporting is disabled
Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -27,6 +27,7 @@
 #include "hw/pc.h"
 #include "hw/apic.h"
 #include "ioport.h"
+#include "kvm_x86.h"
 
 #ifdef CONFIG_KVM_PARA
 #include <linux/kvm_para.h>
@@ -167,6 +168,67 @@ static int get_para_features(CPUState *e
 }
 #endif
 
+#ifdef KVM_CAP_MCE
+static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
+                                     int *max_banks)
+{
+    int r;
+
+    r = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_MCE);
+    if (r > 0) {
+        *max_banks = r;
+        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
+    }
+    return -ENOSYS;
+}
+
+static int kvm_setup_mce(CPUState *env, uint64_t *mcg_cap)
+{
+    return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
+}
+
+static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
+{
+    return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
+}
+
+struct kvm_x86_mce_data
+{
+    CPUState *env;
+    struct kvm_x86_mce *mce;
+};
+
+static void kvm_do_inject_x86_mce(void *_data)
+{
+    struct kvm_x86_mce_data *data = _data;
+    int r;
+
+    r = kvm_set_mce(data->env, data->mce);
+    if (r < 0)
+        perror("kvm_set_mce FAILED");
+}
+#endif
+
+void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+{
+#ifdef KVM_CAP_MCE
+    struct kvm_x86_mce mce = {
+        .bank = bank,
+        .status = status,
+        .mcg_status = mcg_status,
+        .addr = addr,
+        .misc = misc,
+    };
+    struct kvm_x86_mce_data data = {
+            .env = cenv,
+            .mce = &mce,
+    };
+
+    run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#endif
+}
+
 int kvm_arch_init_vcpu(CPUState *env)
 {
     struct {
@@ -274,6 +336,28 @@ int kvm_arch_init_vcpu(CPUState *env)
 
     cpuid_data.cpuid.nent = cpuid_i;
 
+#ifdef KVM_CAP_MCE
+    if (((env->cpuid_version >> 8)&0xF) >= 6
+        && (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
+        && kvm_check_extension(env->kvm_state, KVM_CAP_MCE) > 0) {
+        uint64_t mcg_cap;
+        int banks;
+
+        if (kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks))
+            perror("kvm_get_mce_cap_supported FAILED");
+        else {
+            if (banks > MCE_BANKS_DEF)
+                banks = MCE_BANKS_DEF;
+            mcg_cap &= MCE_CAP_DEF;
+            mcg_cap |= banks;
+            if (kvm_setup_mce(env, &mcg_cap))
+                perror("kvm_setup_mce FAILED");
+            else
+                env->mcg_cap = mcg_cap;
+        }
+    }
+#endif
+
     return kvm_vcpu_ioctl(env, KVM_SET_CPUID2, &cpuid_data);
 }
 
Index: qemu/target-i386/kvm_x86.h
===================================================================
--- /dev/null
+++ qemu/target-i386/kvm_x86.h
@@ -0,0 +1,21 @@
+/*
+ * QEMU KVM support
+ *
+ * Copyright (C) 2009 Red Hat Inc.
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __KVM_X86_H__
+#define __KVM_X86_H__
+
+void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+
+#endif

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 5/8] Export qemu_ram_addr_from_host
  2010-10-06 17:34   ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06 17:34     ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: do_qemu_ram_addr_from_host --]
[-- Type: text/plain, Size: 3271 bytes --]

To be used by next patches.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpu-common.h
===================================================================
--- qemu.orig/cpu-common.h
+++ qemu/cpu-common.h
@@ -47,7 +47,8 @@ void qemu_ram_free(ram_addr_t addr);
 /* This should only be used for ram local to a device.  */
 void *qemu_get_ram_ptr(ram_addr_t addr);
 /* This should not be used by devices.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr);
+int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
+ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr);
 
 int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
                            CPUWriteMemoryFunc * const *mem_write,
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -2086,7 +2086,7 @@ static inline void tlb_update_dirty(CPUT
     if ((tlb_entry->addr_write & ~TARGET_PAGE_MASK) == IO_MEM_RAM) {
         p = (void *)(unsigned long)((tlb_entry->addr_write & TARGET_PAGE_MASK)
             + tlb_entry->addend);
-        ram_addr = qemu_ram_addr_from_host(p);
+        ram_addr = qemu_ram_addr_from_host_nofail(p);
         if (!cpu_physical_memory_is_dirty(ram_addr)) {
             tlb_entry->addr_write |= TLB_NOTDIRTY;
         }
@@ -2938,23 +2938,31 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
     return NULL;
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr)
+int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
 {
     RAMBlock *block;
     uint8_t *host = ptr;
 
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (host - block->host < block->length) {
-            return block->offset + (host - block->host);
+            *ram_addr = block->offset + (host - block->host);
+            return 0;
         }
     }
+    return -1;
+}
 
-    fprintf(stderr, "Bad ram pointer %p\n", ptr);
-    abort();
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
+{
+    ram_addr_t ram_addr;
 
-    return 0;
+    if (qemu_ram_addr_from_host(ptr, &ram_addr)) {
+        fprintf(stderr, "Bad ram pointer %p\n", ptr);
+        abort();
+    }
+    return ram_addr;
 }
 
 static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)
@@ -3703,7 +3711,7 @@ void cpu_physical_memory_unmap(void *buf
 {
     if (buffer != bounce.buffer) {
         if (is_write) {
-            ram_addr_t addr1 = qemu_ram_addr_from_host(buffer);
+            ram_addr_t addr1 = qemu_ram_addr_from_host_nofail(buffer);
             while (access_len) {
                 unsigned l;
                 l = TARGET_PAGE_SIZE;
Index: qemu/exec-all.h
===================================================================
--- qemu.orig/exec-all.h
+++ qemu/exec-all.h
@@ -334,7 +334,7 @@ static inline tb_page_addr_t get_page_ad
     }
     p = (void *)(unsigned long)addr
         + env1->tlb_table[mmu_idx][page_index].addend;
-    return qemu_ram_addr_from_host(p);
+    return qemu_ram_addr_from_host_nofail(p);
 }
 #endif
 



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 5/8] Export qemu_ram_addr_from_host
@ 2010-10-06 17:34     ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: do_qemu_ram_addr_from_host --]
[-- Type: text/plain, Size: 3269 bytes --]

To be used by next patches.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpu-common.h
===================================================================
--- qemu.orig/cpu-common.h
+++ qemu/cpu-common.h
@@ -47,7 +47,8 @@ void qemu_ram_free(ram_addr_t addr);
 /* This should only be used for ram local to a device.  */
 void *qemu_get_ram_ptr(ram_addr_t addr);
 /* This should not be used by devices.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr);
+int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
+ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr);
 
 int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
                            CPUWriteMemoryFunc * const *mem_write,
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -2086,7 +2086,7 @@ static inline void tlb_update_dirty(CPUT
     if ((tlb_entry->addr_write & ~TARGET_PAGE_MASK) == IO_MEM_RAM) {
         p = (void *)(unsigned long)((tlb_entry->addr_write & TARGET_PAGE_MASK)
             + tlb_entry->addend);
-        ram_addr = qemu_ram_addr_from_host(p);
+        ram_addr = qemu_ram_addr_from_host_nofail(p);
         if (!cpu_physical_memory_is_dirty(ram_addr)) {
             tlb_entry->addr_write |= TLB_NOTDIRTY;
         }
@@ -2938,23 +2938,31 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
     return NULL;
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr)
+int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
 {
     RAMBlock *block;
     uint8_t *host = ptr;
 
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (host - block->host < block->length) {
-            return block->offset + (host - block->host);
+            *ram_addr = block->offset + (host - block->host);
+            return 0;
         }
     }
+    return -1;
+}
 
-    fprintf(stderr, "Bad ram pointer %p\n", ptr);
-    abort();
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
+{
+    ram_addr_t ram_addr;
 
-    return 0;
+    if (qemu_ram_addr_from_host(ptr, &ram_addr)) {
+        fprintf(stderr, "Bad ram pointer %p\n", ptr);
+        abort();
+    }
+    return ram_addr;
 }
 
 static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)
@@ -3703,7 +3711,7 @@ void cpu_physical_memory_unmap(void *buf
 {
     if (buffer != bounce.buffer) {
         if (is_write) {
-            ram_addr_t addr1 = qemu_ram_addr_from_host(buffer);
+            ram_addr_t addr1 = qemu_ram_addr_from_host_nofail(buffer);
             while (access_len) {
                 unsigned l;
                 l = TARGET_PAGE_SIZE;
Index: qemu/exec-all.h
===================================================================
--- qemu.orig/exec-all.h
+++ qemu/exec-all.h
@@ -334,7 +334,7 @@ static inline tb_page_addr_t get_page_ad
     }
     p = (void *)(unsigned long)addr
         + env1->tlb_table[mmu_idx][page_index].addend;
-    return qemu_ram_addr_from_host(p);
+    return qemu_ram_addr_from_host_nofail(p);
 }
 #endif
 

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 6/8] Add RAM -> physical addr mapping in MCE simulation
  2010-10-06 17:34   ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06 17:34     ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: kvm_physical_memory_addr_from_ram --]
[-- Type: text/plain, Size: 1710 bytes --]

From: Huang Ying <ying.huang@intel.com>

In QEMU-KVM, a physical address is not the same as a RAM address, but
MCE simulation needs the physical address rather than the RAM address.
kvm_physical_memory_addr_from_ram() is therefore implemented to do the
conversion, and it is invoked before the address is filled into the
IA32_MCi_ADDR MSR.

Reported-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/kvm-all.c
===================================================================
--- qemu.orig/kvm-all.c
+++ qemu/kvm-all.c
@@ -137,6 +137,24 @@ static KVMSlot *kvm_lookup_overlapping_s
     return found;
 }
 
+int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+                                      target_phys_addr_t *phys_addr)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
+        KVMSlot *mem = &s->slots[i];
+
+        if (ram_addr >= mem->phys_offset &&
+            ram_addr < mem->phys_offset + mem->memory_size) {
+            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
 static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
 {
     struct kvm_userspace_memory_region mem;
Index: qemu/kvm.h
===================================================================
--- qemu.orig/kvm.h
+++ qemu/kvm.h
@@ -174,6 +174,9 @@ static inline void cpu_synchronize_post_
     }
 }
 
+int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+                                      target_phys_addr_t *phys_addr);
+
 #endif
 int kvm_set_ioeventfd_mmio_long(int fd, uint32_t adr, uint32_t val, bool assign);
 



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 6/8] Add RAM -> physical addr mapping in MCE simulation
@ 2010-10-06 17:34     ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: kvm_physical_memory_addr_from_ram --]
[-- Type: text/plain, Size: 1708 bytes --]

From: Huang Ying <ying.huang@intel.com>

In QEMU-KVM, a physical address is not the same as a RAM address, but
MCE simulation needs the physical address rather than the RAM address.
kvm_physical_memory_addr_from_ram() is therefore implemented to do the
conversion, and it is invoked before the address is filled into the
IA32_MCi_ADDR MSR.

Reported-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/kvm-all.c
===================================================================
--- qemu.orig/kvm-all.c
+++ qemu/kvm-all.c
@@ -137,6 +137,24 @@ static KVMSlot *kvm_lookup_overlapping_s
     return found;
 }
 
+int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+                                      target_phys_addr_t *phys_addr)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
+        KVMSlot *mem = &s->slots[i];
+
+        if (ram_addr >= mem->phys_offset &&
+            ram_addr < mem->phys_offset + mem->memory_size) {
+            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
 static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
 {
     struct kvm_userspace_memory_region mem;
Index: qemu/kvm.h
===================================================================
--- qemu.orig/kvm.h
+++ qemu/kvm.h
@@ -174,6 +174,9 @@ static inline void cpu_synchronize_post_
     }
 }
 
+int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+                                      target_phys_addr_t *phys_addr);
+
 #endif
 int kvm_set_ioeventfd_mmio_long(int fd, uint32_t adr, uint32_t val, bool assign);
 

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-06 17:34   ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06 17:34     ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: kvm-mce-sigbus --]
[-- Type: text/plain, Size: 14879 bytes --]

Port qemu-kvm's

commit 4b62fff1101a7ad77553147717a8bd3bf79df7ef
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Sep 21 10:43:25 2009 +0800

    MCE: Relay UCR MCE to guest
    
    UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
    where some hardware error such as some memory error can be reported
    without PCC (processor context corrupted). To recover from such MCE,
    the corresponding memory will be unmapped, and all processes accessing
    the memory will be killed via SIGBUS.
    
    For KVM, if QEMU/KVM is killed, all guest processes will be killed
    too. So we relay SIGBUS from host OS to guest system via a UCR MCE
    injection. Then guest OS can isolate corresponding memory and kill
    necessary guest processes only. SIGBUS sent to main thread (not VCPU
    threads) will be broadcast to all VCPU threads as UCR MCE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -34,6 +34,10 @@
 
 #include "cpus.h"
 #include "compatfd.h"
+#ifdef CONFIG_LINUX
+#include <sys/prctl.h>
+#include <sys/signalfd.h>
+#endif
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -41,6 +45,10 @@
 #define SIG_IPI SIGUSR1
 #endif
 
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
 static CPUState *next_cpu;
 
 /***********************************************************/
@@ -498,28 +506,77 @@ static void qemu_tcg_wait_io_event(void)
     }
 }
 
+static void sigbus_reraise(void)
+{
+    sigset_t set;
+    struct sigaction action;
+
+    memset(&action, 0, sizeof(action));
+    action.sa_handler = SIG_DFL;
+    if (!sigaction(SIGBUS, &action, NULL)) {
+        raise(SIGBUS);
+        sigemptyset(&set);
+        sigaddset(&set, SIGBUS);
+        sigprocmask(SIG_UNBLOCK, &set, NULL);
+    }
+    perror("Failed to re-raise SIGBUS!\n");
+    abort();
+}
+
+static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
+                           void *ctx)
+{
+#if defined(TARGET_I386)
+    if (kvm_on_sigbus(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
+#endif
+        sigbus_reraise();
+}
+
 static void qemu_kvm_eat_signal(CPUState *env, int timeout)
 {
     struct timespec ts;
     int r, e;
     siginfo_t siginfo;
     sigset_t waitset;
+    sigset_t chkset;
 
     ts.tv_sec = timeout / 1000;
     ts.tv_nsec = (timeout % 1000) * 1000000;
 
     sigemptyset(&waitset);
     sigaddset(&waitset, SIG_IPI);
+    sigaddset(&waitset, SIGBUS);
 
-    qemu_mutex_unlock(&qemu_global_mutex);
-    r = sigtimedwait(&waitset, &siginfo, &ts);
-    e = errno;
-    qemu_mutex_lock(&qemu_global_mutex);
+    do {
+        qemu_mutex_unlock(&qemu_global_mutex);
 
-    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
-        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
-        exit(1);
-    }
+        r = sigtimedwait(&waitset, &siginfo, &ts);
+        e = errno;
+
+        qemu_mutex_lock(&qemu_global_mutex);
+
+        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
+            exit(1);
+        }
+
+        switch (r) {
+        case SIGBUS:
+#ifdef TARGET_I386
+            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr))
+#endif
+                sigbus_reraise();
+            break;
+        default:
+            break;
+        }
+
+        r = sigpending(&chkset);
+        if (r == -1) {
+            fprintf(stderr, "sigpending: %s\n", strerror(e));
+            exit(1);
+        }
+    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 }
 
 static void qemu_kvm_wait_io_event(CPUState *env)
@@ -645,6 +702,7 @@ static void kvm_init_ipi(CPUState *env)
 
     pthread_sigmask(SIG_BLOCK, NULL, &set);
     sigdelset(&set, SIG_IPI);
+    sigdelset(&set, SIGBUS);
     r = kvm_set_signal_mask(env, &set);
     if (r) {
         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(r));
@@ -655,6 +713,7 @@ static void kvm_init_ipi(CPUState *env)
 static sigset_t block_io_signals(void)
 {
     sigset_t set;
+    struct sigaction action;
 
     /* SIGUSR2 used by posix-aio-compat.c */
     sigemptyset(&set);
@@ -665,8 +724,15 @@ static sigset_t block_io_signals(void)
     sigaddset(&set, SIGIO);
     sigaddset(&set, SIGALRM);
     sigaddset(&set, SIG_IPI);
+    sigaddset(&set, SIGBUS);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
 
+    memset(&action, 0, sizeof(action));
+    action.sa_flags = SA_SIGINFO;
+    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+    sigaction(SIGBUS, &action, NULL);
+    prctl(PR_MCE_KILL, 1, 1, 0, 0);
+
     return set;
 }
 
Index: qemu/kvm.h
===================================================================
--- qemu.orig/kvm.h
+++ qemu/kvm.h
@@ -110,6 +110,9 @@ int kvm_arch_init_vcpu(CPUState *env);
 
 void kvm_arch_reset_vcpu(CPUState *env);
 
+int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr);
+int kvm_on_sigbus(int code, void *addr);
+
 struct kvm_guest_debug;
 struct kvm_debug_exit_arch;
 
Index: qemu/target-i386/cpu.h
===================================================================
--- qemu.orig/target-i386/cpu.h
+++ qemu/target-i386/cpu.h
@@ -250,16 +250,32 @@
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
-#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
+#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
+#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
 
-#define MCE_CAP_DEF	MCG_CTL_P
+#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
 #define MCE_BANKS_DEF	10
 
+#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
 
 #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
 #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
 #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF	0	/* segment offset */
+#define MCM_ADDR_LINEAR	1	/* linear address */
+#define MCM_ADDR_PHYS	2	/* physical address */
+#define MCM_ADDR_MEM	3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 #define MSR_IA32_TSC                    0x10
 #define MSR_IA32_APICBASE               0x1b
Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -46,6 +46,13 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
 #ifdef KVM_CAP_EXT_CPUID
 
 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
@@ -192,10 +199,39 @@ static int kvm_set_mce(CPUState *env, st
     return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
 }
 
+static int kvm_get_msr(CPUState *env, struct kvm_msr_entry *msrs, int n)
+{
+    struct kvm_msrs *kmsrs = qemu_malloc(sizeof *kmsrs + n * sizeof *msrs);
+    int r;
+
+    kmsrs->nmsrs = n;
+    memcpy(kmsrs->entries, msrs, n * sizeof *msrs);
+    r = kvm_vcpu_ioctl(env, KVM_GET_MSRS, kmsrs);
+    memcpy(msrs, kmsrs->entries, n * sizeof *msrs);
+    free(kmsrs);
+    return r;
+}
+
+/* FIXME: kill this and kvm_get_msr, use env->mcg_status instead */
+static int kvm_mce_in_exception(CPUState *env)
+{
+    struct kvm_msr_entry msr_mcg_status = {
+        .index = MSR_MCG_STATUS,
+    };
+    int r;
+
+    r = kvm_get_msr(env, &msr_mcg_status, 1);
+    if (r == -1 || r == 0) {
+        return -1;
+    }
+    return !!(msr_mcg_status.data & MCG_STATUS_MCIP);
+}
+
 struct kvm_x86_mce_data
 {
     CPUState *env;
     struct kvm_x86_mce *mce;
+    int abort_on_error;
 };
 
 static void kvm_do_inject_x86_mce(void *_data)
@@ -203,14 +239,26 @@ static void kvm_do_inject_x86_mce(void *
     struct kvm_x86_mce_data *data = _data;
     int r;
 
+    /* If there is an MCE exception being processed, ignore this SRAO MCE */
+    r = kvm_mce_in_exception(data->env);
+    if (r == -1)
+        fprintf(stderr, "Failed to get MCE status\n");
+    else if (r && !(data->mce->status & MCI_STATUS_AR))
+        return;
+
     r = kvm_set_mce(data->env, data->mce);
-    if (r < 0)
+    if (r < 0) {
         perror("kvm_set_mce FAILED");
+        if (data->abort_on_error) {
+            abort();
+        }
+    }
 }
 #endif
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error)
 {
 #ifdef KVM_CAP_MCE
     struct kvm_x86_mce mce = {
@@ -225,7 +273,15 @@ void kvm_inject_x86_mce(CPUState *cenv, 
             .mce = &mce,
     };
 
+    if (!cenv->mcg_cap) {
+        fprintf(stderr, "MCE support is not enabled!\n");
+        return;
+    }
+
     run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#else
+    if (abort_on_error)
+        abort();
 #endif
 }
 
@@ -1525,3 +1581,122 @@ bool kvm_arch_stop_on_emulation_error(CP
               ((env->segs[R_CS].selector  & 3) != 3);
 }
 
+static void hardware_memory_error(void)
+{
+    fprintf(stderr, "Hardware memory error!\n");
+    exit(1);
+}
+
+int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    struct kvm_x86_mce mce = {
+            .bank = 9,
+    };
+    void *vaddr;
+    ram_addr_t ram_addr;
+    unsigned long paddr;
+    int r;
+
+    if (env->mcg_cap && addr
+        && (code == BUS_MCEERR_AR
+            || code == BUS_MCEERR_AO)) {
+        if (code == BUS_MCEERR_AR) {
+            /* Fake an Intel architectural Data Load SRAR UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | MCI_STATUS_AR | 0x134;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+        } else {
+            /*
+             * If there is an MCE exception being processed, ignore
+             * this SRAO MCE
+             */
+            r = kvm_mce_in_exception(env);
+            if (r == -1) {
+                fprintf(stderr, "Failed to get MCE status\n");
+            } else if (r) {
+                return 0;
+            }
+            /* Fake an Intel architectural Memory scrubbing UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | 0xc0;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+        }
+        vaddr = (void *)addr;
+        if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instaed of guest system!\n");
+            /* Hope we are lucky for AO MCE */
+            if (code == BUS_MCEERR_AO) {
+                return 0;
+            } else {
+                hardware_memory_error();
+            }
+        }
+        mce.addr = paddr;
+        r = kvm_set_mce(env, &mce);
+        if (r < 0) {
+            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+            abort();
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int kvm_on_sigbus(int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    if (first_cpu->mcg_cap && addr && code == BUS_MCEERR_AO) {
+        uint64_t status;
+        void *vaddr;
+        ram_addr_t ram_addr;
+        unsigned long paddr;
+        CPUState *cenv;
+
+        /* Hope we are lucky for AO MCE */
+        vaddr = addr;
+        if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!: %p\n", addr);
+            return 0;
+        }
+        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+            | 0xc0;
+        kvm_inject_x86_mce(first_cpu, 9, status,
+                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
+                           (MCM_ADDR_PHYS << 6) | 0xc, 1);
+        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) {
+            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
Index: qemu/target-i386/helper.c
===================================================================
--- qemu.orig/target-i386/helper.c
+++ qemu/target-i386/helper.c
@@ -1032,7 +1032,7 @@ void cpu_inject_x86_mce(CPUState *cenv, 
         return;
 
     if (kvm_enabled()) {
-        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
         return;
     }
 
Index: qemu/target-i386/kvm_x86.h
===================================================================
--- qemu.orig/target-i386/kvm_x86.h
+++ qemu/target-i386/kvm_x86.h
@@ -16,6 +16,7 @@
 #define __KVM_X86_H__
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error);
 
 #endif
Index: qemu/kvm-stub.c
===================================================================
--- qemu.orig/kvm-stub.c
+++ qemu/kvm-stub.c
@@ -141,3 +141,9 @@ int kvm_set_ioeventfd_mmio_long(int fd, 
 {
     return -ENOSYS;
 }
+
+int kvm_on_sigbus(int code, void *addr)
+{
+    return 1;
+}
+



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-06 17:34     ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: kvm-mce-sigbus --]
[-- Type: text/plain, Size: 14877 bytes --]

Port qemu-kvm's

commit 4b62fff1101a7ad77553147717a8bd3bf79df7ef
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Sep 21 10:43:25 2009 +0800

    MCE: Relay UCR MCE to guest
    
    UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
    where some hardware error such as some memory error can be reported
    without PCC (processor context corrupted). To recover from such MCE,
    the corresponding memory will be unmapped, and all processes accessing
    the memory will be killed via SIGBUS.
    
    For KVM, if QEMU/KVM is killed, all guest processes will be killed
    too. So we relay SIGBUS from host OS to guest system via a UCR MCE
    injection. Then guest OS can isolate corresponding memory and kill
    necessary guest processes only. SIGBUS sent to main thread (not VCPU
    threads) will be broadcast to all VCPU threads as UCR MCE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -34,6 +34,10 @@
 
 #include "cpus.h"
 #include "compatfd.h"
+#ifdef CONFIG_LINUX
+#include <sys/prctl.h>
+#include <sys/signalfd.h>
+#endif
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -41,6 +45,10 @@
 #define SIG_IPI SIGUSR1
 #endif
 
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
 static CPUState *next_cpu;
 
 /***********************************************************/
@@ -498,28 +506,77 @@ static void qemu_tcg_wait_io_event(void)
     }
 }
 
+static void sigbus_reraise(void)
+{
+    sigset_t set;
+    struct sigaction action;
+
+    memset(&action, 0, sizeof(action));
+    action.sa_handler = SIG_DFL;
+    if (!sigaction(SIGBUS, &action, NULL)) {
+        raise(SIGBUS);
+        sigemptyset(&set);
+        sigaddset(&set, SIGBUS);
+        sigprocmask(SIG_UNBLOCK, &set, NULL);
+    }
+    perror("Failed to re-raise SIGBUS!\n");
+    abort();
+}
+
+static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
+                           void *ctx)
+{
+#if defined(TARGET_I386)
+    if (kvm_on_sigbus(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
+#endif
+        sigbus_reraise();
+}
+
 static void qemu_kvm_eat_signal(CPUState *env, int timeout)
 {
     struct timespec ts;
     int r, e;
     siginfo_t siginfo;
     sigset_t waitset;
+    sigset_t chkset;
 
     ts.tv_sec = timeout / 1000;
     ts.tv_nsec = (timeout % 1000) * 1000000;
 
     sigemptyset(&waitset);
     sigaddset(&waitset, SIG_IPI);
+    sigaddset(&waitset, SIGBUS);
 
-    qemu_mutex_unlock(&qemu_global_mutex);
-    r = sigtimedwait(&waitset, &siginfo, &ts);
-    e = errno;
-    qemu_mutex_lock(&qemu_global_mutex);
+    do {
+        qemu_mutex_unlock(&qemu_global_mutex);
 
-    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
-        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
-        exit(1);
-    }
+        r = sigtimedwait(&waitset, &siginfo, &ts);
+        e = errno;
+
+        qemu_mutex_lock(&qemu_global_mutex);
+
+        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
+            exit(1);
+        }
+
+        switch (r) {
+        case SIGBUS:
+#ifdef TARGET_I386
+            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr))
+#endif
+                sigbus_reraise();
+            break;
+        default:
+            break;
+        }
+
+        r = sigpending(&chkset);
+        if (r == -1) {
+            fprintf(stderr, "sigpending: %s\n", strerror(e));
+            exit(1);
+        }
+    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 }
 
 static void qemu_kvm_wait_io_event(CPUState *env)
@@ -645,6 +702,7 @@ static void kvm_init_ipi(CPUState *env)
 
     pthread_sigmask(SIG_BLOCK, NULL, &set);
     sigdelset(&set, SIG_IPI);
+    sigdelset(&set, SIGBUS);
     r = kvm_set_signal_mask(env, &set);
     if (r) {
         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(r));
@@ -655,6 +713,7 @@ static void kvm_init_ipi(CPUState *env)
 static sigset_t block_io_signals(void)
 {
     sigset_t set;
+    struct sigaction action;
 
     /* SIGUSR2 used by posix-aio-compat.c */
     sigemptyset(&set);
@@ -665,8 +724,15 @@ static sigset_t block_io_signals(void)
     sigaddset(&set, SIGIO);
     sigaddset(&set, SIGALRM);
     sigaddset(&set, SIG_IPI);
+    sigaddset(&set, SIGBUS);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
 
+    memset(&action, 0, sizeof(action));
+    action.sa_flags = SA_SIGINFO;
+    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+    sigaction(SIGBUS, &action, NULL);
+    prctl(PR_MCE_KILL, 1, 1, 0, 0);
+
     return set;
 }
 
Index: qemu/kvm.h
===================================================================
--- qemu.orig/kvm.h
+++ qemu/kvm.h
@@ -110,6 +110,9 @@ int kvm_arch_init_vcpu(CPUState *env);
 
 void kvm_arch_reset_vcpu(CPUState *env);
 
+int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr);
+int kvm_on_sigbus(int code, void *addr);
+
 struct kvm_guest_debug;
 struct kvm_debug_exit_arch;
 
Index: qemu/target-i386/cpu.h
===================================================================
--- qemu.orig/target-i386/cpu.h
+++ qemu/target-i386/cpu.h
@@ -250,16 +250,32 @@
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
-#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
+#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
+#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
 
-#define MCE_CAP_DEF	MCG_CTL_P
+#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
 #define MCE_BANKS_DEF	10
 
+#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
 
 #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
 #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
 #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF	0	/* segment offset */
+#define MCM_ADDR_LINEAR	1	/* linear address */
+#define MCM_ADDR_PHYS	2	/* physical address */
+#define MCM_ADDR_MEM	3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 #define MSR_IA32_TSC                    0x10
 #define MSR_IA32_APICBASE               0x1b
Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -46,6 +46,13 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
 #ifdef KVM_CAP_EXT_CPUID
 
 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
@@ -192,10 +199,39 @@ static int kvm_set_mce(CPUState *env, st
     return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
 }
 
+static int kvm_get_msr(CPUState *env, struct kvm_msr_entry *msrs, int n)
+{
+    struct kvm_msrs *kmsrs = qemu_malloc(sizeof *kmsrs + n * sizeof *msrs);
+    int r;
+
+    kmsrs->nmsrs = n;
+    memcpy(kmsrs->entries, msrs, n * sizeof *msrs);
+    r = kvm_vcpu_ioctl(env, KVM_GET_MSRS, kmsrs);
+    memcpy(msrs, kmsrs->entries, n * sizeof *msrs);
+    free(kmsrs);
+    return r;
+}
+
+/* FIXME: kill this and kvm_get_msr, use env->mcg_status instead */
+static int kvm_mce_in_exception(CPUState *env)
+{
+    struct kvm_msr_entry msr_mcg_status = {
+        .index = MSR_MCG_STATUS,
+    };
+    int r;
+
+    r = kvm_get_msr(env, &msr_mcg_status, 1);
+    if (r == -1 || r == 0) {
+        return -1;
+    }
+    return !!(msr_mcg_status.data & MCG_STATUS_MCIP);
+}
+
 struct kvm_x86_mce_data
 {
     CPUState *env;
     struct kvm_x86_mce *mce;
+    int abort_on_error;
 };
 
 static void kvm_do_inject_x86_mce(void *_data)
@@ -203,14 +239,26 @@ static void kvm_do_inject_x86_mce(void *
     struct kvm_x86_mce_data *data = _data;
     int r;
 
+    /* If there is an MCE excpetion being processed, ignore this SRAO MCE */
+    r = kvm_mce_in_exception(data->env);
+    if (r == -1)
+        fprintf(stderr, "Failed to get MCE status\n");
+    else if (r && !(data->mce->status & MCI_STATUS_AR))
+        return;
+
     r = kvm_set_mce(data->env, data->mce);
-    if (r < 0)
+    if (r < 0) {
         perror("kvm_set_mce FAILED");
+        if (data->abort_on_error) {
+            abort();
+        }
+    }
 }
 #endif
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error)
 {
 #ifdef KVM_CAP_MCE
     struct kvm_x86_mce mce = {
@@ -225,7 +273,15 @@ void kvm_inject_x86_mce(CPUState *cenv, 
             .mce = &mce,
     };
 
+    if (!cenv->mcg_cap) {
+        fprintf(stderr, "MCE support is not enabled!\n");
+        return;
+    }
+
     run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#else
+    if (abort_on_error)
+        abort();
 #endif
 }
 
@@ -1525,3 +1581,122 @@ bool kvm_arch_stop_on_emulation_error(CP
               ((env->segs[R_CS].selector  & 3) != 3);
 }
 
+static void hardware_memory_error(void)
+{
+    fprintf(stderr, "Hardware memory error!\n");
+    exit(1);
+}
+
+int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    struct kvm_x86_mce mce = {
+            .bank = 9,
+    };
+    void *vaddr;
+    ram_addr_t ram_addr;
+    unsigned long paddr;
+    int r;
+
+    if (env->mcg_cap && addr
+        && (code == BUS_MCEERR_AR
+            || code == BUS_MCEERR_AO)) {
+        if (code == BUS_MCEERR_AR) {
+            /* Fake an Intel architectural Data Load SRAR UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | MCI_STATUS_AR | 0x134;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+        } else {
+            /*
+             * If there is an MCE excpetion being processed, ignore
+             * this SRAO MCE
+             */
+            r = kvm_mce_in_exception(env);
+            if (r == -1) {
+                fprintf(stderr, "Failed to get MCE status\n");
+            } else if (r) {
+                return 0;
+            }
+            /* Fake an Intel architectural Memory scrubbing UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | 0xc0;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+        }
+        vaddr = (void *)addr;
+        if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instaed of guest system!\n");
+            /* Hope we are lucky for AO MCE */
+            if (code == BUS_MCEERR_AO) {
+                return 0;
+            } else {
+                hardware_memory_error();
+            }
+        }
+        mce.addr = paddr;
+        r = kvm_set_mce(env, &mce);
+        if (r < 0) {
+            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+            abort();
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int kvm_on_sigbus(int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    if (first_cpu->mcg_cap && addr && code == BUS_MCEERR_AO) {
+        uint64_t status;
+        void *vaddr;
+        ram_addr_t ram_addr;
+        unsigned long paddr;
+        CPUState *cenv;
+
+        /* Hope we are lucky for AO MCE */
+        vaddr = addr;
+        if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!: %p\n", addr);
+            return 0;
+        }
+        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+            | 0xc0;
+        kvm_inject_x86_mce(first_cpu, 9, status,
+                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
+                           (MCM_ADDR_PHYS << 6) | 0xc, 1);
+        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) {
+            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
Index: qemu/target-i386/helper.c
===================================================================
--- qemu.orig/target-i386/helper.c
+++ qemu/target-i386/helper.c
@@ -1032,7 +1032,7 @@ void cpu_inject_x86_mce(CPUState *cenv, 
         return;
 
     if (kvm_enabled()) {
-        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
         return;
     }
 
Index: qemu/target-i386/kvm_x86.h
===================================================================
--- qemu.orig/target-i386/kvm_x86.h
+++ qemu/target-i386/kvm_x86.h
@@ -16,6 +16,7 @@
 #define __KVM_X86_H__
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error);
 
 #endif
Index: qemu/kvm-stub.c
===================================================================
--- qemu.orig/kvm-stub.c
+++ qemu/kvm-stub.c
@@ -141,3 +141,9 @@ int kvm_set_ioeventfd_mmio_long(int fd, 
 {
     return -ENOSYS;
 }
+
+int kvm_on_sigbus(int code, void *addr)
+{
+    return 1;
+}
+

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch uq/master 8/8] Add savevm/loadvm support for MCE
  2010-10-06 17:34   ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06 17:34     ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: mce-save-restore --]
[-- Type: text/plain, Size: 2913 bytes --]

Port qemu-kvm's

commit 1bab5d11545d8de5facf46c28630085a2f9651ae
Author: Huang Ying <ying.huang@intel.com>
Date:   Wed Mar 3 16:52:46 2010 +0800

    Add savevm/loadvm support for MCE
    
    MCE registers are saved/loaded into/from CPUState in
    kvm_arch_save/load_regs. To simulate the MCG_STATUS clearing upon
    reset, MSR_MCG_STATUS is set to 0 for KVM_PUT_RESET_STATE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -774,7 +774,7 @@ static int kvm_put_msrs(CPUState *env, i
         struct kvm_msr_entry entries[100];
     } msr_data;
     struct kvm_msr_entry *msrs = msr_data.entries;
-    int n = 0;
+    int i, n = 0;
 
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
@@ -794,6 +794,18 @@ static int kvm_put_msrs(CPUState *env, i
                           env->system_time_msr);
         kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
     }
+#ifdef KVM_CAP_MCE
+    if (env->mcg_cap) {
+        if (level == KVM_PUT_RESET_STATE)
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
+        else if (level == KVM_PUT_FULL_STATE) {
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
+            for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++)
+                kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
+        }
+    }
+#endif
 
     msr_data.info.nmsrs = n;
 
@@ -1001,6 +1013,15 @@ static int kvm_get_msrs(CPUState *env)
     msrs[n++].index = MSR_KVM_SYSTEM_TIME;
     msrs[n++].index = MSR_KVM_WALL_CLOCK;
 
+#ifdef KVM_CAP_MCE
+    if (env->mcg_cap) {
+        msrs[n++].index = MSR_MCG_STATUS;
+        msrs[n++].index = MSR_MCG_CTL;
+        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++)
+            msrs[n++].index = MSR_MC0_CTL + i;
+    }
+#endif
+
     msr_data.info.nmsrs = n;
     ret = kvm_vcpu_ioctl(env, KVM_GET_MSRS, &msr_data);
     if (ret < 0)
@@ -1043,6 +1064,22 @@ static int kvm_get_msrs(CPUState *env)
         case MSR_KVM_WALL_CLOCK:
             env->wall_clock_msr = msrs[i].data;
             break;
+#ifdef KVM_CAP_MCE
+        case MSR_MCG_STATUS:
+            env->mcg_status = msrs[i].data;
+            break;
+        case MSR_MCG_CTL:
+            env->mcg_ctl = msrs[i].data;
+            break;
+#endif
+        default:
+#ifdef KVM_CAP_MCE
+            if (msrs[i].index >= MSR_MC0_CTL &&
+                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
+                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
+                break;
+            }
+#endif
         }
     }
 



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch uq/master 8/8] Add savevm/loadvm support for MCE
@ 2010-10-06 17:34     ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-06 17:34 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: mce-save-restore --]
[-- Type: text/plain, Size: 2911 bytes --]

Port qemu-kvm's

commit 1bab5d11545d8de5facf46c28630085a2f9651ae
Author: Huang Ying <ying.huang@intel.com>
Date:   Wed Mar 3 16:52:46 2010 +0800

    Add savevm/loadvm support for MCE
    
    MCE registers are saved/loaded into/from CPUState in
    kvm_arch_save/load_regs. To simulate the MCG_STATUS clearing upon
    reset, MSR_MCG_STATUS is set to 0 for KVM_PUT_RESET_STATE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -774,7 +774,7 @@ static int kvm_put_msrs(CPUState *env, i
         struct kvm_msr_entry entries[100];
     } msr_data;
     struct kvm_msr_entry *msrs = msr_data.entries;
-    int n = 0;
+    int i, n = 0;
 
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
@@ -794,6 +794,18 @@ static int kvm_put_msrs(CPUState *env, i
                           env->system_time_msr);
         kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
     }
+#ifdef KVM_CAP_MCE
+    if (env->mcg_cap) {
+        if (level == KVM_PUT_RESET_STATE)
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
+        else if (level == KVM_PUT_FULL_STATE) {
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
+            for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++)
+                kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
+        }
+    }
+#endif
 
     msr_data.info.nmsrs = n;
 
@@ -1001,6 +1013,15 @@ static int kvm_get_msrs(CPUState *env)
     msrs[n++].index = MSR_KVM_SYSTEM_TIME;
     msrs[n++].index = MSR_KVM_WALL_CLOCK;
 
+#ifdef KVM_CAP_MCE
+    if (env->mcg_cap) {
+        msrs[n++].index = MSR_MCG_STATUS;
+        msrs[n++].index = MSR_MCG_CTL;
+        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++)
+            msrs[n++].index = MSR_MC0_CTL + i;
+    }
+#endif
+
     msr_data.info.nmsrs = n;
     ret = kvm_vcpu_ioctl(env, KVM_GET_MSRS, &msr_data);
     if (ret < 0)
@@ -1043,6 +1064,22 @@ static int kvm_get_msrs(CPUState *env)
         case MSR_KVM_WALL_CLOCK:
             env->wall_clock_msr = msrs[i].data;
             break;
+#ifdef KVM_CAP_MCE
+        case MSR_MCG_STATUS:
+            env->mcg_status = msrs[i].data;
+            break;
+        case MSR_MCG_CTL:
+            env->mcg_ctl = msrs[i].data;
+            break;
+#endif
+        default:
+#ifdef KVM_CAP_MCE
+            if (msrs[i].index >= MSR_MC0_CTL &&
+                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
+                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
+                break;
+            }
+#endif
         }
     }
 

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-06 16:05       ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06 18:10         ` Dean Nelson
  -1 siblings, 0 replies; 93+ messages in thread
From: Dean Nelson @ 2010-10-06 18:10 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Marcelo Tosatti, kvm, qemu-devel, Huang Ying

On 10/06/2010 11:05 AM, Marcelo Tosatti wrote:
> On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
>> I got some more question:
>>
>> (2010/10/05 3:54), Marcelo Tosatti wrote:
>>> Index: qemu/target-i386/cpu.h
>>> ===================================================================
>>> --- qemu.orig/target-i386/cpu.h
>>> +++ qemu/target-i386/cpu.h
>>> @@ -250,16 +250,32 @@
>>>   #define PG_ERROR_RSVD_MASK 0x08
>>>   #define PG_ERROR_I_D_MASK  0x10
>>>
>>> -#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
>>> +#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
>>> +#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
>>>
>>> -#define MCE_CAP_DEF	MCG_CTL_P
>>> +#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
>>>   #define MCE_BANKS_DEF	10
>>>
>>
>> It seems that current kvm doesn't support SER_P, so injecting SRAO
>> to guest will mean that guest receives VAL|UC|!PCC and RIPV event
>> from virtual processor that doesn't have SER_P.
>
> Dean also noted this. I don't think it was deliberate choice to not
> expose SER_P. Huang?

In my testing, I found that MCG_SER_P was not being set (and I was
running on a Nehalem-EX system). Injecting an MCE resulted in the
guest entering into panic() from mce_panic(). If crash_kexec()
finds a kexec_crash_image the system ends up rebooting, otherwise,
what happens next requires operator intervention.

When I applied a patch to the guest's kernel which forces mce_ser to be
set, as if MCG_SER_P was set (see __mcheck_cpu_cap_init()), I found
that when the memory page was 'owned' by a guest process, the process
would be killed (if the page was dirty), and the guest would stay
running. The HWPoisoned page would be sidelined and not cause any more
issues.

>> I think most OSes don't expect that it can receives MCE with !PCC
>> on traditional x86 processor without SER_P.
>>
>> Q1: Is it safe to expect that guests can handle such !PCC event?

This might be best answered by Huang, but as I mentioned above, without
MCG_SER_P being set, the result was an orderly system panic on the
guest.

>> Q2: What is the expected behavior on the guest?

I think I answered this above.

>> Q3: What happen if guest reboots itself in response to the MCE?

That depends...

And the following issue also holds for a guest that is rebooted at
some point having successfully sidelined the bad page.

After the guest has panic'd, a system_reset of the guest or a restart
initiated by crash_kexec() (called by panic() on the guest), usually
results in the guest hanging because the bad page still belongs
to qemu-kvm and is now being referenced by the new guest in some way.
(It actually may not hang, but successfully reboot and be runnable,
with the bad page lurking in the background. It all seems to depend on
where the bad page ends up, and whether it's ever referenced.)

I believe there was an attempt to deal with this in kvm on the host.
See kvm_handle_bad_page(). This function was supposed to result in the
sending of a BUS_MCEERR_AR flavored SIGBUS by do_sigbus() to qemu-kvm
which in theory would result in the right thing happening. But commit
96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the signal from being
sent. So this mechanism needs to be re-worked, and the issue remains.

I would think that if the bad page can't be sidelined, such that
the newly booting guest can't use it, then the new guest shouldn't be
allowed to boot. But perhaps there is some merit in letting it try to
boot and see if one gets 'lucky'.

I understand that Huang is looking into what should be done. He can
give you better information than I in answer to your questions.

Dean

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-06 18:10         ` Dean Nelson
  0 siblings, 0 replies; 93+ messages in thread
From: Dean Nelson @ 2010-10-06 18:10 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Marcelo Tosatti, qemu-devel, kvm, Huang Ying

On 10/06/2010 11:05 AM, Marcelo Tosatti wrote:
> On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
>> I got some more question:
>>
>> (2010/10/05 3:54), Marcelo Tosatti wrote:
>>> Index: qemu/target-i386/cpu.h
>>> ===================================================================
>>> --- qemu.orig/target-i386/cpu.h
>>> +++ qemu/target-i386/cpu.h
>>> @@ -250,16 +250,32 @@
>>>   #define PG_ERROR_RSVD_MASK 0x08
>>>   #define PG_ERROR_I_D_MASK  0x10
>>>
>>> -#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
>>> +#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
>>> +#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
>>>
>>> -#define MCE_CAP_DEF	MCG_CTL_P
>>> +#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
>>>   #define MCE_BANKS_DEF	10
>>>
>>
>> It seems that current kvm doesn't support SER_P, so injecting SRAO
>> to guest will mean that guest receives VAL|UC|!PCC and RIPV event
>> from virtual processor that doesn't have SER_P.
>
> Dean also noted this. I don't think it was deliberate choice to not
> expose SER_P. Huang?

In my testing, I found that MCG_SER_P was not being set (and I was
running on a Nehalem-EX system). Injecting an MCE resulted in the
guest entering into panic() from mce_panic(). If crash_kexec()
finds a kexec_crash_image the system ends up rebooting, otherwise,
what happens next requires operator intervention.

When I applied a patch to the guest's kernel which forces mce_ser to be
set, as if MCG_SER_P was set (see __mcheck_cpu_cap_init()), I found
that when the memory page was 'owned' by a guest process, the process
would be killed (if the page was dirty), and the guest would stay
running. The HWPoisoned page would be sidelined and not cause any more
issues.

>> I think most OSes don't expect that it can receives MCE with !PCC
>> on traditional x86 processor without SER_P.
>>
>> Q1: Is it safe to expect that guests can handle such !PCC event?

This might be best answered by Huang, but as I mentioned above, without
MCG_SER_P being set, the result was an orderly system panic on the
guest.

>> Q2: What is the expected behavior on the guest?

I think I answered this above.

>> Q3: What happen if guest reboots itself in response to the MCE?

That depends...

And the following issue also holds for a guest that is rebooted at
some point having successfully sidelined the bad page.

After the guest has panic'd, a system_reset of the guest or a restart
initiated by crash_kexec() (called by panic() on the guest), usually
results in the guest hanging because the bad page still belongs
to qemu-kvm and is now being referenced by the new guest in some way.
(It actually may not hang, but successfully reboot and be runnable,
with the bad page lurking in the background. It all seems to depend on
where the bad page ends up, and whether it's ever referenced.)

I believe there was an attempt to deal with this in kvm on the host.
See kvm_handle_bad_page(). This function was supposed to result in the
sending of a BUS_MCEERR_AR flavored SIGBUS by do_sigbus() to qemu-kvm
which in theory would result in the right thing happening. But commit
96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the signal from being
sent. So this mechanism needs to be re-worked, and the issue remains.

I would think that if the bad page can't be sidelined, such that
the newly booting guest can't use it, then the new guest shouldn't be
allowed to boot. But perhaps there is some merit in letting it try to
boot and see if one gets 'lucky'.

I understand that Huang is looking into what should be done. He can
give you better information than I in answer to your questions.

Dean

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 4/8] kvm: x86: add mce support
  2010-10-06 17:34     ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-06 19:32       ` Anthony Liguori
  -1 siblings, 0 replies; 93+ messages in thread
From: Anthony Liguori @ 2010-10-06 19:32 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, qemu-devel, Huang Ying, Dean Nelson

On 10/06/2010 12:34 PM, Marcelo Tosatti wrote:
> Port qemu-kvm's MCE support
>
> commit c68b2374c9048812f488e00ffb95db66c0bc07a7
> Author: Huang Ying<ying.huang@intel.com>
> Date:   Mon Jul 20 10:00:53 2009 +0800
>
>      Add MCE simulation support to qemu/kvm
>
>      KVM ioctls are used to initialize MCE simulation and inject MCE. The
>      real MCE simulation is implemented in Linux kernel. The Kernel part
>      has been merged.
>
> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>
> Index: qemu/target-i386/helper.c
> ===================================================================
> --- qemu.orig/target-i386/helper.c
> +++ qemu/target-i386/helper.c
> @@ -27,6 +27,7 @@
>   #include "exec-all.h"
>   #include "qemu-common.h"
>   #include "kvm.h"
> +#include "kvm_x86.h"
>
>   //#define DEBUG_MMU
>
> @@ -1030,6 +1031,11 @@ void cpu_inject_x86_mce(CPUState *cenv,
>       if (bank>= bank_num || !(status&  MCI_STATUS_VAL))
>           return;
>
> +    if (kvm_enabled()) {
> +        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
> +        return;
> +    }
> +
>       /*
>        * if MSR_MCG_CTL is not all 1s, the uncorrected error
>        * reporting is disabled
> Index: qemu/target-i386/kvm.c
> ===================================================================
> --- qemu.orig/target-i386/kvm.c
> +++ qemu/target-i386/kvm.c
> @@ -27,6 +27,7 @@
>   #include "hw/pc.h"
>   #include "hw/apic.h"
>   #include "ioport.h"
> +#include "kvm_x86.h"
>
>   #ifdef CONFIG_KVM_PARA
>   #include<linux/kvm_para.h>
> @@ -167,6 +168,67 @@ static int get_para_features(CPUState *e
>   }
>   #endif
>
> +#ifdef KVM_CAP_MCE
> +static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
> +                                     int *max_banks)
> +{
> +    int r;
> +
> +    r = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_MCE);
> +    if (r>  0) {
> +        *max_banks = r;
> +        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
> +    }
> +    return -ENOSYS;
> +}
> +
> +static int kvm_setup_mce(CPUState *env, uint64_t *mcg_cap)
> +{
> +    return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
> +}
> +
> +static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
> +{
> +    return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
> +}
> +
> +struct kvm_x86_mce_data
> +{
> +    CPUState *env;
> +    struct kvm_x86_mce *mce;
> +};
> +
> +static void kvm_do_inject_x86_mce(void *_data)
> +{
> +    struct kvm_x86_mce_data *data = _data;
> +    int r;
> +
> +    r = kvm_set_mce(data->env, data->mce);
> +    if (r<  0)
> +        perror("kvm_set_mce FAILED");
> +}
> +#endif
> +
> +void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
> +                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
> +{
> +#ifdef KVM_CAP_MCE
> +    struct kvm_x86_mce mce = {
> +        .bank = bank,
> +        .status = status,
> +        .mcg_status = mcg_status,
> +        .addr = addr,
> +        .misc = misc,
> +    };
> +    struct kvm_x86_mce_data data = {
> +            .env = cenv,
> +            .mce =&mce,
> +    };
> +
> +    run_on_cpu(cenv, kvm_do_inject_x86_mce,&data);
> +#endif
> +}
> +
>   int kvm_arch_init_vcpu(CPUState *env)
>   {
>       struct {
> @@ -274,6 +336,28 @@ int kvm_arch_init_vcpu(CPUState *env)
>
>       cpuid_data.cpuid.nent = cpuid_i;
>
> +#ifdef KVM_CAP_MCE
> +    if (((env->cpuid_version>>  8)&0xF)>= 6
> +&&  (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
> +&&  kvm_check_extension(env->kvm_state, KVM_CAP_MCE)>  0) {
> +        uint64_t mcg_cap;
> +        int banks;
> +
> +        if (kvm_get_mce_cap_supported(env->kvm_state,&mcg_cap,&banks))
> +            perror("kvm_get_mce_cap_supported FAILED");
> +        else {
> +            if (banks>  MCE_BANKS_DEF)
> +                banks = MCE_BANKS_DEF;
> +            mcg_cap&= MCE_CAP_DEF;
> +            mcg_cap |= banks;
> +            if (kvm_setup_mce(env,&mcg_cap))
> +                perror("kvm_setup_mce FAILED");
> +            else
> +                env->mcg_cap = mcg_cap;
> +        }
> +    }
> +#endif
> +
>       return kvm_vcpu_ioctl(env, KVM_SET_CPUID2,&cpuid_data);
>   }
>
> Index: qemu/target-i386/kvm_x86.h
> ===================================================================
> --- /dev/null
> +++ qemu/target-i386/kvm_x86.h
> @@ -0,0 +1,21 @@
> +/*
> + * QEMU KVM support
> + *
> + * Copyright (C) 2009 Red Hat Inc.
> + * Copyright IBM, Corp. 2008
> + *
> + * Authors:
> + *  Anthony Liguori<aliguori@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
>    

BTW, I'm fairly sure I didn't write any of this code so this copyright 
statement is probably bogus.

Regards,

Anthony Liguori

> +
> +#ifndef __KVM_X86_H__
> +#define __KVM_X86_H__
> +
> +void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
> +                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
> +
> +#endif
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>    


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 4/8] kvm: x86: add mce support
@ 2010-10-06 19:32       ` Anthony Liguori
  0 siblings, 0 replies; 93+ messages in thread
From: Anthony Liguori @ 2010-10-06 19:32 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Dean Nelson, qemu-devel, kvm, Huang Ying

On 10/06/2010 12:34 PM, Marcelo Tosatti wrote:
> Port qemu-kvm's MCE support
>
> commit c68b2374c9048812f488e00ffb95db66c0bc07a7
> Author: Huang Ying<ying.huang@intel.com>
> Date:   Mon Jul 20 10:00:53 2009 +0800
>
>      Add MCE simulation support to qemu/kvm
>
>      KVM ioctls are used to initialize MCE simulation and inject MCE. The
>      real MCE simulation is implemented in Linux kernel. The Kernel part
>      has been merged.
>
> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>
> Index: qemu/target-i386/helper.c
> ===================================================================
> --- qemu.orig/target-i386/helper.c
> +++ qemu/target-i386/helper.c
> @@ -27,6 +27,7 @@
>   #include "exec-all.h"
>   #include "qemu-common.h"
>   #include "kvm.h"
> +#include "kvm_x86.h"
>
>   //#define DEBUG_MMU
>
> @@ -1030,6 +1031,11 @@ void cpu_inject_x86_mce(CPUState *cenv,
>       if (bank>= bank_num || !(status&  MCI_STATUS_VAL))
>           return;
>
> +    if (kvm_enabled()) {
> +        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
> +        return;
> +    }
> +
>       /*
>        * if MSR_MCG_CTL is not all 1s, the uncorrected error
>        * reporting is disabled
> Index: qemu/target-i386/kvm.c
> ===================================================================
> --- qemu.orig/target-i386/kvm.c
> +++ qemu/target-i386/kvm.c
> @@ -27,6 +27,7 @@
>   #include "hw/pc.h"
>   #include "hw/apic.h"
>   #include "ioport.h"
> +#include "kvm_x86.h"
>
>   #ifdef CONFIG_KVM_PARA
>   #include<linux/kvm_para.h>
> @@ -167,6 +168,67 @@ static int get_para_features(CPUState *e
>   }
>   #endif
>
> +#ifdef KVM_CAP_MCE
> +static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
> +                                     int *max_banks)
> +{
> +    int r;
> +
> +    r = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_MCE);
> +    if (r>  0) {
> +        *max_banks = r;
> +        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
> +    }
> +    return -ENOSYS;
> +}
> +
> +static int kvm_setup_mce(CPUState *env, uint64_t *mcg_cap)
> +{
> +    return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
> +}
> +
> +static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
> +{
> +    return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
> +}
> +
> +struct kvm_x86_mce_data
> +{
> +    CPUState *env;
> +    struct kvm_x86_mce *mce;
> +};
> +
> +static void kvm_do_inject_x86_mce(void *_data)
> +{
> +    struct kvm_x86_mce_data *data = _data;
> +    int r;
> +
> +    r = kvm_set_mce(data->env, data->mce);
> +    if (r<  0)
> +        perror("kvm_set_mce FAILED");
> +}
> +#endif
> +
> +void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
> +                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
> +{
> +#ifdef KVM_CAP_MCE
> +    struct kvm_x86_mce mce = {
> +        .bank = bank,
> +        .status = status,
> +        .mcg_status = mcg_status,
> +        .addr = addr,
> +        .misc = misc,
> +    };
> +    struct kvm_x86_mce_data data = {
> +            .env = cenv,
> +            .mce =&mce,
> +    };
> +
> +    run_on_cpu(cenv, kvm_do_inject_x86_mce,&data);
> +#endif
> +}
> +
>   int kvm_arch_init_vcpu(CPUState *env)
>   {
>       struct {
> @@ -274,6 +336,28 @@ int kvm_arch_init_vcpu(CPUState *env)
>
>       cpuid_data.cpuid.nent = cpuid_i;
>
> +#ifdef KVM_CAP_MCE
> +    if (((env->cpuid_version>>  8)&0xF)>= 6
> +&&  (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
> +&&  kvm_check_extension(env->kvm_state, KVM_CAP_MCE)>  0) {
> +        uint64_t mcg_cap;
> +        int banks;
> +
> +        if (kvm_get_mce_cap_supported(env->kvm_state,&mcg_cap,&banks))
> +            perror("kvm_get_mce_cap_supported FAILED");
> +        else {
> +            if (banks>  MCE_BANKS_DEF)
> +                banks = MCE_BANKS_DEF;
> +            mcg_cap&= MCE_CAP_DEF;
> +            mcg_cap |= banks;
> +            if (kvm_setup_mce(env,&mcg_cap))
> +                perror("kvm_setup_mce FAILED");
> +            else
> +                env->mcg_cap = mcg_cap;
> +        }
> +    }
> +#endif
> +
>       return kvm_vcpu_ioctl(env, KVM_SET_CPUID2,&cpuid_data);
>   }
>
> Index: qemu/target-i386/kvm_x86.h
> ===================================================================
> --- /dev/null
> +++ qemu/target-i386/kvm_x86.h
> @@ -0,0 +1,21 @@
> +/*
> + * QEMU KVM support
> + *
> + * Copyright (C) 2009 Red Hat Inc.
> + * Copyright IBM, Corp. 2008
> + *
> + * Authors:
> + *  Anthony Liguori<aliguori@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
>    

BTW, I'm fairly sure I didn't write any of this code so this copyright 
statement is probably bogus.

Regards,

Anthony Liguori

> +
> +#ifndef __KVM_X86_H__
> +#define __KVM_X86_H__
> +
> +void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
> +                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
> +
> +#endif
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>    

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-06 18:10         ` [Qemu-devel] " Dean Nelson
@ 2010-10-07  3:41           ` Hidetoshi Seto
  -1 siblings, 0 replies; 93+ messages in thread
From: Hidetoshi Seto @ 2010-10-07  3:41 UTC (permalink / raw)
  To: Dean Nelson; +Cc: Marcelo Tosatti, kvm, qemu-devel, Huang Ying

(2010/10/07 3:10), Dean Nelson wrote:
> On 10/06/2010 11:05 AM, Marcelo Tosatti wrote:
>> On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
>>> I got some more question:
>>>
>>> (2010/10/05 3:54), Marcelo Tosatti wrote:
>>>> Index: qemu/target-i386/cpu.h
>>>> ===================================================================
>>>> --- qemu.orig/target-i386/cpu.h
>>>> +++ qemu/target-i386/cpu.h
>>>> @@ -250,16 +250,32 @@
>>>>   #define PG_ERROR_RSVD_MASK 0x08
>>>>   #define PG_ERROR_I_D_MASK  0x10
>>>>
>>>> -#define MCG_CTL_P    (1UL<<8)   /* MCG_CAP register available */
>>>> +#define MCG_CTL_P    (1ULL<<8)   /* MCG_CAP register available */
>>>> +#define MCG_SER_P    (1ULL<<24) /* MCA recovery/new status bits */
>>>>
>>>> -#define MCE_CAP_DEF    MCG_CTL_P
>>>> +#define MCE_CAP_DEF    (MCG_CTL_P|MCG_SER_P)
>>>>   #define MCE_BANKS_DEF    10
>>>>
>>>
>>> It seems that current kvm doesn't support SER_P, so injecting SRAO
>>> to guest will mean that guest receives VAL|UC|!PCC and RIPV event
>>> from virtual processor that doesn't have SER_P.
>>
>> Dean also noted this. I don't think it was deliberate choice to not
>> expose SER_P. Huang?
> 
> In my testing, I found that MCG_SER_P was not being set (and I was
> running on a Nehalem-EX system). Injecting a MCE resulted in the
> guest entering into panic() from mce_panic(). If crash_kexec()
> finds a kexec_crash_image the system ends up rebooting, otherwise,
> what happens next requires operator intervention.

Good to know.
What I'm concerned about is that if a memory scrubbing SRAO event is
injected when !SER_P, a linux guest with a certain mce tolerant level
might grade it as "UC" severity and continue running without any
panicking, killing or poisoning, because of !PCC and RIPV.

Could you provide the panic message of the guest in your test?
I think it can tell me why the mce handler decided to go panic.

> When I applied a patch to the guest's kernel which forces mce_ser to be
> set, as if MCG_SER_P was set (see __mcheck_cpu_cap_init()), I found
> that when the memory page was 'owned' by a guest process, the process
> would be killed (if the page was dirty), and the guest would stay
> running. The HWPoisoned page would be sidelined and not cause any more
> issues.

Excellent.
So while guest kernel knows which page is poisoned, guest processes
are controlled not to touch the page.

... Therefore rebooting the vm and renewing the kernel will lose the
information about which page is poisoned.

>>> I think most OSes don't expect that it can receives MCE with !PCC
>>> on traditional x86 processor without SER_P.
>>>
>>> Q1: Is it safe to expect that guests can handle such !PCC event?
> 
> This might be best answered by Huang, but as I mentioned above, without
> MCG_SER_P being set, the result was an orderly system panic on the
> guest.

Though I'll wait for Huang (I think he is on holiday), I believe that
a system panic is just one possible option for an AO (Action Optional)
event, regardless of SER_P.

>>> Q2: What is the expected behavior on the guest?
> 
> I think I answered this above.

Yeah, thanks.

> 
>>> Q3: What happen if guest reboots itself in response to the MCE?
> 
> That depends...
> 
> And the following issue also holds for a guest that is rebooted at
> some point having successfully sidelined the bad page.
> 
> After the guest has panic'd, a system_reset of the guest or a restart
> initiated by crash_kexec() (called by panic() on the guest), usually
> results in the guest hanging because the bad page still belongs
> to qemu-kvm and is now being referenced by the new guest in some way.

Yes. In other words my concern about reboot is that new guest kernel
including kdump kernel might try to read the bad page.  If there is
no AR-SIGBUS etc., we need some tricks to inhibit such accesses.

> (It actually may not hang, but successfully reboot and be runnable,
> with the bad page lurking in the background. It all seems to depend on
> where the bad page ends up, and whether it's ever referenced.)

I know some tough guys using their PC with buggy DIMMs :-)

> 
> I believe there was an attempt to deal with this in kvm on the host.
> See kvm_handle_bad_page(). This function was suppose to result in the
> sending of a BUS_MCEERR_AR flavored SIGBUS by do_sigbus() to qemu-kvm
> which in theory would result in the right thing happening. But commit
> 96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the signal from being
> sent. So this mechanism needs to be re-worked, and the issue remains.

Definitely.
I guess Huang has some plan or hint for reworking this point.

> 
> I would think that if the the bad page can't be sidelined, such that
> the newly booting guest can't use it, then the new guest shouldn't be
> allowed to boot. But perhaps there is some merit in letting it try to
> boot and see if one gets 'lucky'.

In case of booting a real machine in real world, hardware and firmware
usually (or often) do self-test before passing control to OS.
Some platforms can boot the OS with a degraded configuration (for example,
less memory) if one of their components has trouble.  Some BIOS may
stop booting and show messages like "please reseat [component]" on the
screen.  So we could implement/request qemu to have such mechanism.

I can understand the merit you mentioned here, in some degree. But I
think it is hard to say "unlucky" to customer in business...

> 
> I understand that Huang is looking into what should be done. He can
> give you better information than I in answer to your questions.

Agreed. Thank you very much!


Thanks,
H.Seto


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-07  3:41           ` Hidetoshi Seto
  0 siblings, 0 replies; 93+ messages in thread
From: Hidetoshi Seto @ 2010-10-07  3:41 UTC (permalink / raw)
  To: Dean Nelson; +Cc: Marcelo Tosatti, qemu-devel, kvm, Huang Ying

(2010/10/07 3:10), Dean Nelson wrote:
> On 10/06/2010 11:05 AM, Marcelo Tosatti wrote:
>> On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
>>> I got some more question:
>>>
>>> (2010/10/05 3:54), Marcelo Tosatti wrote:
>>>> Index: qemu/target-i386/cpu.h
>>>> ===================================================================
>>>> --- qemu.orig/target-i386/cpu.h
>>>> +++ qemu/target-i386/cpu.h
>>>> @@ -250,16 +250,32 @@
>>>>   #define PG_ERROR_RSVD_MASK 0x08
>>>>   #define PG_ERROR_I_D_MASK  0x10
>>>>
>>>> -#define MCG_CTL_P    (1UL<<8)   /* MCG_CAP register available */
>>>> +#define MCG_CTL_P    (1ULL<<8)   /* MCG_CAP register available */
>>>> +#define MCG_SER_P    (1ULL<<24) /* MCA recovery/new status bits */
>>>>
>>>> -#define MCE_CAP_DEF    MCG_CTL_P
>>>> +#define MCE_CAP_DEF    (MCG_CTL_P|MCG_SER_P)
>>>>   #define MCE_BANKS_DEF    10
>>>>
>>>
>>> It seems that current kvm doesn't support SER_P, so injecting SRAO
>>> to guest will mean that guest receives VAL|UC|!PCC and RIPV event
>>> from virtual processor that doesn't have SER_P.
>>
>> Dean also noted this. I don't think it was deliberate choice to not
>> expose SER_P. Huang?
> 
> In my testing, I found that MCG_SER_P was not being set (and I was
> running on a Nehalem-EX system). Injecting a MCE resulted in the
> guest entering into panic() from mce_panic(). If crash_kexec()
> finds a kexec_crash_image the system ends up rebooting, otherwise,
> what happens next requires operator intervention.

Good to know.
What I'm concerned about is that if a memory scrubbing SRAO event is
injected when !SER_P, a linux guest with a certain mce tolerant level
might grade it as "UC" severity and continue running without any
panicking, killing or poisoning, because of !PCC and RIPV.

Could you provide the panic message of the guest in your test?
I think it can tell me why the mce handler decided to go panic.

> When I applied a patch to the guest's kernel which forces mce_ser to be
> set, as if MCG_SER_P was set (see __mcheck_cpu_cap_init()), I found
> that when the memory page was 'owned' by a guest process, the process
> would be killed (if the page was dirty), and the guest would stay
> running. The HWPoisoned page would be sidelined and not cause any more
> issues.

Excellent.
So while guest kernel knows which page is poisoned, guest processes
are controlled not to touch the page.

... Therefore rebooting the vm and renewing the kernel will lose the
information about which page is poisoned.

>>> I think most OSes don't expect that it can receives MCE with !PCC
>>> on traditional x86 processor without SER_P.
>>>
>>> Q1: Is it safe to expect that guests can handle such !PCC event?
> 
> This might be best answered by Huang, but as I mentioned above, without
> MCG_SER_P being set, the result was an orderly system panic on the
> guest.

Though I'll wait for Huang (I think he is on holiday), I believe that
a system panic is just one possible option for an AO (Action Optional)
event, regardless of SER_P.

>>> Q2: What is the expected behavior on the guest?
> 
> I think I answered this above.

Yeah, thanks.

> 
>>> Q3: What happen if guest reboots itself in response to the MCE?
> 
> That depends...
> 
> And the following issue also holds for a guest that is rebooted at
> some point having successfully sidelined the bad page.
> 
> After the guest has panic'd, a system_reset of the guest or a restart
> initiated by crash_kexec() (called by panic() on the guest), usually
> results in the guest hanging because the bad page still belongs
> to qemu-kvm and is now being referenced by the new guest in some way.

Yes. In other words my concern about reboot is that new guest kernel
including kdump kernel might try to read the bad page.  If there is
no AR-SIGBUS etc., we need some tricks to inhibit such accesses.

> (It actually may not hang, but successfully reboot and be runnable,
> with the bad page lurking in the background. It all seems to depend on
> where the bad page ends up, and whether it's ever referenced.)

I know some tough guys using their PC with buggy DIMMs :-)

> 
> I believe there was an attempt to deal with this in kvm on the host.
> See kvm_handle_bad_page(). This function was suppose to result in the
> sending of a BUS_MCEERR_AR flavored SIGBUS by do_sigbus() to qemu-kvm
> which in theory would result in the right thing happening. But commit
> 96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the signal from being
> sent. So this mechanism needs to be re-worked, and the issue remains.

Definitely.
I guess Huang has some plan or hint for reworking this point.

> 
> I would think that if the the bad page can't be sidelined, such that
> the newly booting guest can't use it, then the new guest shouldn't be
> allowed to boot. But perhaps there is some merit in letting it try to
> boot and see if one gets 'lucky'.

In case of booting a real machine in real world, hardware and firmware
usually (or often) do self-test before passing control to OS.
Some platforms can boot the OS with a degraded configuration (for example,
less memory) if one of their components has trouble.  Some BIOS may
stop booting and show messages like "please reseat [component]" on the
screen.  So we could implement/request qemu to have such mechanism.

I can understand the merit you mentioned here, in some degree. But I
think it is hard to say "unlucky" to customer in business...

> 
> I understand that Huang is looking into what should be done. He can
> give you better information than I in answer to your questions.

Agreed. Thank you very much!


Thanks,
H.Seto

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-07  3:41           ` [Qemu-devel] " Hidetoshi Seto
@ 2010-10-07 15:23             ` Dean Nelson
  -1 siblings, 0 replies; 93+ messages in thread
From: Dean Nelson @ 2010-10-07 15:23 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Marcelo Tosatti, kvm, qemu-devel, Huang Ying

On 10/06/2010 10:41 PM, Hidetoshi Seto wrote:
> (2010/10/07 3:10), Dean Nelson wrote:
>> On 10/06/2010 11:05 AM, Marcelo Tosatti wrote:
>>> On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
>>>> I got some more question:
>>>>
>>>> (2010/10/05 3:54), Marcelo Tosatti wrote:
>>>>> Index: qemu/target-i386/cpu.h
>>>>> ===================================================================
>>>>> --- qemu.orig/target-i386/cpu.h
>>>>> +++ qemu/target-i386/cpu.h
>>>>> @@ -250,16 +250,32 @@
>>>>>    #define PG_ERROR_RSVD_MASK 0x08
>>>>>    #define PG_ERROR_I_D_MASK  0x10
>>>>>
>>>>> -#define MCG_CTL_P    (1UL<<8)   /* MCG_CAP register available */
>>>>> +#define MCG_CTL_P    (1ULL<<8)   /* MCG_CAP register available */
>>>>> +#define MCG_SER_P    (1ULL<<24) /* MCA recovery/new status bits */
>>>>>
>>>>> -#define MCE_CAP_DEF    MCG_CTL_P
>>>>> +#define MCE_CAP_DEF    (MCG_CTL_P|MCG_SER_P)
>>>>>    #define MCE_BANKS_DEF    10
>>>>>
>>>>
>>>> It seems that current kvm doesn't support SER_P, so injecting SRAO
>>>> to guest will mean that guest receives VAL|UC|!PCC and RIPV event
>>>> from virtual processor that doesn't have SER_P.
>>>
>>> Dean also noted this. I don't think it was deliberate choice to not
>>> expose SER_P. Huang?
>>
>> In my testing, I found that MCG_SER_P was not being set (and I was
>> running on a Nehalem-EX system). Injecting a MCE resulted in the
>> guest entering into panic() from mce_panic(). If crash_kexec()
>> finds a kexec_crash_image the system ends up rebooting, otherwise,
>> what happens next requires operator intervention.
>
> Good to know.
> What I'm concerning is that if memory scrubbing SRAO event is
> injected when !SER_P, linux guest with certain mce tolerant level
> might grade it as "UC" severity and continue running with none of
> panicking, killing and poisoning because of !PCC and RIPV.
>
> Could you provide the panic message of the guest in your test?
> I think it can tell me why the mce handler decided to go panic.

Sure, I'll add the info below at the end of this email.


>> When I applied a patch to the guest's kernel which forces mce_ser to be
>> set, as if MCG_SER_P was set (see __mcheck_cpu_cap_init()), I found
>> that when the memory page was 'owned' by a guest process, the process
>> would be killed (if the page was dirty), and the guest would stay
>> running. The HWPoisoned page would be sidelined and not cause any more
>> issues.
>
> Excellent.
> So while guest kernel knows which page is poisoned, guest processes
> are controlled not to touch the page.
>
> ... Therefore rebooting the vm and renewing kernel will lost the
> information where is poisoned.

Correct.


>>>> I think most OSes don't expect that it can receives MCE with !PCC
>>>> on traditional x86 processor without SER_P.
>>>>
>>>> Q1: Is it safe to expect that guests can handle such !PCC event?
>>
>> This might be best answered by Huang, but as I mentioned above, without
>> MCG_SER_P being set, the result was an orderly system panic on the
>> guest.
>
> Though I'll wait Huang (I think he is on holiday), I believe that
> system panic is just a possible option for AO (Action Optional)
> event, no matter how the SER_P is.

I think you may be correct, but Huang will know for sure.


>>>> Q2: What is the expected behavior on the guest?
>>
>> I think I answered this above.
>
> Yeah, thanks.
>
>>
>>>> Q3: What happen if guest reboots itself in response to the MCE?
>>
>> That depends...
>>
>> And the following issue also holds for a guest that is rebooted at
>> some point having successfully sidelined the bad page.
>>
>> After the guest has panic'd, a system_reset of the guest or a restart
>> initiated by crash_kexec() (called by panic() on the guest), usually
>> results in the guest hanging because the bad page still belongs
>> to qemu-kvm and is now being referenced by the new guest in some way.
>
> Yes. In other words my concern about reboot is that new guest kernel
> including kdump kernel might try to read the bad page.  If there is
> no AR-SIGBUS etc., we need some tricks to inhibit such accesses.

Agreed.


>> (It actually may not hang, but successfully reboot and be runnable,
>> with the bad page lurking in the background. It all seems to depend on
>> where the bad page ends up, and whether it's ever referenced.)
>
> I know some tough guys using their PC with buggy DIMMs :-)
>
>>
>> I believe there was an attempt to deal with this in kvm on the host.
>> See kvm_handle_bad_page(). This function was suppose to result in the
>> sending of a BUS_MCEERR_AR flavored SIGBUS by do_sigbus() to qemu-kvm
>> which in theory would result in the right thing happening. But commit
>> 96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the signal from being
>> sent. So this mechanism needs to be re-worked, and the issue remains.
>
> Definitely.
> I guess Huang has some plan or hint for rework this point.

Yeah, as far as I know Huang is looking into this.


>> I would think that if the the bad page can't be sidelined, such that
>> the newly booting guest can't use it, then the new guest shouldn't be
>> allowed to boot. But perhaps there is some merit in letting it try to
>> boot and see if one gets 'lucky'.
>
> In case of booting a real machine in real world, hardware and firmware
> usually (or often) do self-test before passing control to OS.
> Some platform can boot OS with degraded configuration (for example,
> fewer memory) if it has trouble on its component.  Some BIOS may
> stop booting and show messages like "please reseat [component]" on the
> screen.  So we could implement/request qemu to have such mechanism.
>
> I can understand the merit you mentioned here, in some degree. But I
> think it is hard to say "unlucky" to customer in business...

I totally agree.


>> I understand that Huang is looking into what should be done. He can
>> give you better information than I in answer to your questions.
>
> Agreed. Thank you very much!

You're welcome.

Dean

> Thanks,
> H.Seto


::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

The test I'm running is the mce-test suite's kvm test. A portion of
the messages it outputted (to stdout) follows:

> Guest physical address is 0x71220000
> Host virtual address is 7f9dc5020
> Host physical address is 0x1051620000
> Guest physical klog address is 0x71220

And it called mce-inject with the following data file:

> [root@intel-s3e36-02 test]# cat SRAO
> CPU 0 BANK 2
> STATUS UNCORRECTED SRAO 0x17a
> MCGSTATUS MCIP RIPV
> MISC 0x8c
> ADDR 0x1051620000
> [root@intel-s3e36-02 test]#

The following is from the host's /var/log/messages:

> Oct  7 09:42:48 intel-s3e36-02 kernel: Triggering MCE exception on CPU 0
> Oct  7 09:42:48 intel-s3e36-02 kernel: Machine check events logged
> Oct  7 09:42:48 intel-s3e36-02 kernel: MCE exception done on CPU 0
> Oct  7 09:42:48 intel-s3e36-02 kernel: MCE 0x1051620: Killing qemu-system-x86:6867 early due to hardware memory corruption
> Oct  7 09:42:48 intel-s3e36-02 kernel: MCE 0x1051620: dirty LRU page recovery: Recovered

Lastly, the following is a screen grab from the guest's serial console:

> HARDWARE ERROR
> CPU 0: Machine Check Exception:                5 Bank 9: bd000000000000c0
> RIP !INEXACT! 33:<0000000000400428>
> TSC 17a67acd14 ADDR 71220000 MISC 8c
> PROCESSOR 0:6d3 TIME 1286458966 SOCKET 0 APIC 0
> No human readable MCE decoding support on this CPU type.
> Run the message through 'mcelog --ascii' to decode.
> This is not a software problem!
> Machine check: Uncorrected
> Kernel panic - not syncing: Fatal machine check on current CPU
> Pid:1493, comm: simple_process Tainted: B   M        ----------------  2.6.32.dnelson_test #48
>
> Call Trace:
>  <#MC>  [<ffffffff814c7c8d>] panic+0x78/0x137
>  [<ffffffff81027382>] mce_panic+0x1e2/0x210
>  [<ffffffff81028873>] do_machine_check+0x843/0xa70
>  [<ffffffff814cb0cc>] machine_check+0x1c/0x30
>  <<EOE>>



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-07 15:23             ` Dean Nelson
  0 siblings, 0 replies; 93+ messages in thread
From: Dean Nelson @ 2010-10-07 15:23 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Marcelo Tosatti, qemu-devel, kvm, Huang Ying

On 10/06/2010 10:41 PM, Hidetoshi Seto wrote:
> (2010/10/07 3:10), Dean Nelson wrote:
>> On 10/06/2010 11:05 AM, Marcelo Tosatti wrote:
>>> On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
>>>> I got some more question:
>>>>
>>>> (2010/10/05 3:54), Marcelo Tosatti wrote:
>>>>> Index: qemu/target-i386/cpu.h
>>>>> ===================================================================
>>>>> --- qemu.orig/target-i386/cpu.h
>>>>> +++ qemu/target-i386/cpu.h
>>>>> @@ -250,16 +250,32 @@
>>>>>    #define PG_ERROR_RSVD_MASK 0x08
>>>>>    #define PG_ERROR_I_D_MASK  0x10
>>>>>
>>>>> -#define MCG_CTL_P    (1UL<<8)   /* MCG_CAP register available */
>>>>> +#define MCG_CTL_P    (1ULL<<8)   /* MCG_CAP register available */
>>>>> +#define MCG_SER_P    (1ULL<<24) /* MCA recovery/new status bits */
>>>>>
>>>>> -#define MCE_CAP_DEF    MCG_CTL_P
>>>>> +#define MCE_CAP_DEF    (MCG_CTL_P|MCG_SER_P)
>>>>>    #define MCE_BANKS_DEF    10
>>>>>
>>>>
>>>> It seems that current kvm doesn't support SER_P, so injecting SRAO
>>>> to guest will mean that guest receives VAL|UC|!PCC and RIPV event
>>>> from virtual processor that doesn't have SER_P.
>>>
>>> Dean also noted this. I don't think it was deliberate choice to not
>>> expose SER_P. Huang?
>>
>> In my testing, I found that MCG_SER_P was not being set (and I was
>> running on a Nehalem-EX system). Injecting a MCE resulted in the
>> guest entering into panic() from mce_panic(). If crash_kexec()
>> finds a kexec_crash_image the system ends up rebooting, otherwise,
>> what happens next requires operator intervention.
>
> Good to know.
> What I'm concerning is that if memory scrubbing SRAO event is
> injected when !SER_P, linux guest with certain mce tolerant level
> might grade it as "UC" severity and continue running with none of
> panicking, killing and poisoning because of !PCC and RIPV.
>
> Could you provide the panic message of the guest in your test?
> I think it can tell me why the mce handler decided to go panic.

Sure, I'll add the info below at the end of this email.


>> When I applied a patch to the guest's kernel which forces mce_ser to be
>> set, as if MCG_SER_P was set (see __mcheck_cpu_cap_init()), I found
>> that when the memory page was 'owned' by a guest process, the process
>> would be killed (if the page was dirty), and the guest would stay
>> running. The HWPoisoned page would be sidelined and not cause any more
>> issues.
>
> Excellent.
> So while guest kernel knows which page is poisoned, guest processes
> are controlled not to touch the page.
>
> ... Therefore rebooting the vm and renewing kernel will lost the
> information where is poisoned.

Correct.


>>>> I think most OSes don't expect that it can receives MCE with !PCC
>>>> on traditional x86 processor without SER_P.
>>>>
>>>> Q1: Is it safe to expect that guests can handle such !PCC event?
>>
>> This might be best answered by Huang, but as I mentioned above, without
>> MCG_SER_P being set, the result was an orderly system panic on the
>> guest.
>
> Though I'll wait Huang (I think he is on holiday), I believe that
> system panic is just a possible option for AO (Action Optional)
> event, no matter how the SER_P is.

I think you may be correct, but Huang will know for sure.


>>>> Q2: What is the expected behavior on the guest?
>>
>> I think I answered this above.
>
> Yeah, thanks.
>
>>
>>>> Q3: What happen if guest reboots itself in response to the MCE?
>>
>> That depends...
>>
>> And the following issue also holds for a guest that is rebooted at
>> some point having successfully sidelined the bad page.
>>
>> After the guest has panic'd, a system_reset of the guest or a restart
>> initiated by crash_kexec() (called by panic() on the guest), usually
>> results in the guest hanging because the bad page still belongs
>> to qemu-kvm and is now being referenced by the new guest in some way.
>
> Yes. In other words my concern about reboot is that new guest kernel
> including kdump kernel might try to read the bad page.  If there is
> no AR-SIGBUS etc., we need some tricks to inhibit such accesses.

Agreed.


>> (It actually may not hang, but successfully reboot and be runnable,
>> with the bad page lurking in the background. It all seems to depend on
>> where the bad page ends up, and whether it's ever referenced.)
>
> I know some tough guys using their PC with buggy DIMMs :-)
>
>>
>> I believe there was an attempt to deal with this in kvm on the host.
>> See kvm_handle_bad_page(). This function was suppose to result in the
>> sending of a BUS_MCEERR_AR flavored SIGBUS by do_sigbus() to qemu-kvm
>> which in theory would result in the right thing happening. But commit
>> 96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the signal from being
>> sent. So this mechanism needs to be re-worked, and the issue remains.
>
> Definitely.
> I guess Huang has some plan or hint for rework this point.

Yeah, as far as I know Huang is looking into this.


>> I would think that if the the bad page can't be sidelined, such that
>> the newly booting guest can't use it, then the new guest shouldn't be
>> allowed to boot. But perhaps there is some merit in letting it try to
>> boot and see if one gets 'lucky'.
>
> In case of booting a real machine in real world, hardware and firmware
> usually (or often) do self-test before passing control to OS.
> Some platform can boot OS with degraded configuration (for example,
> fewer memory) if it has trouble on its component.  Some BIOS may
> stop booting and show messages like "please reseat [component]" on the
> screen.  So we could implement/request qemu to have such mechanism.
>
> I can understand the merit you mentioned here, in some degree. But I
> think it is hard to say "unlucky" to customer in business...

I totally agree.


>> I understand that Huang is looking into what should be done. He can
>> give you better information than I in answer to your questions.
>
> Agreed. Thank you very much!

You're welcome.

Dean

> Thanks,
> H.Seto


::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

The test I'm running is the mce-test suite's kvm test. A portion of
the messages it outputted (to stdout) follows:

> Guest physical address is 0x71220000
> Host virtual address is 7f9dc5020
> Host physical address is 0x1051620000
> Guest physical klog address is 0x71220

And it called mce-inject with the following data file:

> [root@intel-s3e36-02 test]# cat SRAO
> CPU 0 BANK 2
> STATUS UNCORRECTED SRAO 0x17a
> MCGSTATUS MCIP RIPV
> MISC 0x8c
> ADDR 0x1051620000
> [root@intel-s3e36-02 test]#

The following is from the host's /var/log/messages:

> Oct  7 09:42:48 intel-s3e36-02 kernel: Triggering MCE exception on CPU 0
> Oct  7 09:42:48 intel-s3e36-02 kernel: Machine check events logged
> Oct  7 09:42:48 intel-s3e36-02 kernel: MCE exception done on CPU 0
> Oct  7 09:42:48 intel-s3e36-02 kernel: MCE 0x1051620: Killing qemu-system-x86:6867 early due to hardware memory corruption
> Oct  7 09:42:48 intel-s3e36-02 kernel: MCE 0x1051620: dirty LRU page recovery: Recovered

Lastly, the following is a screen grab from the guest's serial console:

> HARDWARE ERROR
> CPU 0: Machine Check Exception:                5 Bank 9: bd000000000000c0
> RIP !INEXACT! 33:<0000000000400428>
> TSC 17a67acd14 ADDR 71220000 MISC 8c
> PROCESSOR 0:6d3 TIME 1286458966 SOCKET 0 APIC 0
> No human readable MCE decoding support on this CPU type.
> Run the message through 'mcelog --ascii' to decode.
> This is not a software problem!
> Machine check: Uncorrected
> Kernel panic - not syncing: Fatal machine check on current CPU
> Pid:1493, comm: simple_process Tainted: B   M        ----------------  2.6.32.dnelson_test #48
>
> Call Trace:
>  <#MC>  [<ffffffff814c7c8d>] panic+0x78/0x137
>  [<ffffffff81027382>] mce_panic+0x1e2/0x210
>  [<ffffffff81028873>] do_machine_check+0x843/0xa70
>  [<ffffffff814cb0cc>] machine_check+0x1c/0x30
>  <<EOE>>

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-06 16:05       ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-08  2:50         ` Huang Ying
  -1 siblings, 0 replies; 93+ messages in thread
From: Huang Ying @ 2010-10-08  2:50 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Hidetoshi Seto, kvm, qemu-devel, Dean Nelson

On Thu, 2010-10-07 at 00:05 +0800, Marcelo Tosatti wrote:
> On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
> > I got some more question:
> > 
> > (2010/10/05 3:54), Marcelo Tosatti wrote:
> > > Index: qemu/target-i386/cpu.h
> > > ===================================================================
> > > --- qemu.orig/target-i386/cpu.h
> > > +++ qemu/target-i386/cpu.h
> > > @@ -250,16 +250,32 @@
> > >  #define PG_ERROR_RSVD_MASK 0x08
> > >  #define PG_ERROR_I_D_MASK  0x10
> > >  
> > > -#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
> > > +#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
> > > +#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
> > >  
> > > -#define MCE_CAP_DEF	MCG_CTL_P
> > > +#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
> > >  #define MCE_BANKS_DEF	10
> > >  
> > 
> > It seems that current kvm doesn't support SER_P, so injecting SRAO
> > to guest will mean that guest receives VAL|UC|!PCC and RIPV event
> > from virtual processor that doesn't have SER_P.
> 
> Dean also noted this. I don't think it was deliberate choice to not
> expose SER_P. Huang?

In fact, that should be a BUG. I will fix it as soon as possible.

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-08  2:50         ` Huang Ying
  0 siblings, 0 replies; 93+ messages in thread
From: Huang Ying @ 2010-10-08  2:50 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Dean Nelson, Hidetoshi Seto, qemu-devel, kvm

On Thu, 2010-10-07 at 00:05 +0800, Marcelo Tosatti wrote:
> On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
> > I got some more question:
> > 
> > (2010/10/05 3:54), Marcelo Tosatti wrote:
> > > Index: qemu/target-i386/cpu.h
> > > ===================================================================
> > > --- qemu.orig/target-i386/cpu.h
> > > +++ qemu/target-i386/cpu.h
> > > @@ -250,16 +250,32 @@
> > >  #define PG_ERROR_RSVD_MASK 0x08
> > >  #define PG_ERROR_I_D_MASK  0x10
> > >  
> > > -#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
> > > +#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
> > > +#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
> > >  
> > > -#define MCE_CAP_DEF	MCG_CTL_P
> > > +#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
> > >  #define MCE_BANKS_DEF	10
> > >  
> > 
> > It seems that current kvm doesn't support SER_P, so injecting SRAO
> > to guest will mean that guest receives VAL|UC|!PCC and RIPV event
> > from virtual processor that doesn't have SER_P.
> 
> Dean also noted this. I don't think it was deliberate choice to not
> expose SER_P. Huang?

In fact, that should be a BUG. I will fix it as soon as possible.

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-07  3:41           ` [Qemu-devel] " Hidetoshi Seto
@ 2010-10-08  3:15             ` Huang Ying
  -1 siblings, 0 replies; 93+ messages in thread
From: Huang Ying @ 2010-10-08  3:15 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Dean Nelson, Marcelo Tosatti, kvm, qemu-devel

Hi, Seto,

On Thu, 2010-10-07 at 11:41 +0800, Hidetoshi Seto wrote:
> (2010/10/07 3:10), Dean Nelson wrote:
> > On 10/06/2010 11:05 AM, Marcelo Tosatti wrote:
> >> On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
> >>> I got some more question:
> >>>
> >>> (2010/10/05 3:54), Marcelo Tosatti wrote:
> >>>> Index: qemu/target-i386/cpu.h
> >>>> ===================================================================
> >>>> --- qemu.orig/target-i386/cpu.h
> >>>> +++ qemu/target-i386/cpu.h
> >>>> @@ -250,16 +250,32 @@
> >>>>   #define PG_ERROR_RSVD_MASK 0x08
> >>>>   #define PG_ERROR_I_D_MASK  0x10
> >>>>
> >>>> -#define MCG_CTL_P    (1UL<<8)   /* MCG_CAP register available */
> >>>> +#define MCG_CTL_P    (1ULL<<8)   /* MCG_CAP register available */
> >>>> +#define MCG_SER_P    (1ULL<<24) /* MCA recovery/new status bits */
> >>>>
> >>>> -#define MCE_CAP_DEF    MCG_CTL_P
> >>>> +#define MCE_CAP_DEF    (MCG_CTL_P|MCG_SER_P)
> >>>>   #define MCE_BANKS_DEF    10
> >>>>
> >>>
> >>> It seems that current kvm doesn't support SER_P, so injecting SRAO
> >>> to guest will mean that guest receives VAL|UC|!PCC and RIPV event
> >>> from virtual processor that doesn't have SER_P.
> >>
> >> Dean also noted this. I don't think it was deliberate choice to not
> >> expose SER_P. Huang?
> > 
> > In my testing, I found that MCG_SER_P was not being set (and I was
> > running on a Nehalem-EX system). Injecting a MCE resulted in the
> > guest entering into panic() from mce_panic(). If crash_kexec()
> > finds a kexec_crash_image the system ends up rebooting, otherwise,
> > what happens next requires operator intervention.
> 
> Good to know.
> What I'm concerning is that if memory scrubbing SRAO event is
> injected when !SER_P, linux guest with certain mce tolerant level
> might grade it as "UC" severity and continue running with none of
> panicking, killing and poisoning because of !PCC and RIPV.
> 
> Could you provide the panic message of the guest in your test?
> I think it can tell me why the mce handler decided to go panic.

It is a bug that SER_P is not in KVM_MCE_CAP_SUPPORTED in the kernel.
I will fix it as soon as possible. Also, an SRAO MCE should not be sent
when !SER_P; we should add that condition in qemu-kvm.

> > When I applied a patch to the guest's kernel which forces mce_ser to be
> > set, as if MCG_SER_P was set (see __mcheck_cpu_cap_init()), I found
> > that when the memory page was 'owned' by a guest process, the process
> > would be killed (if the page was dirty), and the guest would stay
> > running. The HWPoisoned page would be sidelined and not cause any more
> > issues.
> 
> Excellent.
> So while guest kernel knows which page is poisoned, guest processes
> are controlled not to touch the page.
> 
> ... Therefore rebooting the vm and renewing kernel will lost the
> information where is poisoned.

Yes. That is an issue. Dean suggests making qemu-kvm refuse to reboot
the guest if there is a poisoned page and ask for user intervention. I
have another idea: replace the poisoned pages with good pages on
reboot, that is, recover without user intervention.

> >>> I think most OSes don't expect that it can receives MCE with !PCC
> >>> on traditional x86 processor without SER_P.
> >>>
> >>> Q1: Is it safe to expect that guests can handle such !PCC event?
> > 
> > This might be best answered by Huang, but as I mentioned above, without
> > MCG_SER_P being set, the result was an orderly system panic on the
> > guest.
> 
> Though I'll wait Huang (I think he is on holiday), I believe that
> system panic is just a possible option for AO (Action Optional)
> event, no matter how the SER_P is.

We should fix this as I said above.

> >>> Q2: What is the expected behavior on the guest?
> > 
> > I think I answered this above.
> 
> Yeah, thanks.
> 
> > 
> >>> Q3: What happen if guest reboots itself in response to the MCE?
> > 
> > That depends...
> > 
> > And the following issue also holds for a guest that is rebooted at
> > some point having successfully sidelined the bad page.
> > 
> > After the guest has panic'd, a system_reset of the guest or a restart
> > initiated by crash_kexec() (called by panic() on the guest), usually
> > results in the guest hanging because the bad page still belongs
> > to qemu-kvm and is now being referenced by the new guest in some way.
> 
> Yes. In other words my concern about reboot is that new guest kernel
> including kdump kernel might try to read the bad page.  If there is
> no AR-SIGBUS etc., we need some tricks to inhibit such accesses.
> 
> > (It actually may not hang, but successfully reboot and be runnable,
> > with the bad page lurking in the background. It all seems to depend on
> > where the bad page ends up, and whether it's ever referenced.)
> 
> I know some tough guys using their PC with buggy DIMMs :-)
> 
> > 
> > I believe there was an attempt to deal with this in kvm on the host.
> > See kvm_handle_bad_page(). This function was suppose to result in the
> > sending of a BUS_MCEERR_AR flavored SIGBUS by do_sigbus() to qemu-kvm
> > which in theory would result in the right thing happening. But commit
> > 96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the signal from being
> > sent. So this mechanism needs to be re-worked, and the issue remains.
> 
> Definitely.
> I guess Huang has some plan or hint for rework this point.

Yes. This should be fixed. The SRAR SIGBUS should be sent directly
instead of being sent via touching poisoned virtual address.
 
> > I would think that if the the bad page can't be sidelined, such that
> > the newly booting guest can't use it, then the new guest shouldn't be
> > allowed to boot. But perhaps there is some merit in letting it try to
> > boot and see if one gets 'lucky'.
> 
> In case of booting a real machine in real world, hardware and firmware
> usually (or often) do self-test before passing control to OS.
> Some platform can boot OS with degraded configuration (for example,
> fewer memory) if it has trouble on its component.  Some BIOS may
> stop booting and show messages like "please reseat [component]" on the
> screen.  So we could implement/request qemu to have such mechanism.
> 
> I can understand the merit you mentioned here, in some degree. But I
> think it is hard to say "unlucky" to customer in business...

The contents of poisoned pages are not relevant after a reboot, so
Qemu can replace the poisoned pages with good pages when rebooting the
guest. Do you think that is good?

Best Regards,
Huang Ying



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-08  3:15             ` Huang Ying
  0 siblings, 0 replies; 93+ messages in thread
From: Huang Ying @ 2010-10-08  3:15 UTC (permalink / raw)
  To: Hidetoshi Seto; +Cc: Dean Nelson, Marcelo Tosatti, qemu-devel, kvm

Hi, Seto,

On Thu, 2010-10-07 at 11:41 +0800, Hidetoshi Seto wrote:
> (2010/10/07 3:10), Dean Nelson wrote:
> > On 10/06/2010 11:05 AM, Marcelo Tosatti wrote:
> >> On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
> >>> I got some more question:
> >>>
> >>> (2010/10/05 3:54), Marcelo Tosatti wrote:
> >>>> Index: qemu/target-i386/cpu.h
> >>>> ===================================================================
> >>>> --- qemu.orig/target-i386/cpu.h
> >>>> +++ qemu/target-i386/cpu.h
> >>>> @@ -250,16 +250,32 @@
> >>>>   #define PG_ERROR_RSVD_MASK 0x08
> >>>>   #define PG_ERROR_I_D_MASK  0x10
> >>>>
> >>>> -#define MCG_CTL_P    (1UL<<8)   /* MCG_CAP register available */
> >>>> +#define MCG_CTL_P    (1ULL<<8)   /* MCG_CAP register available */
> >>>> +#define MCG_SER_P    (1ULL<<24) /* MCA recovery/new status bits */
> >>>>
> >>>> -#define MCE_CAP_DEF    MCG_CTL_P
> >>>> +#define MCE_CAP_DEF    (MCG_CTL_P|MCG_SER_P)
> >>>>   #define MCE_BANKS_DEF    10
> >>>>
> >>>
> >>> It seems that current kvm doesn't support SER_P, so injecting SRAO
> >>> to guest will mean that guest receives VAL|UC|!PCC and RIPV event
> >>> from virtual processor that doesn't have SER_P.
> >>
> >> Dean also noted this. I don't think it was deliberate choice to not
> >> expose SER_P. Huang?
> > 
> > In my testing, I found that MCG_SER_P was not being set (and I was
> > running on a Nehalem-EX system). Injecting a MCE resulted in the
> > guest entering into panic() from mce_panic(). If crash_kexec()
> > finds a kexec_crash_image the system ends up rebooting, otherwise,
> > what happens next requires operator intervention.
> 
> Good to know.
> What I'm concerning is that if memory scrubbing SRAO event is
> injected when !SER_P, linux guest with certain mce tolerant level
> might grade it as "UC" severity and continue running with none of
> panicking, killing and poisoning because of !PCC and RIPV.
> 
> Could you provide the panic message of the guest in your test?
> I think it can tell me why the mce handler decided to go panic.

It is a bug that SER_P is not in KVM_MCE_CAP_SUPPORTED in the kernel.
I will fix it as soon as possible. Also, an SRAO MCE should not be sent
when !SER_P; we should add that condition in qemu-kvm.

> > When I applied a patch to the guest's kernel which forces mce_ser to be
> > set, as if MCG_SER_P was set (see __mcheck_cpu_cap_init()), I found
> > that when the memory page was 'owned' by a guest process, the process
> > would be killed (if the page was dirty), and the guest would stay
> > running. The HWPoisoned page would be sidelined and not cause any more
> > issues.
> 
> Excellent.
> So while guest kernel knows which page is poisoned, guest processes
> are controlled not to touch the page.
> 
> ... Therefore rebooting the vm and renewing kernel will lost the
> information where is poisoned.

Yes. That is an issue. Dean suggests making qemu-kvm refuse to reboot
the guest if there is a poisoned page and ask for user intervention. I
have another idea: replace the poisoned pages with good pages on
reboot, that is, recover without user intervention.

> >>> I think most OSes don't expect that it can receives MCE with !PCC
> >>> on traditional x86 processor without SER_P.
> >>>
> >>> Q1: Is it safe to expect that guests can handle such !PCC event?
> > 
> > This might be best answered by Huang, but as I mentioned above, without
> > MCG_SER_P being set, the result was an orderly system panic on the
> > guest.
> 
> Though I'll wait Huang (I think he is on holiday), I believe that
> system panic is just a possible option for AO (Action Optional)
> event, no matter how the SER_P is.

We should fix this as I said above.

> >>> Q2: What is the expected behavior on the guest?
> > 
> > I think I answered this above.
> 
> Yeah, thanks.
> 
> > 
> >>> Q3: What happen if guest reboots itself in response to the MCE?
> > 
> > That depends...
> > 
> > And the following issue also holds for a guest that is rebooted at
> > some point having successfully sidelined the bad page.
> > 
> > After the guest has panic'd, a system_reset of the guest or a restart
> > initiated by crash_kexec() (called by panic() on the guest), usually
> > results in the guest hanging because the bad page still belongs
> > to qemu-kvm and is now being referenced by the new guest in some way.
> 
> Yes. In other words my concern about reboot is that new guest kernel
> including kdump kernel might try to read the bad page.  If there is
> no AR-SIGBUS etc., we need some tricks to inhibit such accesses.
> 
> > (It actually may not hang, but successfully reboot and be runnable,
> > with the bad page lurking in the background. It all seems to depend on
> > where the bad page ends up, and whether it's ever referenced.)
> 
> I know some tough guys using their PC with buggy DIMMs :-)
> 
> > 
> > I believe there was an attempt to deal with this in kvm on the host.
> > See kvm_handle_bad_page(). This function was suppose to result in the
> > sending of a BUS_MCEERR_AR flavored SIGBUS by do_sigbus() to qemu-kvm
> > which in theory would result in the right thing happening. But commit
> > 96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the signal from being
> > sent. So this mechanism needs to be re-worked, and the issue remains.
> 
> Definitely.
> I guess Huang has some plan or hint for rework this point.

Yes. This should be fixed. The SRAR SIGBUS should be sent directly
instead of being sent via touching poisoned virtual address.
 
> > I would think that if the the bad page can't be sidelined, such that
> > the newly booting guest can't use it, then the new guest shouldn't be
> > allowed to boot. But perhaps there is some merit in letting it try to
> > boot and see if one gets 'lucky'.
> 
> In case of booting a real machine in real world, hardware and firmware
> usually (or often) do self-test before passing control to OS.
> Some platform can boot OS with degraded configuration (for example,
> fewer memory) if it has trouble on its component.  Some BIOS may
> stop booting and show messages like "please reseat [component]" on the
> screen.  So we could implement/request qemu to have such mechanism.
> 
> I can understand the merit you mentioned here, in some degree. But I
> think it is hard to say "unlucky" to customer in business...

The contents of poisoned pages are not relevant after a reboot, so
Qemu can replace the poisoned pages with good pages when rebooting the
guest. Do you think that is good?

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-08  3:15             ` [Qemu-devel] " Huang Ying
@ 2010-10-08  5:54               ` Hidetoshi Seto
  -1 siblings, 0 replies; 93+ messages in thread
From: Hidetoshi Seto @ 2010-10-08  5:54 UTC (permalink / raw)
  To: Huang Ying; +Cc: Dean Nelson, Marcelo Tosatti, kvm, qemu-devel

Hi, Huang-san,

(2010/10/08 12:15), Huang Ying wrote:
> Hi, Seto,
> 
> On Thu, 2010-10-07 at 11:41 +0800, Hidetoshi Seto wrote:
>> (2010/10/07 3:10), Dean Nelson wrote:
>>> On 10/06/2010 11:05 AM, Marcelo Tosatti wrote:
>>>> On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
>>>>> I got some more question:
>>>>>
>>>>> (2010/10/05 3:54), Marcelo Tosatti wrote:
>>>>>> Index: qemu/target-i386/cpu.h
>>>>>> ===================================================================
>>>>>> --- qemu.orig/target-i386/cpu.h
>>>>>> +++ qemu/target-i386/cpu.h
>>>>>> @@ -250,16 +250,32 @@
>>>>>>   #define PG_ERROR_RSVD_MASK 0x08
>>>>>>   #define PG_ERROR_I_D_MASK  0x10
>>>>>>
>>>>>> -#define MCG_CTL_P    (1UL<<8)   /* MCG_CAP register available */
>>>>>> +#define MCG_CTL_P    (1ULL<<8)   /* MCG_CAP register available */
>>>>>> +#define MCG_SER_P    (1ULL<<24) /* MCA recovery/new status bits */
>>>>>>
>>>>>> -#define MCE_CAP_DEF    MCG_CTL_P
>>>>>> +#define MCE_CAP_DEF    (MCG_CTL_P|MCG_SER_P)
>>>>>>   #define MCE_BANKS_DEF    10
>>>>>>
>>>>>
>>>>> It seems that current kvm doesn't support SER_P, so injecting SRAO
>>>>> to guest will mean that guest receives VAL|UC|!PCC and RIPV event
>>>>> from virtual processor that doesn't have SER_P.
>>>>
>>>> Dean also noted this. I don't think it was deliberate choice to not
>>>> expose SER_P. Huang?
>>>
>>> In my testing, I found that MCG_SER_P was not being set (and I was
>>> running on a Nehalem-EX system). Injecting a MCE resulted in the
>>> guest entering into panic() from mce_panic(). If crash_kexec()
>>> finds a kexec_crash_image the system ends up rebooting, otherwise,
>>> what happens next requires operator intervention.
>>
>> Good to know.
>> What I'm concerning is that if memory scrubbing SRAO event is
>> injected when !SER_P, linux guest with certain mce tolerant level
>> might grade it as "UC" severity and continue running with none of
>> panicking, killing and poisoning because of !PCC and RIPV.
>>
>> Could you provide the panic message of the guest in your test?
>> I think it can tell me why the mce handler decided to go panic.
> 
> That is a bug that the SER_P is not in KVM_MCE_CAP_SUPPORTED in kernel.
> I will fix it as soon as possible. And SRAO MCE should not be sent
> when !SER_P, we should add that condition in qemu-kvm.

That makes sense.
I think it is qemu's responsibility to decide what follows the AO-SIGBUS;
what action should be taken depends on KVM's capability.

>>> When I applied a patch to the guest's kernel which forces mce_ser to be
>>> set, as if MCG_SER_P was set (see __mcheck_cpu_cap_init()), I found
>>> that when the memory page was 'owned' by a guest process, the process
>>> would be killed (if the page was dirty), and the guest would stay
>>> running. The HWPoisoned page would be sidelined and not cause any more
>>> issues.
>>
>> Excellent.
>> So while guest kernel knows which page is poisoned, guest processes
>> are controlled not to touch the page.
>>
>> ... Therefore rebooting the vm and renewing kernel will lost the
>> information where is poisoned.
> 
> Yes. That is an issue. Dean suggests that make qemu-kvm to refuse reboot
> the guest if there is poisoned page and ask for user to intervention. I
> have another idea to replace the poison pages with good pages when
> reboot, that is, recover without user intervention.

Sounds good.

I think it may be worthwhile to reserve pages for the replacement
before a reboot is requested; at least we really don't want to fail
rebooting with 'no memory'.

>>>>> I think most OSes don't expect that it can receives MCE with !PCC
>>>>> on traditional x86 processor without SER_P.
>>>>>
>>>>> Q1: Is it safe to expect that guests can handle such !PCC event?
>>>
>>> This might be best answered by Huang, but as I mentioned above, without
>>> MCG_SER_P being set, the result was an orderly system panic on the
>>> guest.
>>
>> Though I'll wait Huang (I think he is on holiday), I believe that
>> system panic is just a possible option for AO (Action Optional)
>> event, no matter how the SER_P is.
> 
> We should fix this as I said above.
> 
>>>>> Q2: What is the expected behavior on the guest?
>>>
>>> I think I answered this above.
>>
>> Yeah, thanks.
>>
>>>
>>>>> Q3: What happen if guest reboots itself in response to the MCE?
>>>
>>> That depends...
>>>
>>> And the following issue also holds for a guest that is rebooted at
>>> some point having successfully sidelined the bad page.
>>>
>>> After the guest has panic'd, a system_reset of the guest or a restart
>>> initiated by crash_kexec() (called by panic() on the guest), usually
>>> results in the guest hanging because the bad page still belongs
>>> to qemu-kvm and is now being referenced by the new guest in some way.
>>
>> Yes. In other words my concern about reboot is that new guest kernel
>> including kdump kernel might try to read the bad page.  If there is
>> no AR-SIGBUS etc., we need some tricks to inhibit such accesses.
>>
>>> (It actually may not hang, but successfully reboot and be runnable,
>>> with the bad page lurking in the background. It all seems to depend on
>>> where the bad page ends up, and whether it's ever referenced.)
>>
>> I know some tough guys using their PC with buggy DIMMs :-)
>>
>>>
>>> I believe there was an attempt to deal with this in kvm on the host.
>>> See kvm_handle_bad_page(). This function was suppose to result in the
>>> sending of a BUS_MCEERR_AR flavored SIGBUS by do_sigbus() to qemu-kvm
>>> which in theory would result in the right thing happening. But commit
>>> 96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the signal from being
>>> sent. So this mechanism needs to be re-worked, and the issue remains.
>>
>> Definitely.
>> I guess Huang has some plan or hint for rework this point.
> 
> Yes. This should be fixed. The SRAR SIGBUS should be sent directly
> instead of being sent via touching poisoned virtual address.

Good. It should work.

>>> I would think that if the the bad page can't be sidelined, such that
>>> the newly booting guest can't use it, then the new guest shouldn't be
>>> allowed to boot. But perhaps there is some merit in letting it try to
>>> boot and see if one gets 'lucky'.
>>
>> In case of booting a real machine in real world, hardware and firmware
>> usually (or often) do self-test before passing control to OS.
>> Some platform can boot OS with degraded configuration (for example,
>> fewer memory) if it has trouble on its component.  Some BIOS may
>> stop booting and show messages like "please reseat [component]" on the
>> screen.  So we could implement/request qemu to have such mechanism.
>>
>> I can understand the merit you mentioned here, in some degree. But I
>> think it is hard to say "unlucky" to customer in business...
> 
> Because the contents of poisoned pages are not relevant after reboot.
> Qemu can replace the poisoned pages with good pages when reboot guest.
> Do you think that is good.

Sure.

Of course this trick will not be needed if the user has migrated or
saved/restored the guest before a reboot.

Thank you for answering!


Thanks,
H.Seto 


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-08  5:54               ` Hidetoshi Seto
  0 siblings, 0 replies; 93+ messages in thread
From: Hidetoshi Seto @ 2010-10-08  5:54 UTC (permalink / raw)
  To: Huang Ying; +Cc: Dean Nelson, Marcelo Tosatti, qemu-devel, kvm

Hi, Huang-san,

(2010/10/08 12:15), Huang Ying wrote:
> Hi, Seto,
> 
> On Thu, 2010-10-07 at 11:41 +0800, Hidetoshi Seto wrote:
>> (2010/10/07 3:10), Dean Nelson wrote:
>>> On 10/06/2010 11:05 AM, Marcelo Tosatti wrote:
>>>> On Wed, Oct 06, 2010 at 10:58:36AM +0900, Hidetoshi Seto wrote:
>>>>> I got some more question:
>>>>>
>>>>> (2010/10/05 3:54), Marcelo Tosatti wrote:
>>>>>> Index: qemu/target-i386/cpu.h
>>>>>> ===================================================================
>>>>>> --- qemu.orig/target-i386/cpu.h
>>>>>> +++ qemu/target-i386/cpu.h
>>>>>> @@ -250,16 +250,32 @@
>>>>>>   #define PG_ERROR_RSVD_MASK 0x08
>>>>>>   #define PG_ERROR_I_D_MASK  0x10
>>>>>>
>>>>>> -#define MCG_CTL_P    (1UL<<8)   /* MCG_CAP register available */
>>>>>> +#define MCG_CTL_P    (1ULL<<8)   /* MCG_CAP register available */
>>>>>> +#define MCG_SER_P    (1ULL<<24) /* MCA recovery/new status bits */
>>>>>>
>>>>>> -#define MCE_CAP_DEF    MCG_CTL_P
>>>>>> +#define MCE_CAP_DEF    (MCG_CTL_P|MCG_SER_P)
>>>>>>   #define MCE_BANKS_DEF    10
>>>>>>
>>>>>
>>>>> It seems that current kvm doesn't support SER_P, so injecting SRAO
>>>>> to guest will mean that guest receives VAL|UC|!PCC and RIPV event
>>>>> from virtual processor that doesn't have SER_P.
>>>>
>>>> Dean also noted this. I don't think it was deliberate choice to not
>>>> expose SER_P. Huang?
>>>
>>> In my testing, I found that MCG_SER_P was not being set (and I was
>>> running on a Nehalem-EX system). Injecting a MCE resulted in the
>>> guest entering into panic() from mce_panic(). If crash_kexec()
>>> finds a kexec_crash_image the system ends up rebooting, otherwise,
>>> what happens next requires operator intervention.
>>
>> Good to know.
>> What I'm concerning is that if memory scrubbing SRAO event is
>> injected when !SER_P, linux guest with certain mce tolerant level
>> might grade it as "UC" severity and continue running with none of
>> panicking, killing and poisoning because of !PCC and RIPV.
>>
>> Could you provide the panic message of the guest in your test?
>> I think it can tell me why the mce handler decided to go panic.
> 
> That is a bug that the SER_P is not in KVM_MCE_CAP_SUPPORTED in kernel.
> I will fix it as soon as possible. And SRAO MCE should not be sent
> when !SER_P, we should add that condition in qemu-kvm.

That makes sense.
I think what follows the AO-SIGBUS is qemu's responsibility; what
action should be taken depends on KVM's capability.

>>> When I applied a patch to the guest's kernel which forces mce_ser to be
>>> set, as if MCG_SER_P was set (see __mcheck_cpu_cap_init()), I found
>>> that when the memory page was 'owned' by a guest process, the process
>>> would be killed (if the page was dirty), and the guest would stay
>>> running. The HWPoisoned page would be sidelined and not cause any more
>>> issues.
>>
>> Excellent.
>> So while guest kernel knows which page is poisoned, guest processes
>> are controlled not to touch the page.
>>
>> ... Therefore rebooting the vm and renewing the kernel will lose the
>> information about which page is poisoned.
> 
> Yes. That is an issue. Dean suggests that make qemu-kvm to refuse reboot
> the guest if there is poisoned page and ask for user to intervention. I
> have another idea to replace the poison pages with good pages when
> reboot, that is, recover without user intervention.

Sounds good.

I think it may be worth something to reserve pages for the replacement
before reboot is requested; at least we really don't want to fail
rebooting with 'no memory'.

>>>>> I think most OSes don't expect that it can receives MCE with !PCC
>>>>> on traditional x86 processor without SER_P.
>>>>>
>>>>> Q1: Is it safe to expect that guests can handle such !PCC event?
>>>
>>> This might be best answered by Huang, but as I mentioned above, without
>>> MCG_SER_P being set, the result was an orderly system panic on the
>>> guest.
>>
>> Though I'll wait Huang (I think he is on holiday), I believe that
>> system panic is just a possible option for AO (Action Optional)
>> event, no matter how the SER_P is.
> 
> We should fix this as I said above.
> 
>>>>> Q2: What is the expected behavior on the guest?
>>>
>>> I think I answered this above.
>>
>> Yeah, thanks.
>>
>>>
>>>>> Q3: What happen if guest reboots itself in response to the MCE?
>>>
>>> That depends...
>>>
>>> And the following issue also holds for a guest that is rebooted at
>>> some point having successfully sidelined the bad page.
>>>
>>> After the guest has panic'd, a system_reset of the guest or a restart
>>> initiated by crash_kexec() (called by panic() on the guest), usually
>>> results in the guest hanging because the bad page still belongs
>>> to qemu-kvm and is now being referenced by the new guest in some way.
>>
>> Yes. In other words my concern about reboot is that new guest kernel
>> including kdump kernel might try to read the bad page.  If there is
>> no AR-SIGBUS etc., we need some tricks to inhibit such accesses.
>>
>>> (It actually may not hang, but successfully reboot and be runnable,
>>> with the bad page lurking in the background. It all seems to depend on
>>> where the bad page ends up, and whether it's ever referenced.)
>>
>> I know some tough guys using their PC with buggy DIMMs :-)
>>
>>>
>>> I believe there was an attempt to deal with this in kvm on the host.
>>> See kvm_handle_bad_page(). This function was supposed to result in the
>>> sending of a BUS_MCEERR_AR flavored SIGBUS by do_sigbus() to qemu-kvm
>>> which in theory would result in the right thing happening. But commit
>>> 96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the signal from being
>>> sent. So this mechanism needs to be re-worked, and the issue remains.
>>
>> Definitely.
>> I guess Huang has some plan or hint for rework this point.
> 
> Yes. This should be fixed. The SRAR SIGBUS should be sent directly
> instead of being sent via touching poisoned virtual address.

Good. It should work.

>>> I would think that if the bad page can't be sidelined, such that
>>> the newly booting guest can't use it, then the new guest shouldn't be
>>> allowed to boot. But perhaps there is some merit in letting it try to
>>> boot and see if one gets 'lucky'.
>>
>> In case of booting a real machine in real world, hardware and firmware
>> usually (or often) do self-test before passing control to OS.
>> Some platform can boot OS with degraded configuration (for example,
>> fewer memory) if it has trouble on its component.  Some BIOS may
>> stop booting and show messages like "please reseat [component]" on the
>> screen.  So we could implement/request qemu to have such mechanism.
>>
>> I can understand the merit you mentioned here, in some degree. But I
>> think it is hard to say "unlucky" to customer in business...
> 
> Because the contents of poisoned pages are not relevant after reboot,
> Qemu can replace the poisoned pages with good pages when rebooting the
> guest. Do you think that is good?

Sure.

Of course this trick will not be needed if the user has migrated or
saved/restored the guest before a reboot.

Thank you for answering!


Thanks,
H.Seto 

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
  2010-10-08  3:15             ` [Qemu-devel] " Huang Ying
@ 2010-10-08 12:02               ` Dean Nelson
  -1 siblings, 0 replies; 93+ messages in thread
From: Dean Nelson @ 2010-10-08 12:02 UTC (permalink / raw)
  To: Huang Ying; +Cc: Hidetoshi Seto, Marcelo Tosatti, kvm, qemu-devel

On 10/07/2010 10:15 PM, Huang Ying wrote:
> Hi, Seto,
>
> On Thu, 2010-10-07 at 11:41 +0800, Hidetoshi Seto wrote:
>> (2010/10/07 3:10), Dean Nelson wrote:
<snip>
>>> When I applied a patch to the guest's kernel which forces mce_ser to be
>>> set, as if MCG_SER_P was set (see __mcheck_cpu_cap_init()), I found
>>> that when the memory page was 'owned' by a guest process, the process
>>> would be killed (if the page was dirty), and the guest would stay
>>> running. The HWPoisoned page would be sidelined and not cause any more
>>> issues.
>>
>> Excellent.
>> So while guest kernel knows which page is poisoned, guest processes
>> are controlled not to touch the page.
>>
>> ... Therefore rebooting the vm and renewing the kernel will lose the
>> information about which page is poisoned.
>
> Yes. That is an issue. Dean suggests that make qemu-kvm to refuse reboot
> the guest if there is poisoned page and ask for user to intervention. I
> have another idea to replace the poison pages with good pages when
> reboot, that is, recover without user intervention.

Hi, Huang, I much prefer the replacing of the poisoned pages with good
pages on reboot, over the refusing to reboot. So definitely go with
your idea.

Thanks,
Dean

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch uq/master 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-08 12:02               ` Dean Nelson
  0 siblings, 0 replies; 93+ messages in thread
From: Dean Nelson @ 2010-10-08 12:02 UTC (permalink / raw)
  To: Huang Ying; +Cc: Hidetoshi Seto, Marcelo Tosatti, qemu-devel, kvm

On 10/07/2010 10:15 PM, Huang Ying wrote:
> Hi, Seto,
>
> On Thu, 2010-10-07 at 11:41 +0800, Hidetoshi Seto wrote:
>> (2010/10/07 3:10), Dean Nelson wrote:
<snip>
>>> When I applied a patch to the guest's kernel which forces mce_ser to be
>>> set, as if MCG_SER_P was set (see __mcheck_cpu_cap_init()), I found
>>> that when the memory page was 'owned' by a guest process, the process
>>> would be killed (if the page was dirty), and the guest would stay
>>> running. The HWPoisoned page would be sidelined and not cause any more
>>> issues.
>>
>> Excellent.
>> So while guest kernel knows which page is poisoned, guest processes
>> are controlled not to touch the page.
>>
>> ... Therefore rebooting the vm and renewing the kernel will lose the
>> information about which page is poisoned.
>
> Yes. That is an issue. Dean suggests that make qemu-kvm to refuse reboot
> the guest if there is poisoned page and ask for user to intervention. I
> have another idea to replace the poison pages with good pages when
> reboot, that is, recover without user intervention.

Hi, Huang, I much prefer the replacing of the poisoned pages with good
pages on reboot, over the refusing to reboot. So definitely go with
your idea.

Thanks,
Dean

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch 0/8] port qemu-kvm's MCE support (v3)
  2010-10-06 17:34   ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-11 18:31     ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson

Port qemu-kvm's KVM MCE (Machine Check Exception) handling to qemu. It
allows qemu to propagate MCEs to the guest.

v2:
- rename do_qemu_ram_addr_from_host.
- fix kvm_on_sigbus/kvm_on_sigbus_vcpu naming.
- fix bank register restoration (Dean Nelson).

v3:
- condition MCE generation on MCG_SER_P bit (Huang Ying).



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch 0/8] port qemu-kvm's MCE support (v3)
@ 2010-10-11 18:31     ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Huang Ying

Port qemu-kvm's KVM MCE (Machine Check Exception) handling to qemu. It
allows qemu to propagate MCEs to the guest.

v2:
- rename do_qemu_ram_addr_from_host.
- fix kvm_on_sigbus/kvm_on_sigbus_vcpu naming.
- fix bank register restoration (Dean Nelson).

v3:
- condition MCE generation on MCG_SER_P bit (Huang Ying).

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch 1/8] signalfd compatibility
  2010-10-11 18:31     ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-11 18:31       ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: signalfd --]
[-- Type: text/plain, Size: 5970 bytes --]

Port qemu-kvm's signalfd compat code.

commit 5a7fdd0abd7cd24dac205317a4195446ab8748b5
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Wed May 7 11:55:47 2008 -0500

    Use signalfd() in io-thread
    
    This patch reworks the IO thread to use signalfd() instead of sigtimedwait()
    This will eliminate the need to use SIGIO everywhere.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/compatfd.c
===================================================================
--- /dev/null
+++ qemu/compatfd.c
@@ -0,0 +1,117 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "compatfd.h"
+
+#include <sys/syscall.h>
+#include <pthread.h>
+
+struct sigfd_compat_info
+{
+    sigset_t mask;
+    int fd;
+};
+
+static void *sigwait_compat(void *opaque)
+{
+    struct sigfd_compat_info *info = opaque;
+    int err;
+    sigset_t all;
+
+    sigfillset(&all);
+    sigprocmask(SIG_BLOCK, &all, NULL);
+
+    do {
+        siginfo_t siginfo;
+
+        err = sigwaitinfo(&info->mask, &siginfo);
+        if (err == -1 && errno == EINTR) {
+            err = 0;
+            continue;
+        }
+
+        if (err > 0) {
+            char buffer[128];
+            size_t offset = 0;
+
+            memcpy(buffer, &err, sizeof(err));
+            while (offset < sizeof(buffer)) {
+                ssize_t len;
+
+                len = write(info->fd, buffer + offset,
+                            sizeof(buffer) - offset);
+                if (len == -1 && errno == EINTR)
+                    continue;
+
+                if (len <= 0) {
+                    err = -1;
+                    break;
+                }
+
+                offset += len;
+            }
+        }
+    } while (err >= 0);
+
+    return NULL;
+}
+
+static int qemu_signalfd_compat(const sigset_t *mask)
+{
+    pthread_attr_t attr;
+    pthread_t tid;
+    struct sigfd_compat_info *info;
+    int fds[2];
+
+    info = malloc(sizeof(*info));
+    if (info == NULL) {
+        errno = ENOMEM;
+        return -1;
+    }
+
+    if (pipe(fds) == -1) {
+        free(info);
+        return -1;
+    }
+
+    qemu_set_cloexec(fds[0]);
+    qemu_set_cloexec(fds[1]);
+
+    memcpy(&info->mask, mask, sizeof(*mask));
+    info->fd = fds[1];
+
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+    pthread_create(&tid, &attr, sigwait_compat, info);
+
+    pthread_attr_destroy(&attr);
+
+    return fds[0];
+}
+
+int qemu_signalfd(const sigset_t *mask)
+{
+#if defined(CONFIG_SIGNALFD)
+    int ret;
+
+    ret = syscall(SYS_signalfd, -1, mask, _NSIG / 8);
+    if (ret != -1) {
+        qemu_set_cloexec(ret);
+        return ret;
+    }
+#endif
+
+    return qemu_signalfd_compat(mask);
+}
Index: qemu/compatfd.h
===================================================================
--- /dev/null
+++ qemu/compatfd.h
@@ -0,0 +1,43 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_COMPATFD_H
+#define QEMU_COMPATFD_H
+
+#include <signal.h>
+
+struct qemu_signalfd_siginfo {
+    uint32_t ssi_signo;   /* Signal number */
+    int32_t  ssi_errno;   /* Error number (unused) */
+    int32_t  ssi_code;    /* Signal code */
+    uint32_t ssi_pid;     /* PID of sender */
+    uint32_t ssi_uid;     /* Real UID of sender */
+    int32_t  ssi_fd;      /* File descriptor (SIGIO) */
+    uint32_t ssi_tid;     /* Kernel timer ID (POSIX timers) */
+    uint32_t ssi_band;    /* Band event (SIGIO) */
+    uint32_t ssi_overrun; /* POSIX timer overrun count */
+    uint32_t ssi_trapno;  /* Trap number that caused signal */
+    int32_t  ssi_status;  /* Exit status or signal (SIGCHLD) */
+    int32_t  ssi_int;     /* Integer sent by sigqueue(2) */
+    uint64_t ssi_ptr;     /* Pointer sent by sigqueue(2) */
+    uint64_t ssi_utime;   /* User CPU time consumed (SIGCHLD) */
+    uint64_t ssi_stime;   /* System CPU time consumed (SIGCHLD) */
+    uint64_t ssi_addr;    /* Address that generated signal
+                             (for hardware-generated signals) */
+    uint8_t  pad[48];     /* Pad size to 128 bytes (allow for
+                             additional fields in the future) */
+};
+
+int qemu_signalfd(const sigset_t *mask);
+
+#endif
Index: qemu/Makefile.objs
===================================================================
--- qemu.orig/Makefile.objs
+++ qemu/Makefile.objs
@@ -121,6 +121,7 @@ common-obj-y += $(addprefix ui/, $(ui-ob
 
 common-obj-y += iov.o acl.o
 common-obj-$(CONFIG_THREAD) += qemu-thread.o
+common-obj-$(CONFIG_IOTHREAD) += compatfd.o
 common-obj-y += notify.o event_notifier.o
 common-obj-y += qemu-timer.o
 
Index: qemu/configure
===================================================================
--- qemu.orig/configure
+++ qemu/configure
@@ -1936,6 +1936,21 @@ if compile_prog "" "" ; then
   splice=yes
 fi
 
+##########################################
+# signalfd probe
+signalfd="no"
+cat > $TMPC << EOF
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <signal.h>
+int main(void) { return syscall(SYS_signalfd, -1, NULL, _NSIG / 8); }
+EOF
+
+if compile_prog "" "" ; then
+  signalfd=yes
+fi
+
 # check if eventfd is supported
 eventfd=no
 cat > $TMPC << EOF
@@ -2509,6 +2524,9 @@ fi
 if test "$fdt" = "yes" ; then
   echo "CONFIG_FDT=y" >> $config_host_mak
 fi
+if test "$signalfd" = "yes" ; then
+  echo "CONFIG_SIGNALFD=y" >> $config_host_mak
+fi
 if test "$need_offsetof" = "yes" ; then
   echo "CONFIG_NEED_OFFSETOF=y" >> $config_host_mak
 fi



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch 1/8] signalfd compatibility
@ 2010-10-11 18:31       ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: signalfd --]
[-- Type: text/plain, Size: 5968 bytes --]

Port qemu-kvm's signalfd compat code.

commit 5a7fdd0abd7cd24dac205317a4195446ab8748b5
Author: Anthony Liguori <aliguori@us.ibm.com>
Date:   Wed May 7 11:55:47 2008 -0500

    Use signalfd() in io-thread
    
    This patch reworks the IO thread to use signalfd() instead of sigtimedwait()
    This will eliminate the need to use SIGIO everywhere.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/compatfd.c
===================================================================
--- /dev/null
+++ qemu/compatfd.c
@@ -0,0 +1,117 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "compatfd.h"
+
+#include <sys/syscall.h>
+#include <pthread.h>
+
+struct sigfd_compat_info
+{
+    sigset_t mask;
+    int fd;
+};
+
+static void *sigwait_compat(void *opaque)
+{
+    struct sigfd_compat_info *info = opaque;
+    int err;
+    sigset_t all;
+
+    sigfillset(&all);
+    sigprocmask(SIG_BLOCK, &all, NULL);
+
+    do {
+        siginfo_t siginfo;
+
+        err = sigwaitinfo(&info->mask, &siginfo);
+        if (err == -1 && errno == EINTR) {
+            err = 0;
+            continue;
+        }
+
+        if (err > 0) {
+            char buffer[128];
+            size_t offset = 0;
+
+            memcpy(buffer, &err, sizeof(err));
+            while (offset < sizeof(buffer)) {
+                ssize_t len;
+
+                len = write(info->fd, buffer + offset,
+                            sizeof(buffer) - offset);
+                if (len == -1 && errno == EINTR)
+                    continue;
+
+                if (len <= 0) {
+                    err = -1;
+                    break;
+                }
+
+                offset += len;
+            }
+        }
+    } while (err >= 0);
+
+    return NULL;
+}
+
+static int qemu_signalfd_compat(const sigset_t *mask)
+{
+    pthread_attr_t attr;
+    pthread_t tid;
+    struct sigfd_compat_info *info;
+    int fds[2];
+
+    info = malloc(sizeof(*info));
+    if (info == NULL) {
+        errno = ENOMEM;
+        return -1;
+    }
+
+    if (pipe(fds) == -1) {
+        free(info);
+        return -1;
+    }
+
+    qemu_set_cloexec(fds[0]);
+    qemu_set_cloexec(fds[1]);
+
+    memcpy(&info->mask, mask, sizeof(*mask));
+    info->fd = fds[1];
+
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+    pthread_create(&tid, &attr, sigwait_compat, info);
+
+    pthread_attr_destroy(&attr);
+
+    return fds[0];
+}
+
+int qemu_signalfd(const sigset_t *mask)
+{
+#if defined(CONFIG_SIGNALFD)
+    int ret;
+
+    ret = syscall(SYS_signalfd, -1, mask, _NSIG / 8);
+    if (ret != -1) {
+        qemu_set_cloexec(ret);
+        return ret;
+    }
+#endif
+
+    return qemu_signalfd_compat(mask);
+}
Index: qemu/compatfd.h
===================================================================
--- /dev/null
+++ qemu/compatfd.h
@@ -0,0 +1,43 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_COMPATFD_H
+#define QEMU_COMPATFD_H
+
+#include <signal.h>
+
+struct qemu_signalfd_siginfo {
+    uint32_t ssi_signo;   /* Signal number */
+    int32_t  ssi_errno;   /* Error number (unused) */
+    int32_t  ssi_code;    /* Signal code */
+    uint32_t ssi_pid;     /* PID of sender */
+    uint32_t ssi_uid;     /* Real UID of sender */
+    int32_t  ssi_fd;      /* File descriptor (SIGIO) */
+    uint32_t ssi_tid;     /* Kernel timer ID (POSIX timers) */
+    uint32_t ssi_band;    /* Band event (SIGIO) */
+    uint32_t ssi_overrun; /* POSIX timer overrun count */
+    uint32_t ssi_trapno;  /* Trap number that caused signal */
+    int32_t  ssi_status;  /* Exit status or signal (SIGCHLD) */
+    int32_t  ssi_int;     /* Integer sent by sigqueue(2) */
+    uint64_t ssi_ptr;     /* Pointer sent by sigqueue(2) */
+    uint64_t ssi_utime;   /* User CPU time consumed (SIGCHLD) */
+    uint64_t ssi_stime;   /* System CPU time consumed (SIGCHLD) */
+    uint64_t ssi_addr;    /* Address that generated signal
+                             (for hardware-generated signals) */
+    uint8_t  pad[48];     /* Pad size to 128 bytes (allow for
+                             additional fields in the future) */
+};
+
+int qemu_signalfd(const sigset_t *mask);
+
+#endif
Index: qemu/Makefile.objs
===================================================================
--- qemu.orig/Makefile.objs
+++ qemu/Makefile.objs
@@ -121,6 +121,7 @@ common-obj-y += $(addprefix ui/, $(ui-ob
 
 common-obj-y += iov.o acl.o
 common-obj-$(CONFIG_THREAD) += qemu-thread.o
+common-obj-$(CONFIG_IOTHREAD) += compatfd.o
 common-obj-y += notify.o event_notifier.o
 common-obj-y += qemu-timer.o
 
Index: qemu/configure
===================================================================
--- qemu.orig/configure
+++ qemu/configure
@@ -1936,6 +1936,21 @@ if compile_prog "" "" ; then
   splice=yes
 fi
 
+##########################################
+# signalfd probe
+signalfd="no"
+cat > $TMPC << EOF
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <signal.h>
+int main(void) { return syscall(SYS_signalfd, -1, NULL, _NSIG / 8); }
+EOF
+
+if compile_prog "" "" ; then
+  signalfd=yes
+fi
+
 # check if eventfd is supported
 eventfd=no
 cat > $TMPC << EOF
@@ -2509,6 +2524,9 @@ fi
 if test "$fdt" = "yes" ; then
   echo "CONFIG_FDT=y" >> $config_host_mak
 fi
+if test "$signalfd" = "yes" ; then
+  echo "CONFIG_SIGNALFD=y" >> $config_host_mak
+fi
 if test "$need_offsetof" = "yes" ; then
   echo "CONFIG_NEED_OFFSETOF=y" >> $config_host_mak
 fi

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch 2/8] iothread: use signalfd
  2010-10-11 18:31     ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-11 18:31       ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: block-io-signals-in-iothread --]
[-- Type: text/plain, Size: 3272 bytes --]

Block SIGALRM, SIGIO and consume them via signalfd.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -33,6 +33,7 @@
 #include "exec-all.h"
 
 #include "cpus.h"
+#include "compatfd.h"
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -329,14 +330,75 @@ static QemuCond qemu_work_cond;
 
 static void tcg_init_ipi(void);
 static void kvm_init_ipi(CPUState *env);
-static void unblock_io_signals(void);
+static sigset_t block_io_signals(void);
+
+/* If we have signalfd, we mask out the signals we want to handle and then
+ * use signalfd to listen for them.  We rely on whatever the current signal
+ * handler is to dispatch the signals when we receive them.
+ */
+static void sigfd_handler(void *opaque)
+{
+    int fd = (unsigned long) opaque;
+    struct qemu_signalfd_siginfo info;
+    struct sigaction action;
+    ssize_t len;
+
+    while (1) {
+        do {
+            len = read(fd, &info, sizeof(info));
+        } while (len == -1 && errno == EINTR);
+
+        if (len == -1 && errno == EAGAIN) {
+            break;
+        }
+
+        if (len != sizeof(info)) {
+            printf("read from sigfd returned %zd: %m\n", len);
+            return;
+        }
+
+        sigaction(info.ssi_signo, NULL, &action);
+        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
+            action.sa_sigaction(info.ssi_signo,
+                                (siginfo_t *)&info, NULL);
+        } else if (action.sa_handler) {
+            action.sa_handler(info.ssi_signo);
+        }
+    }
+}
+
+static int qemu_signalfd_init(sigset_t mask)
+{
+    int sigfd;
+
+    sigfd = qemu_signalfd(&mask);
+    if (sigfd == -1) {
+        fprintf(stderr, "failed to create signalfd\n");
+        return -errno;
+    }
+
+    fcntl_setfl(sigfd, O_NONBLOCK);
+
+    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
+                         (void *)(unsigned long) sigfd);
+
+    return 0;
+}
 
 int qemu_init_main_loop(void)
 {
     int ret;
+    sigset_t blocked_signals;
 
     cpu_set_debug_excp_handler(cpu_debug_handler);
 
+    blocked_signals = block_io_signals();
+
+    ret = qemu_signalfd_init(blocked_signals);
+    if (ret)
+        return ret;
+
+    /* Note eventfd must be drained before signalfd handlers run */
     ret = qemu_event_init();
     if (ret)
         return ret;
@@ -347,7 +409,6 @@ int qemu_init_main_loop(void)
     qemu_mutex_init(&qemu_global_mutex);
     qemu_mutex_lock(&qemu_global_mutex);
 
-    unblock_io_signals();
     qemu_thread_self(&io_thread);
 
     return 0;
@@ -586,19 +647,22 @@ static void kvm_init_ipi(CPUState *env)
     }
 }
 
-static void unblock_io_signals(void)
+static sigset_t block_io_signals(void)
 {
     sigset_t set;
 
+    /* SIGUSR2 used by posix-aio-compat.c */
     sigemptyset(&set);
     sigaddset(&set, SIGUSR2);
-    sigaddset(&set, SIGIO);
-    sigaddset(&set, SIGALRM);
     pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 
     sigemptyset(&set);
+    sigaddset(&set, SIGIO);
+    sigaddset(&set, SIGALRM);
     sigaddset(&set, SIG_IPI);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+    return set;
 }
 
 void qemu_mutex_lock_iothread(void)



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch 2/8] iothread: use signalfd
@ 2010-10-11 18:31       ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: block-io-signals-in-iothread --]
[-- Type: text/plain, Size: 3270 bytes --]

Block SIGALRM, SIGIO and consume them via signalfd.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -33,6 +33,7 @@
 #include "exec-all.h"
 
 #include "cpus.h"
+#include "compatfd.h"
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -329,14 +330,75 @@ static QemuCond qemu_work_cond;
 
 static void tcg_init_ipi(void);
 static void kvm_init_ipi(CPUState *env);
-static void unblock_io_signals(void);
+static sigset_t block_io_signals(void);
+
+/* If we have signalfd, we mask out the signals we want to handle and then
+ * use signalfd to listen for them.  We rely on whatever the current signal
+ * handler is to dispatch the signals when we receive them.
+ */
+static void sigfd_handler(void *opaque)
+{
+    int fd = (unsigned long) opaque;
+    struct qemu_signalfd_siginfo info;
+    struct sigaction action;
+    ssize_t len;
+
+    while (1) {
+        do {
+            len = read(fd, &info, sizeof(info));
+        } while (len == -1 && errno == EINTR);
+
+        if (len == -1 && errno == EAGAIN) {
+            break;
+        }
+
+        if (len != sizeof(info)) {
+            printf("read from sigfd returned %zd: %m\n", len);
+            return;
+        }
+
+        sigaction(info.ssi_signo, NULL, &action);
+        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
+            action.sa_sigaction(info.ssi_signo,
+                                (siginfo_t *)&info, NULL);
+        } else if (action.sa_handler) {
+            action.sa_handler(info.ssi_signo);
+        }
+    }
+}
+
+static int qemu_signalfd_init(sigset_t mask)
+{
+    int sigfd;
+
+    sigfd = qemu_signalfd(&mask);
+    if (sigfd == -1) {
+        fprintf(stderr, "failed to create signalfd\n");
+        return -errno;
+    }
+
+    fcntl_setfl(sigfd, O_NONBLOCK);
+
+    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
+                         (void *)(unsigned long) sigfd);
+
+    return 0;
+}
 
 int qemu_init_main_loop(void)
 {
     int ret;
+    sigset_t blocked_signals;
 
     cpu_set_debug_excp_handler(cpu_debug_handler);
 
+    blocked_signals = block_io_signals();
+
+    ret = qemu_signalfd_init(blocked_signals);
+    if (ret)
+        return ret;
+
+    /* Note eventfd must be drained before signalfd handlers run */
     ret = qemu_event_init();
     if (ret)
         return ret;
@@ -347,7 +409,6 @@ int qemu_init_main_loop(void)
     qemu_mutex_init(&qemu_global_mutex);
     qemu_mutex_lock(&qemu_global_mutex);
 
-    unblock_io_signals();
     qemu_thread_self(&io_thread);
 
     return 0;
@@ -586,19 +647,22 @@ static void kvm_init_ipi(CPUState *env)
     }
 }
 
-static void unblock_io_signals(void)
+static sigset_t block_io_signals(void)
 {
     sigset_t set;
 
+    /* SIGUSR2 used by posix-aio-compat.c */
     sigemptyset(&set);
     sigaddset(&set, SIGUSR2);
-    sigaddset(&set, SIGIO);
-    sigaddset(&set, SIGALRM);
     pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 
     sigemptyset(&set);
+    sigaddset(&set, SIGIO);
+    sigaddset(&set, SIGALRM);
     sigaddset(&set, SIG_IPI);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+    return set;
 }
 
 void qemu_mutex_lock_iothread(void)

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch 3/8] Expose thread_id in info cpus
  2010-10-11 18:31     ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-11 18:31       ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: thread-id --]
[-- Type: text/plain, Size: 3812 bytes --]

commit ce6325ff1af34dbaee91c8d28e792277e43f1227
Author: Glauber Costa <gcosta@redhat.com>
Date:   Wed Mar 5 17:01:10 2008 -0300

    Augment info cpus
    
    This patch exposes the thread id associated with each
    cpu through the already well known 'info cpus' interface.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpu-defs.h
===================================================================
--- qemu.orig/cpu-defs.h
+++ qemu/cpu-defs.h
@@ -197,6 +197,7 @@ typedef struct CPUWatchpoint {
     int nr_cores;  /* number of cores within this CPU package */        \
     int nr_threads;/* number of threads within this CPU */              \
     int running; /* Nonzero if cpu is currently running(usermode).  */  \
+    int thread_id;                                                      \
     /* user data */                                                     \
     void *opaque;                                                       \
                                                                         \
Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -539,6 +539,7 @@ static void *kvm_cpu_thread_fn(void *arg
 
     qemu_mutex_lock(&qemu_global_mutex);
     qemu_thread_self(env->thread);
+    env->thread_id = get_thread_id();
     if (kvm_enabled())
         kvm_init_vcpu(env);
 
@@ -578,6 +579,10 @@ static void *tcg_cpu_thread_fn(void *arg
     while (!qemu_system_ready)
         qemu_cond_timedwait(&qemu_system_cond, &qemu_global_mutex, 100);
 
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        env->thread_id = get_thread_id();
+    }
+
     while (1) {
         cpu_exec_all();
         qemu_tcg_wait_io_event();
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -637,6 +637,7 @@ void cpu_exec_init(CPUState *env)
     env->numa_node = 0;
     QTAILQ_INIT(&env->breakpoints);
     QTAILQ_INIT(&env->watchpoints);
+    env->thread_id = get_thread_id();
     *penv = env;
 #if defined(CONFIG_USER_ONLY)
     cpu_list_unlock();
Index: qemu/osdep.c
===================================================================
--- qemu.orig/osdep.c
+++ qemu/osdep.c
@@ -44,6 +44,10 @@
 extern int madvise(caddr_t, size_t, int);
 #endif
 
+#ifdef CONFIG_LINUX
+#include <sys/syscall.h>
+#endif
+
 #ifdef CONFIG_EVENTFD
 #include <sys/eventfd.h>
 #endif
@@ -200,6 +204,17 @@ int qemu_create_pidfile(const char *file
     return 0;
 }
 
+int get_thread_id(void)
+{
+#if defined (_WIN32)
+    return GetCurrentThreadId();
+#elif defined (__linux__)
+    return syscall(SYS_gettid);
+#else
+    return getpid();
+#endif
+}
+
 #ifdef _WIN32
 
 /* mingw32 needs ffs for compilations without optimization. */
Index: qemu/osdep.h
===================================================================
--- qemu.orig/osdep.h
+++ qemu/osdep.h
@@ -126,6 +126,7 @@ void qemu_vfree(void *ptr);
 int qemu_madvise(void *addr, size_t len, int advice);
 
 int qemu_create_pidfile(const char *filename);
+int get_thread_id(void);
 
 #ifdef _WIN32
 int ffs(int i);
Index: qemu/monitor.c
===================================================================
--- qemu.orig/monitor.c
+++ qemu/monitor.c
@@ -878,6 +878,9 @@ static void print_cpu_iter(QObject *obj,
         monitor_printf(mon, " (halted)");
     }
 
+    monitor_printf(mon, " thread_id=%" PRId64 " ",
+					qdict_get_int(cpu, "thread_id"));
+
     monitor_printf(mon, "\n");
 }
 
@@ -922,6 +925,7 @@ static void do_info_cpus(Monitor *mon, Q
 #elif defined(TARGET_MIPS)
         qdict_put(cpu, "PC", qint_from_int(env->active_tc.PC));
 #endif
+        qdict_put(cpu, "thread_id", qint_from_int(env->thread_id));
 
         qlist_append(cpu_list, cpu);
     }



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch 3/8] Expose thread_id in info cpus
@ 2010-10-11 18:31       ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: thread-id --]
[-- Type: text/plain, Size: 3810 bytes --]

commit ce6325ff1af34dbaee91c8d28e792277e43f1227
Author: Glauber Costa <gcosta@redhat.com>
Date:   Wed Mar 5 17:01:10 2008 -0300

    Augment info cpus
    
    This patch exposes the thread id associated with each
    cpu through the already well known 'info cpus' interface.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpu-defs.h
===================================================================
--- qemu.orig/cpu-defs.h
+++ qemu/cpu-defs.h
@@ -197,6 +197,7 @@ typedef struct CPUWatchpoint {
     int nr_cores;  /* number of cores within this CPU package */        \
     int nr_threads;/* number of threads within this CPU */              \
     int running; /* Nonzero if cpu is currently running(usermode).  */  \
+    int thread_id;                                                      \
     /* user data */                                                     \
     void *opaque;                                                       \
                                                                         \
Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -539,6 +539,7 @@ static void *kvm_cpu_thread_fn(void *arg
 
     qemu_mutex_lock(&qemu_global_mutex);
     qemu_thread_self(env->thread);
+    env->thread_id = get_thread_id();
     if (kvm_enabled())
         kvm_init_vcpu(env);
 
@@ -578,6 +579,10 @@ static void *tcg_cpu_thread_fn(void *arg
     while (!qemu_system_ready)
         qemu_cond_timedwait(&qemu_system_cond, &qemu_global_mutex, 100);
 
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        env->thread_id = get_thread_id();
+    }
+
     while (1) {
         cpu_exec_all();
         qemu_tcg_wait_io_event();
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -637,6 +637,7 @@ void cpu_exec_init(CPUState *env)
     env->numa_node = 0;
     QTAILQ_INIT(&env->breakpoints);
     QTAILQ_INIT(&env->watchpoints);
+    env->thread_id = get_thread_id();
     *penv = env;
 #if defined(CONFIG_USER_ONLY)
     cpu_list_unlock();
Index: qemu/osdep.c
===================================================================
--- qemu.orig/osdep.c
+++ qemu/osdep.c
@@ -44,6 +44,10 @@
 extern int madvise(caddr_t, size_t, int);
 #endif
 
+#ifdef CONFIG_LINUX
+#include <sys/syscall.h>
+#endif
+
 #ifdef CONFIG_EVENTFD
 #include <sys/eventfd.h>
 #endif
@@ -200,6 +204,17 @@ int qemu_create_pidfile(const char *file
     return 0;
 }
 
+int get_thread_id(void)
+{
+#if defined (_WIN32)
+    return GetCurrentThreadId();
+#elif defined (__linux__)
+    return syscall(SYS_gettid);
+#else
+    return getpid();
+#endif
+}
+
 #ifdef _WIN32
 
 /* mingw32 needs ffs for compilations without optimization. */
Index: qemu/osdep.h
===================================================================
--- qemu.orig/osdep.h
+++ qemu/osdep.h
@@ -126,6 +126,7 @@ void qemu_vfree(void *ptr);
 int qemu_madvise(void *addr, size_t len, int advice);
 
 int qemu_create_pidfile(const char *filename);
+int get_thread_id(void);
 
 #ifdef _WIN32
 int ffs(int i);
Index: qemu/monitor.c
===================================================================
--- qemu.orig/monitor.c
+++ qemu/monitor.c
@@ -878,6 +878,9 @@ static void print_cpu_iter(QObject *obj,
         monitor_printf(mon, " (halted)");
     }
 
+    monitor_printf(mon, " thread_id=%" PRId64 " ",
+					qdict_get_int(cpu, "thread_id"));
+
     monitor_printf(mon, "\n");
 }
 
@@ -922,6 +925,7 @@ static void do_info_cpus(Monitor *mon, Q
 #elif defined(TARGET_MIPS)
         qdict_put(cpu, "PC", qint_from_int(env->active_tc.PC));
 #endif
+        qdict_put(cpu, "thread_id", qint_from_int(env->thread_id));
 
         qlist_append(cpu_list, cpu);
     }

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch 4/8] kvm: x86: add mce support
  2010-10-11 18:31     ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-11 18:31       ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: mce --]
[-- Type: text/plain, Size: 4542 bytes --]

Port qemu-kvm's MCE support

commit c68b2374c9048812f488e00ffb95db66c0bc07a7
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Jul 20 10:00:53 2009 +0800

    Add MCE simulation support to qemu/kvm
    
    KVM ioctls are used to initialize MCE simulation and inject MCEs. The
    real MCE simulation is implemented in the Linux kernel. The kernel
    part has been merged.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/target-i386/helper.c
===================================================================
--- qemu.orig/target-i386/helper.c
+++ qemu/target-i386/helper.c
@@ -27,6 +27,7 @@
 #include "exec-all.h"
 #include "qemu-common.h"
 #include "kvm.h"
+#include "kvm_x86.h"
 
 //#define DEBUG_MMU
 
@@ -1030,6 +1031,11 @@ void cpu_inject_x86_mce(CPUState *cenv, 
     if (bank >= bank_num || !(status & MCI_STATUS_VAL))
         return;
 
+    if (kvm_enabled()) {
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        return;
+    }
+
     /*
      * if MSR_MCG_CTL is not all 1s, the uncorrected error
      * reporting is disabled
Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -27,6 +27,7 @@
 #include "hw/pc.h"
 #include "hw/apic.h"
 #include "ioport.h"
+#include "kvm_x86.h"
 
 #ifdef CONFIG_KVM_PARA
 #include <linux/kvm_para.h>
@@ -167,6 +168,67 @@ static int get_para_features(CPUState *e
 }
 #endif
 
+#ifdef KVM_CAP_MCE
+static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
+                                     int *max_banks)
+{
+    int r;
+
+    r = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_MCE);
+    if (r > 0) {
+        *max_banks = r;
+        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
+    }
+    return -ENOSYS;
+}
+
+static int kvm_setup_mce(CPUState *env, uint64_t *mcg_cap)
+{
+    return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
+}
+
+static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
+{
+    return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
+}
+
+struct kvm_x86_mce_data
+{
+    CPUState *env;
+    struct kvm_x86_mce *mce;
+};
+
+static void kvm_do_inject_x86_mce(void *_data)
+{
+    struct kvm_x86_mce_data *data = _data;
+    int r;
+
+    r = kvm_set_mce(data->env, data->mce);
+    if (r < 0)
+        perror("kvm_set_mce FAILED");
+}
+#endif
+
+void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+{
+#ifdef KVM_CAP_MCE
+    struct kvm_x86_mce mce = {
+        .bank = bank,
+        .status = status,
+        .mcg_status = mcg_status,
+        .addr = addr,
+        .misc = misc,
+    };
+    struct kvm_x86_mce_data data = {
+            .env = cenv,
+            .mce = &mce,
+    };
+
+    run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#endif
+}
+
 int kvm_arch_init_vcpu(CPUState *env)
 {
     struct {
@@ -274,6 +336,28 @@ int kvm_arch_init_vcpu(CPUState *env)
 
     cpuid_data.cpuid.nent = cpuid_i;
 
+#ifdef KVM_CAP_MCE
+    if (((env->cpuid_version >> 8)&0xF) >= 6
+        && (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
+        && kvm_check_extension(env->kvm_state, KVM_CAP_MCE) > 0) {
+        uint64_t mcg_cap;
+        int banks;
+
+        if (kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks))
+            perror("kvm_get_mce_cap_supported FAILED");
+        else {
+            if (banks > MCE_BANKS_DEF)
+                banks = MCE_BANKS_DEF;
+            mcg_cap &= MCE_CAP_DEF;
+            mcg_cap |= banks;
+            if (kvm_setup_mce(env, &mcg_cap))
+                perror("kvm_setup_mce FAILED");
+            else
+                env->mcg_cap = mcg_cap;
+        }
+    }
+#endif
+
     return kvm_vcpu_ioctl(env, KVM_SET_CPUID2, &cpuid_data);
 }
 
Index: qemu/target-i386/kvm_x86.h
===================================================================
--- /dev/null
+++ qemu/target-i386/kvm_x86.h
@@ -0,0 +1,21 @@
+/*
+ * QEMU KVM support
+ *
+ * Copyright (C) 2009 Red Hat Inc.
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __KVM_X86_H__
+#define __KVM_X86_H__
+
+void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+
+#endif



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch 4/8] kvm: x86: add mce support
@ 2010-10-11 18:31       ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: mce --]
[-- Type: text/plain, Size: 4540 bytes --]

Port qemu-kvm's MCE support

commit c68b2374c9048812f488e00ffb95db66c0bc07a7
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Jul 20 10:00:53 2009 +0800

    Add MCE simulation support to qemu/kvm
    
    KVM ioctls are used to initialize MCE simulation and inject MCEs. The
    real MCE simulation is implemented in the Linux kernel. The kernel
    part has been merged.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/target-i386/helper.c
===================================================================
--- qemu.orig/target-i386/helper.c
+++ qemu/target-i386/helper.c
@@ -27,6 +27,7 @@
 #include "exec-all.h"
 #include "qemu-common.h"
 #include "kvm.h"
+#include "kvm_x86.h"
 
 //#define DEBUG_MMU
 
@@ -1030,6 +1031,11 @@ void cpu_inject_x86_mce(CPUState *cenv, 
     if (bank >= bank_num || !(status & MCI_STATUS_VAL))
         return;
 
+    if (kvm_enabled()) {
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        return;
+    }
+
     /*
      * if MSR_MCG_CTL is not all 1s, the uncorrected error
      * reporting is disabled
Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -27,6 +27,7 @@
 #include "hw/pc.h"
 #include "hw/apic.h"
 #include "ioport.h"
+#include "kvm_x86.h"
 
 #ifdef CONFIG_KVM_PARA
 #include <linux/kvm_para.h>
@@ -167,6 +168,67 @@ static int get_para_features(CPUState *e
 }
 #endif
 
+#ifdef KVM_CAP_MCE
+static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
+                                     int *max_banks)
+{
+    int r;
+
+    r = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_MCE);
+    if (r > 0) {
+        *max_banks = r;
+        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
+    }
+    return -ENOSYS;
+}
+
+static int kvm_setup_mce(CPUState *env, uint64_t *mcg_cap)
+{
+    return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
+}
+
+static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
+{
+    return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
+}
+
+struct kvm_x86_mce_data
+{
+    CPUState *env;
+    struct kvm_x86_mce *mce;
+};
+
+static void kvm_do_inject_x86_mce(void *_data)
+{
+    struct kvm_x86_mce_data *data = _data;
+    int r;
+
+    r = kvm_set_mce(data->env, data->mce);
+    if (r < 0)
+        perror("kvm_set_mce FAILED");
+}
+#endif
+
+void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+{
+#ifdef KVM_CAP_MCE
+    struct kvm_x86_mce mce = {
+        .bank = bank,
+        .status = status,
+        .mcg_status = mcg_status,
+        .addr = addr,
+        .misc = misc,
+    };
+    struct kvm_x86_mce_data data = {
+            .env = cenv,
+            .mce = &mce,
+    };
+
+    run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#endif
+}
+
 int kvm_arch_init_vcpu(CPUState *env)
 {
     struct {
@@ -274,6 +336,28 @@ int kvm_arch_init_vcpu(CPUState *env)
 
     cpuid_data.cpuid.nent = cpuid_i;
 
+#ifdef KVM_CAP_MCE
+    if (((env->cpuid_version >> 8)&0xF) >= 6
+        && (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
+        && kvm_check_extension(env->kvm_state, KVM_CAP_MCE) > 0) {
+        uint64_t mcg_cap;
+        int banks;
+
+        if (kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks))
+            perror("kvm_get_mce_cap_supported FAILED");
+        else {
+            if (banks > MCE_BANKS_DEF)
+                banks = MCE_BANKS_DEF;
+            mcg_cap &= MCE_CAP_DEF;
+            mcg_cap |= banks;
+            if (kvm_setup_mce(env, &mcg_cap))
+                perror("kvm_setup_mce FAILED");
+            else
+                env->mcg_cap = mcg_cap;
+        }
+    }
+#endif
+
     return kvm_vcpu_ioctl(env, KVM_SET_CPUID2, &cpuid_data);
 }
 
Index: qemu/target-i386/kvm_x86.h
===================================================================
--- /dev/null
+++ qemu/target-i386/kvm_x86.h
@@ -0,0 +1,21 @@
+/*
+ * QEMU KVM support
+ *
+ * Copyright (C) 2009 Red Hat Inc.
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __KVM_X86_H__
+#define __KVM_X86_H__
+
+void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+
+#endif

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch 5/8] Export qemu_ram_addr_from_host
  2010-10-11 18:31     ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-11 18:31       ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: do_qemu_ram_addr_from_host --]
[-- Type: text/plain, Size: 3271 bytes --]

To be used by the next patches.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpu-common.h
===================================================================
--- qemu.orig/cpu-common.h
+++ qemu/cpu-common.h
@@ -47,7 +47,8 @@ void qemu_ram_free(ram_addr_t addr);
 /* This should only be used for ram local to a device.  */
 void *qemu_get_ram_ptr(ram_addr_t addr);
 /* This should not be used by devices.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr);
+int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
+ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr);
 
 int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
                            CPUWriteMemoryFunc * const *mem_write,
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -2086,7 +2086,7 @@ static inline void tlb_update_dirty(CPUT
     if ((tlb_entry->addr_write & ~TARGET_PAGE_MASK) == IO_MEM_RAM) {
         p = (void *)(unsigned long)((tlb_entry->addr_write & TARGET_PAGE_MASK)
             + tlb_entry->addend);
-        ram_addr = qemu_ram_addr_from_host(p);
+        ram_addr = qemu_ram_addr_from_host_nofail(p);
         if (!cpu_physical_memory_is_dirty(ram_addr)) {
             tlb_entry->addr_write |= TLB_NOTDIRTY;
         }
@@ -2938,23 +2938,31 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
     return NULL;
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr)
+int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
 {
     RAMBlock *block;
     uint8_t *host = ptr;
 
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (host - block->host < block->length) {
-            return block->offset + (host - block->host);
+            *ram_addr = block->offset + (host - block->host);
+            return 0;
         }
     }
+    return -1;
+}
 
-    fprintf(stderr, "Bad ram pointer %p\n", ptr);
-    abort();
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
+{
+    ram_addr_t ram_addr;
 
-    return 0;
+    if (qemu_ram_addr_from_host(ptr, &ram_addr)) {
+        fprintf(stderr, "Bad ram pointer %p\n", ptr);
+        abort();
+    }
+    return ram_addr;
 }
 
 static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)
@@ -3703,7 +3711,7 @@ void cpu_physical_memory_unmap(void *buf
 {
     if (buffer != bounce.buffer) {
         if (is_write) {
-            ram_addr_t addr1 = qemu_ram_addr_from_host(buffer);
+            ram_addr_t addr1 = qemu_ram_addr_from_host_nofail(buffer);
             while (access_len) {
                 unsigned l;
                 l = TARGET_PAGE_SIZE;
Index: qemu/exec-all.h
===================================================================
--- qemu.orig/exec-all.h
+++ qemu/exec-all.h
@@ -334,7 +334,7 @@ static inline tb_page_addr_t get_page_ad
     }
     p = (void *)(unsigned long)addr
         + env1->tlb_table[mmu_idx][page_index].addend;
-    return qemu_ram_addr_from_host(p);
+    return qemu_ram_addr_from_host_nofail(p);
 }
 #endif
 



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch 5/8] Export qemu_ram_addr_from_host
@ 2010-10-11 18:31       ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: do_qemu_ram_addr_from_host --]
[-- Type: text/plain, Size: 3269 bytes --]

To be used by the next patches.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpu-common.h
===================================================================
--- qemu.orig/cpu-common.h
+++ qemu/cpu-common.h
@@ -47,7 +47,8 @@ void qemu_ram_free(ram_addr_t addr);
 /* This should only be used for ram local to a device.  */
 void *qemu_get_ram_ptr(ram_addr_t addr);
 /* This should not be used by devices.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr);
+int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
+ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr);
 
 int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
                            CPUWriteMemoryFunc * const *mem_write,
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -2086,7 +2086,7 @@ static inline void tlb_update_dirty(CPUT
     if ((tlb_entry->addr_write & ~TARGET_PAGE_MASK) == IO_MEM_RAM) {
         p = (void *)(unsigned long)((tlb_entry->addr_write & TARGET_PAGE_MASK)
             + tlb_entry->addend);
-        ram_addr = qemu_ram_addr_from_host(p);
+        ram_addr = qemu_ram_addr_from_host_nofail(p);
         if (!cpu_physical_memory_is_dirty(ram_addr)) {
             tlb_entry->addr_write |= TLB_NOTDIRTY;
         }
@@ -2938,23 +2938,31 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
     return NULL;
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
-   (typically a TLB entry) back to a ram offset.  */
-ram_addr_t qemu_ram_addr_from_host(void *ptr)
+int qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
 {
     RAMBlock *block;
     uint8_t *host = ptr;
 
     QLIST_FOREACH(block, &ram_list.blocks, next) {
         if (host - block->host < block->length) {
-            return block->offset + (host - block->host);
+            *ram_addr = block->offset + (host - block->host);
+            return 0;
         }
     }
+    return -1;
+}
 
-    fprintf(stderr, "Bad ram pointer %p\n", ptr);
-    abort();
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
+{
+    ram_addr_t ram_addr;
 
-    return 0;
+    if (qemu_ram_addr_from_host(ptr, &ram_addr)) {
+        fprintf(stderr, "Bad ram pointer %p\n", ptr);
+        abort();
+    }
+    return ram_addr;
 }
 
 static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)
@@ -3703,7 +3711,7 @@ void cpu_physical_memory_unmap(void *buf
 {
     if (buffer != bounce.buffer) {
         if (is_write) {
-            ram_addr_t addr1 = qemu_ram_addr_from_host(buffer);
+            ram_addr_t addr1 = qemu_ram_addr_from_host_nofail(buffer);
             while (access_len) {
                 unsigned l;
                 l = TARGET_PAGE_SIZE;
Index: qemu/exec-all.h
===================================================================
--- qemu.orig/exec-all.h
+++ qemu/exec-all.h
@@ -334,7 +334,7 @@ static inline tb_page_addr_t get_page_ad
     }
     p = (void *)(unsigned long)addr
         + env1->tlb_table[mmu_idx][page_index].addend;
-    return qemu_ram_addr_from_host(p);
+    return qemu_ram_addr_from_host_nofail(p);
 }
 #endif
 

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch 6/8] Add RAM -> physical addr mapping in MCE simulation
  2010-10-11 18:31     ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-11 18:31       ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: kvm_physical_memory_addr_from_ram --]
[-- Type: text/plain, Size: 1710 bytes --]

From: Huang Ying <ying.huang@intel.com>

In QEMU-KVM, the physical address != the RAM address, while MCE
simulation needs the physical address instead of the RAM address. So
kvm_physical_memory_addr_from_ram() is implemented to do the
conversion, and it is invoked before the address is filled into the
IA32_MCi_ADDR MSR.

Reported-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/kvm-all.c
===================================================================
--- qemu.orig/kvm-all.c
+++ qemu/kvm-all.c
@@ -137,6 +137,24 @@ static KVMSlot *kvm_lookup_overlapping_s
     return found;
 }
 
+int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+                                      target_phys_addr_t *phys_addr)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
+        KVMSlot *mem = &s->slots[i];
+
+        if (ram_addr >= mem->phys_offset &&
+            ram_addr < mem->phys_offset + mem->memory_size) {
+            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
 static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
 {
     struct kvm_userspace_memory_region mem;
Index: qemu/kvm.h
===================================================================
--- qemu.orig/kvm.h
+++ qemu/kvm.h
@@ -174,6 +174,9 @@ static inline void cpu_synchronize_post_
     }
 }
 
+int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+                                      target_phys_addr_t *phys_addr);
+
 #endif
 int kvm_set_ioeventfd_mmio_long(int fd, uint32_t adr, uint32_t val, bool assign);
 



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch 6/8] Add RAM -> physical addr mapping in MCE simulation
@ 2010-10-11 18:31       ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: kvm_physical_memory_addr_from_ram --]
[-- Type: text/plain, Size: 1708 bytes --]

From: Huang Ying <ying.huang@intel.com>

In QEMU-KVM, the physical address != the RAM address, while MCE
simulation needs the physical address instead of the RAM address. So
kvm_physical_memory_addr_from_ram() is implemented to do the
conversion, and it is invoked before the address is filled into the
IA32_MCi_ADDR MSR.

Reported-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/kvm-all.c
===================================================================
--- qemu.orig/kvm-all.c
+++ qemu/kvm-all.c
@@ -137,6 +137,24 @@ static KVMSlot *kvm_lookup_overlapping_s
     return found;
 }
 
+int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+                                      target_phys_addr_t *phys_addr)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
+        KVMSlot *mem = &s->slots[i];
+
+        if (ram_addr >= mem->phys_offset &&
+            ram_addr < mem->phys_offset + mem->memory_size) {
+            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
 static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
 {
     struct kvm_userspace_memory_region mem;
Index: qemu/kvm.h
===================================================================
--- qemu.orig/kvm.h
+++ qemu/kvm.h
@@ -174,6 +174,9 @@ static inline void cpu_synchronize_post_
     }
 }
 
+int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+                                      target_phys_addr_t *phys_addr);
+
 #endif
 int kvm_set_ioeventfd_mmio_long(int fd, uint32_t adr, uint32_t val, bool assign);
 

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch 7/8] MCE: Relay UCR MCE to guest
  2010-10-11 18:31     ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-11 18:31       ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: kvm-mce-sigbus --]
[-- Type: text/plain, Size: 14907 bytes --]

Port qemu-kvm's

commit 4b62fff1101a7ad77553147717a8bd3bf79df7ef
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Sep 21 10:43:25 2009 +0800

    MCE: Relay UCR MCE to guest
    
    UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
    where some hardware errors, such as certain memory errors, can be
    reported without PCC (processor context corrupted). To recover from
    such an MCE, the corresponding memory will be unmapped, and all
    processes accessing the memory will be killed via SIGBUS.
    
    For KVM, if QEMU/KVM is killed, all guest processes will be killed
    too. So we relay SIGBUS from the host OS to the guest system via a
    UCR MCE injection. Then the guest OS can isolate the corresponding
    memory and kill only the necessary guest processes. A SIGBUS sent to
    the main thread (not the VCPU threads) will be broadcast to all VCPU
    threads as a UCR MCE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -34,6 +34,10 @@
 
 #include "cpus.h"
 #include "compatfd.h"
+#ifdef CONFIG_LINUX
+#include <sys/prctl.h>
+#include <sys/signalfd.h>
+#endif
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -41,6 +45,10 @@
 #define SIG_IPI SIGUSR1
 #endif
 
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
 static CPUState *next_cpu;
 
 /***********************************************************/
@@ -498,28 +506,77 @@ static void qemu_tcg_wait_io_event(void)
     }
 }
 
+static void sigbus_reraise(void)
+{
+    sigset_t set;
+    struct sigaction action;
+
+    memset(&action, 0, sizeof(action));
+    action.sa_handler = SIG_DFL;
+    if (!sigaction(SIGBUS, &action, NULL)) {
+        raise(SIGBUS);
+        sigemptyset(&set);
+        sigaddset(&set, SIGBUS);
+        sigprocmask(SIG_UNBLOCK, &set, NULL);
+    }
+    perror("Failed to re-raise SIGBUS!\n");
+    abort();
+}
+
+static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
+                           void *ctx)
+{
+#if defined(TARGET_I386)
+    if (kvm_on_sigbus(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
+#endif
+        sigbus_reraise();
+}
+
 static void qemu_kvm_eat_signal(CPUState *env, int timeout)
 {
     struct timespec ts;
     int r, e;
     siginfo_t siginfo;
     sigset_t waitset;
+    sigset_t chkset;
 
     ts.tv_sec = timeout / 1000;
     ts.tv_nsec = (timeout % 1000) * 1000000;
 
     sigemptyset(&waitset);
     sigaddset(&waitset, SIG_IPI);
+    sigaddset(&waitset, SIGBUS);
 
-    qemu_mutex_unlock(&qemu_global_mutex);
-    r = sigtimedwait(&waitset, &siginfo, &ts);
-    e = errno;
-    qemu_mutex_lock(&qemu_global_mutex);
+    do {
+        qemu_mutex_unlock(&qemu_global_mutex);
 
-    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
-        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
-        exit(1);
-    }
+        r = sigtimedwait(&waitset, &siginfo, &ts);
+        e = errno;
+
+        qemu_mutex_lock(&qemu_global_mutex);
+
+        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
+            exit(1);
+        }
+
+        switch (r) {
+        case SIGBUS:
+#ifdef TARGET_I386
+            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr))
+#endif
+                sigbus_reraise();
+            break;
+        default:
+            break;
+        }
+
+        r = sigpending(&chkset);
+        if (r == -1) {
+            fprintf(stderr, "sigpending: %s\n", strerror(e));
+            exit(1);
+        }
+    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 }
 
 static void qemu_kvm_wait_io_event(CPUState *env)
@@ -645,6 +702,7 @@ static void kvm_init_ipi(CPUState *env)
 
     pthread_sigmask(SIG_BLOCK, NULL, &set);
     sigdelset(&set, SIG_IPI);
+    sigdelset(&set, SIGBUS);
     r = kvm_set_signal_mask(env, &set);
     if (r) {
         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(r));
@@ -655,6 +713,7 @@ static void kvm_init_ipi(CPUState *env)
 static sigset_t block_io_signals(void)
 {
     sigset_t set;
+    struct sigaction action;
 
     /* SIGUSR2 used by posix-aio-compat.c */
     sigemptyset(&set);
@@ -665,8 +724,15 @@ static sigset_t block_io_signals(void)
     sigaddset(&set, SIGIO);
     sigaddset(&set, SIGALRM);
     sigaddset(&set, SIG_IPI);
+    sigaddset(&set, SIGBUS);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
 
+    memset(&action, 0, sizeof(action));
+    action.sa_flags = SA_SIGINFO;
+    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+    sigaction(SIGBUS, &action, NULL);
+    prctl(PR_MCE_KILL, 1, 1, 0, 0);
+
     return set;
 }
 
Index: qemu/kvm.h
===================================================================
--- qemu.orig/kvm.h
+++ qemu/kvm.h
@@ -110,6 +110,9 @@ int kvm_arch_init_vcpu(CPUState *env);
 
 void kvm_arch_reset_vcpu(CPUState *env);
 
+int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr);
+int kvm_on_sigbus(int code, void *addr);
+
 struct kvm_guest_debug;
 struct kvm_debug_exit_arch;
 
Index: qemu/target-i386/cpu.h
===================================================================
--- qemu.orig/target-i386/cpu.h
+++ qemu/target-i386/cpu.h
@@ -250,16 +250,32 @@
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
-#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
+#define MCG_CTL_P	(1ULL<<8)   /* MCG_CTL register available */
+#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
 
-#define MCE_CAP_DEF	MCG_CTL_P
+#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
 #define MCE_BANKS_DEF	10
 
+#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
 
 #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
 #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
 #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF	0	/* segment offset */
+#define MCM_ADDR_LINEAR	1	/* linear address */
+#define MCM_ADDR_PHYS	2	/* physical address */
+#define MCM_ADDR_MEM	3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 #define MSR_IA32_TSC                    0x10
 #define MSR_IA32_APICBASE               0x1b
Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -46,6 +46,13 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
 #ifdef KVM_CAP_EXT_CPUID
 
 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
@@ -192,10 +199,39 @@ static int kvm_set_mce(CPUState *env, st
     return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
 }
 
+static int kvm_get_msr(CPUState *env, struct kvm_msr_entry *msrs, int n)
+{
+    struct kvm_msrs *kmsrs = qemu_malloc(sizeof *kmsrs + n * sizeof *msrs);
+    int r;
+
+    kmsrs->nmsrs = n;
+    memcpy(kmsrs->entries, msrs, n * sizeof *msrs);
+    r = kvm_vcpu_ioctl(env, KVM_GET_MSRS, kmsrs);
+    memcpy(msrs, kmsrs->entries, n * sizeof *msrs);
+    free(kmsrs);
+    return r;
+}
+
+/* FIXME: kill this and kvm_get_msr, use env->mcg_status instead */
+static int kvm_mce_in_exception(CPUState *env)
+{
+    struct kvm_msr_entry msr_mcg_status = {
+        .index = MSR_MCG_STATUS,
+    };
+    int r;
+
+    r = kvm_get_msr(env, &msr_mcg_status, 1);
+    if (r == -1 || r == 0) {
+        return -1;
+    }
+    return !!(msr_mcg_status.data & MCG_STATUS_MCIP);
+}
+
 struct kvm_x86_mce_data
 {
     CPUState *env;
     struct kvm_x86_mce *mce;
+    int abort_on_error;
 };
 
 static void kvm_do_inject_x86_mce(void *_data)
@@ -203,14 +239,26 @@ static void kvm_do_inject_x86_mce(void *
     struct kvm_x86_mce_data *data = _data;
     int r;
 
+    /* If there is an MCE exception being processed, ignore this SRAO MCE */
+    r = kvm_mce_in_exception(data->env);
+    if (r == -1)
+        fprintf(stderr, "Failed to get MCE status\n");
+    else if (r && !(data->mce->status & MCI_STATUS_AR))
+        return;
+
     r = kvm_set_mce(data->env, data->mce);
-    if (r < 0)
+    if (r < 0) {
         perror("kvm_set_mce FAILED");
+        if (data->abort_on_error) {
+            abort();
+        }
+    }
 }
 #endif
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error)
 {
 #ifdef KVM_CAP_MCE
     struct kvm_x86_mce mce = {
@@ -225,7 +273,15 @@ void kvm_inject_x86_mce(CPUState *cenv, 
             .mce = &mce,
     };
 
+    if (!cenv->mcg_cap) {
+        fprintf(stderr, "MCE support is not enabled!\n");
+        return;
+    }
+
     run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#else
+    if (abort_on_error)
+        abort();
 #endif
 }
 
@@ -1525,3 +1581,122 @@ bool kvm_arch_stop_on_emulation_error(CP
               ((env->segs[R_CS].selector  & 3) != 3);
 }
 
+static void hardware_memory_error(void)
+{
+    fprintf(stderr, "Hardware memory error!\n");
+    exit(1);
+}
+
+int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    struct kvm_x86_mce mce = {
+            .bank = 9,
+    };
+    void *vaddr;
+    ram_addr_t ram_addr;
+    unsigned long paddr;
+    int r;
+
+    if ((env->mcg_cap & MCG_SER_P) && addr
+        && (code == BUS_MCEERR_AR
+            || code == BUS_MCEERR_AO)) {
+        if (code == BUS_MCEERR_AR) {
+            /* Fake an Intel architectural Data Load SRAR UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | MCI_STATUS_AR | 0x134;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+        } else {
+            /*
+             * If there is an MCE exception being processed, ignore
+             * this SRAO MCE
+             */
+            r = kvm_mce_in_exception(env);
+            if (r == -1) {
+                fprintf(stderr, "Failed to get MCE status\n");
+            } else if (r) {
+                return 0;
+            }
+            /* Fake an Intel architectural Memory scrubbing UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | 0xc0;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+        }
+        vaddr = (void *)addr;
+        if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!\n");
+            /* Hope we are lucky for AO MCE */
+            if (code == BUS_MCEERR_AO) {
+                return 0;
+            } else {
+                hardware_memory_error();
+            }
+        }
+        mce.addr = paddr;
+        r = kvm_set_mce(env, &mce);
+        if (r < 0) {
+            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+            abort();
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int kvm_on_sigbus(int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    if ((first_cpu->mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) {
+        uint64_t status;
+        void *vaddr;
+        ram_addr_t ram_addr;
+        unsigned long paddr;
+        CPUState *cenv;
+
+        /* Hope we are lucky for AO MCE */
+        vaddr = addr;
+        if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!: %p\n", addr);
+            return 0;
+        }
+        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+            | 0xc0;
+        kvm_inject_x86_mce(first_cpu, 9, status,
+                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
+                           (MCM_ADDR_PHYS << 6) | 0xc, 1);
+        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) {
+            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
Index: qemu/target-i386/helper.c
===================================================================
--- qemu.orig/target-i386/helper.c
+++ qemu/target-i386/helper.c
@@ -1032,7 +1032,7 @@ void cpu_inject_x86_mce(CPUState *cenv, 
         return;
 
     if (kvm_enabled()) {
-        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
         return;
     }
 
Index: qemu/target-i386/kvm_x86.h
===================================================================
--- qemu.orig/target-i386/kvm_x86.h
+++ qemu/target-i386/kvm_x86.h
@@ -16,6 +16,7 @@
 #define __KVM_X86_H__
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error);
 
 #endif
Index: qemu/kvm-stub.c
===================================================================
--- qemu.orig/kvm-stub.c
+++ qemu/kvm-stub.c
@@ -141,3 +141,9 @@ int kvm_set_ioeventfd_mmio_long(int fd, 
 {
     return -ENOSYS;
 }
+
+int kvm_on_sigbus(int code, void *addr)
+{
+    return 1;
+}
+



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch 7/8] MCE: Relay UCR MCE to guest
@ 2010-10-11 18:31       ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: kvm-mce-sigbus --]
[-- Type: text/plain, Size: 14905 bytes --]

Port qemu-kvm's

commit 4b62fff1101a7ad77553147717a8bd3bf79df7ef
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Sep 21 10:43:25 2009 +0800

    MCE: Relay UCR MCE to guest
    
     UCR (uncorrected recoverable) MCEs are supported in recent Intel CPUs,
     where some hardware errors, such as certain memory errors, can be
     reported without PCC (processor context corrupt) being set. To recover
     from such an MCE, the corresponding memory is unmapped, and all
     processes accessing that memory are killed via SIGBUS.
    
    For KVM, if QEMU/KVM is killed, all guest processes will be killed
    too. So we relay SIGBUS from host OS to guest system via a UCR MCE
    injection. Then guest OS can isolate corresponding memory and kill
    necessary guest processes only. SIGBUS sent to main thread (not VCPU
    threads) will be broadcast to all VCPU threads as UCR MCE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/cpus.c
===================================================================
--- qemu.orig/cpus.c
+++ qemu/cpus.c
@@ -34,6 +34,10 @@
 
 #include "cpus.h"
 #include "compatfd.h"
+#ifdef CONFIG_LINUX
+#include <sys/prctl.h>
+#include <sys/signalfd.h>
+#endif
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -41,6 +45,10 @@
 #define SIG_IPI SIGUSR1
 #endif
 
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
 static CPUState *next_cpu;
 
 /***********************************************************/
@@ -498,28 +506,77 @@ static void qemu_tcg_wait_io_event(void)
     }
 }
 
+static void sigbus_reraise(void)
+{
+    sigset_t set;
+    struct sigaction action;
+
+    memset(&action, 0, sizeof(action));
+    action.sa_handler = SIG_DFL;
+    if (!sigaction(SIGBUS, &action, NULL)) {
+        raise(SIGBUS);
+        sigemptyset(&set);
+        sigaddset(&set, SIGBUS);
+        sigprocmask(SIG_UNBLOCK, &set, NULL);
+    }
+    perror("Failed to re-raise SIGBUS!\n");
+    abort();
+}
+
+static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
+                           void *ctx)
+{
+#if defined(TARGET_I386)
+    if (kvm_on_sigbus(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
+#endif
+        sigbus_reraise();
+}
+
 static void qemu_kvm_eat_signal(CPUState *env, int timeout)
 {
     struct timespec ts;
     int r, e;
     siginfo_t siginfo;
     sigset_t waitset;
+    sigset_t chkset;
 
     ts.tv_sec = timeout / 1000;
     ts.tv_nsec = (timeout % 1000) * 1000000;
 
     sigemptyset(&waitset);
     sigaddset(&waitset, SIG_IPI);
+    sigaddset(&waitset, SIGBUS);
 
-    qemu_mutex_unlock(&qemu_global_mutex);
-    r = sigtimedwait(&waitset, &siginfo, &ts);
-    e = errno;
-    qemu_mutex_lock(&qemu_global_mutex);
+    do {
+        qemu_mutex_unlock(&qemu_global_mutex);
 
-    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
-        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
-        exit(1);
-    }
+        r = sigtimedwait(&waitset, &siginfo, &ts);
+        e = errno;
+
+        qemu_mutex_lock(&qemu_global_mutex);
+
+        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
+            exit(1);
+        }
+
+        switch (r) {
+        case SIGBUS:
+#ifdef TARGET_I386
+            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr))
+#endif
+                sigbus_reraise();
+            break;
+        default:
+            break;
+        }
+
+        r = sigpending(&chkset);
+        if (r == -1) {
+            fprintf(stderr, "sigpending: %s\n", strerror(e));
+            exit(1);
+        }
+    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 }
 
 static void qemu_kvm_wait_io_event(CPUState *env)
@@ -645,6 +702,7 @@ static void kvm_init_ipi(CPUState *env)
 
     pthread_sigmask(SIG_BLOCK, NULL, &set);
     sigdelset(&set, SIG_IPI);
+    sigdelset(&set, SIGBUS);
     r = kvm_set_signal_mask(env, &set);
     if (r) {
         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(r));
@@ -655,6 +713,7 @@ static void kvm_init_ipi(CPUState *env)
 static sigset_t block_io_signals(void)
 {
     sigset_t set;
+    struct sigaction action;
 
     /* SIGUSR2 used by posix-aio-compat.c */
     sigemptyset(&set);
@@ -665,8 +724,15 @@ static sigset_t block_io_signals(void)
     sigaddset(&set, SIGIO);
     sigaddset(&set, SIGALRM);
     sigaddset(&set, SIG_IPI);
+    sigaddset(&set, SIGBUS);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
 
+    memset(&action, 0, sizeof(action));
+    action.sa_flags = SA_SIGINFO;
+    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+    sigaction(SIGBUS, &action, NULL);
+    prctl(PR_MCE_KILL, 1, 1, 0, 0);
+
     return set;
 }
 
Index: qemu/kvm.h
===================================================================
--- qemu.orig/kvm.h
+++ qemu/kvm.h
@@ -110,6 +110,9 @@ int kvm_arch_init_vcpu(CPUState *env);
 
 void kvm_arch_reset_vcpu(CPUState *env);
 
+int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr);
+int kvm_on_sigbus(int code, void *addr);
+
 struct kvm_guest_debug;
 struct kvm_debug_exit_arch;
 
Index: qemu/target-i386/cpu.h
===================================================================
--- qemu.orig/target-i386/cpu.h
+++ qemu/target-i386/cpu.h
@@ -250,16 +250,32 @@
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
-#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
+#define MCG_CTL_P	(1ULL<<8)   /* MCG_CTL register available */
+#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
 
-#define MCE_CAP_DEF	MCG_CTL_P
+#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
 #define MCE_BANKS_DEF	10
 
+#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
 
 #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
 #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
 #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF	0	/* segment offset */
+#define MCM_ADDR_LINEAR	1	/* linear address */
+#define MCM_ADDR_PHYS	2	/* physical address */
+#define MCM_ADDR_MEM	3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 #define MSR_IA32_TSC                    0x10
 #define MSR_IA32_APICBASE               0x1b
Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -46,6 +46,13 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
 #ifdef KVM_CAP_EXT_CPUID
 
 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
@@ -192,10 +199,39 @@ static int kvm_set_mce(CPUState *env, st
     return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
 }
 
+static int kvm_get_msr(CPUState *env, struct kvm_msr_entry *msrs, int n)
+{
+    struct kvm_msrs *kmsrs = qemu_malloc(sizeof *kmsrs + n * sizeof *msrs);
+    int r;
+
+    kmsrs->nmsrs = n;
+    memcpy(kmsrs->entries, msrs, n * sizeof *msrs);
+    r = kvm_vcpu_ioctl(env, KVM_GET_MSRS, kmsrs);
+    memcpy(msrs, kmsrs->entries, n * sizeof *msrs);
+    free(kmsrs);
+    return r;
+}
+
+/* FIXME: kill this and kvm_get_msr, use env->mcg_status instead */
+static int kvm_mce_in_exception(CPUState *env)
+{
+    struct kvm_msr_entry msr_mcg_status = {
+        .index = MSR_MCG_STATUS,
+    };
+    int r;
+
+    r = kvm_get_msr(env, &msr_mcg_status, 1);
+    if (r == -1 || r == 0) {
+        return -1;
+    }
+    return !!(msr_mcg_status.data & MCG_STATUS_MCIP);
+}
+
 struct kvm_x86_mce_data
 {
     CPUState *env;
     struct kvm_x86_mce *mce;
+    int abort_on_error;
 };
 
 static void kvm_do_inject_x86_mce(void *_data)
@@ -203,14 +239,26 @@ static void kvm_do_inject_x86_mce(void *
     struct kvm_x86_mce_data *data = _data;
     int r;
 
+    /* If there is an MCE exception being processed, ignore this SRAO MCE */
+    r = kvm_mce_in_exception(data->env);
+    if (r == -1)
+        fprintf(stderr, "Failed to get MCE status\n");
+    else if (r && !(data->mce->status & MCI_STATUS_AR))
+        return;
+
     r = kvm_set_mce(data->env, data->mce);
-    if (r < 0)
+    if (r < 0) {
         perror("kvm_set_mce FAILED");
+        if (data->abort_on_error) {
+            abort();
+        }
+    }
 }
 #endif
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error)
 {
 #ifdef KVM_CAP_MCE
     struct kvm_x86_mce mce = {
@@ -225,7 +273,15 @@ void kvm_inject_x86_mce(CPUState *cenv, 
             .mce = &mce,
     };
 
+    if (!cenv->mcg_cap) {
+        fprintf(stderr, "MCE support is not enabled!\n");
+        return;
+    }
+
     run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#else
+    if (abort_on_error)
+        abort();
 #endif
 }
 
@@ -1525,3 +1581,122 @@ bool kvm_arch_stop_on_emulation_error(CP
               ((env->segs[R_CS].selector  & 3) != 3);
 }
 
+static void hardware_memory_error(void)
+{
+    fprintf(stderr, "Hardware memory error!\n");
+    exit(1);
+}
+
+int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    struct kvm_x86_mce mce = {
+            .bank = 9,
+    };
+    void *vaddr;
+    ram_addr_t ram_addr;
+    unsigned long paddr;
+    int r;
+
+    if ((env->mcg_cap & MCG_SER_P) && addr
+        && (code == BUS_MCEERR_AR
+            || code == BUS_MCEERR_AO)) {
+        if (code == BUS_MCEERR_AR) {
+            /* Fake an Intel architectural Data Load SRAR UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | MCI_STATUS_AR | 0x134;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+        } else {
+            /*
+             * If there is an MCE exception being processed, ignore
+             * this SRAO MCE
+             */
+            r = kvm_mce_in_exception(env);
+            if (r == -1) {
+                fprintf(stderr, "Failed to get MCE status\n");
+            } else if (r) {
+                return 0;
+            }
+            /* Fake an Intel architectural Memory scrubbing UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | 0xc0;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+        }
+        vaddr = (void *)addr;
+        if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!\n");
+            /* Hope we are lucky for AO MCE */
+            if (code == BUS_MCEERR_AO) {
+                return 0;
+            } else {
+                hardware_memory_error();
+            }
+        }
+        mce.addr = paddr;
+        r = kvm_set_mce(env, &mce);
+        if (r < 0) {
+            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+            abort();
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int kvm_on_sigbus(int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    if ((first_cpu->mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) {
+        uint64_t status;
+        void *vaddr;
+        ram_addr_t ram_addr;
+        unsigned long paddr;
+        CPUState *cenv;
+
+        /* Hope we are lucky for AO MCE */
+        vaddr = addr;
+        if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!: %p\n", addr);
+            return 0;
+        }
+        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+            | 0xc0;
+        kvm_inject_x86_mce(first_cpu, 9, status,
+                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
+                           (MCM_ADDR_PHYS << 6) | 0xc, 1);
+        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) {
+            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
Index: qemu/target-i386/helper.c
===================================================================
--- qemu.orig/target-i386/helper.c
+++ qemu/target-i386/helper.c
@@ -1032,7 +1032,7 @@ void cpu_inject_x86_mce(CPUState *cenv, 
         return;
 
     if (kvm_enabled()) {
-        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
         return;
     }
 
Index: qemu/target-i386/kvm_x86.h
===================================================================
--- qemu.orig/target-i386/kvm_x86.h
+++ qemu/target-i386/kvm_x86.h
@@ -16,6 +16,7 @@
 #define __KVM_X86_H__
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error);
 
 #endif
Index: qemu/kvm-stub.c
===================================================================
--- qemu.orig/kvm-stub.c
+++ qemu/kvm-stub.c
@@ -141,3 +141,9 @@ int kvm_set_ioeventfd_mmio_long(int fd, 
 {
     return -ENOSYS;
 }
+
+int kvm_on_sigbus(int code, void *addr)
+{
+    return 1;
+}
+

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [patch 8/8] Add savevm/loadvm support for MCE
  2010-10-11 18:31     ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-11 18:31       ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Huang Ying, Dean Nelson, Marcelo Tosatti

[-- Attachment #1: mce-save-restore --]
[-- Type: text/plain, Size: 2913 bytes --]

Port qemu-kvm's

commit 1bab5d11545d8de5facf46c28630085a2f9651ae
Author: Huang Ying <ying.huang@intel.com>
Date:   Wed Mar 3 16:52:46 2010 +0800

    Add savevm/loadvm support for MCE
    
     MCE registers are saved/loaded into/from CPUState in
    kvm_arch_save/load_regs. To simulate the MCG_STATUS clearing upon
    reset, MSR_MCG_STATUS is set to 0 for KVM_PUT_RESET_STATE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -774,7 +774,7 @@ static int kvm_put_msrs(CPUState *env, i
         struct kvm_msr_entry entries[100];
     } msr_data;
     struct kvm_msr_entry *msrs = msr_data.entries;
-    int n = 0;
+    int i, n = 0;
 
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
@@ -794,6 +794,18 @@ static int kvm_put_msrs(CPUState *env, i
                           env->system_time_msr);
         kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
     }
+#ifdef KVM_CAP_MCE
+    if (env->mcg_cap) {
+        if (level == KVM_PUT_RESET_STATE)
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
+        else if (level == KVM_PUT_FULL_STATE) {
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
+            for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++)
+                kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
+        }
+    }
+#endif
 
     msr_data.info.nmsrs = n;
 
@@ -1001,6 +1013,15 @@ static int kvm_get_msrs(CPUState *env)
     msrs[n++].index = MSR_KVM_SYSTEM_TIME;
     msrs[n++].index = MSR_KVM_WALL_CLOCK;
 
+#ifdef KVM_CAP_MCE
+    if (env->mcg_cap) {
+        msrs[n++].index = MSR_MCG_STATUS;
+        msrs[n++].index = MSR_MCG_CTL;
+        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++)
+            msrs[n++].index = MSR_MC0_CTL + i;
+    }
+#endif
+
     msr_data.info.nmsrs = n;
     ret = kvm_vcpu_ioctl(env, KVM_GET_MSRS, &msr_data);
     if (ret < 0)
@@ -1043,6 +1064,22 @@ static int kvm_get_msrs(CPUState *env)
         case MSR_KVM_WALL_CLOCK:
             env->wall_clock_msr = msrs[i].data;
             break;
+#ifdef KVM_CAP_MCE
+        case MSR_MCG_STATUS:
+            env->mcg_status = msrs[i].data;
+            break;
+        case MSR_MCG_CTL:
+            env->mcg_ctl = msrs[i].data;
+            break;
+#endif
+        default:
+#ifdef KVM_CAP_MCE
+            if (msrs[i].index >= MSR_MC0_CTL &&
+                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
+                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
+                break;
+            }
+#endif
         }
     }
 



^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] [patch 8/8] Add savevm/loadvm support for MCE
@ 2010-10-11 18:31       ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-11 18:31 UTC (permalink / raw)
  To: kvm, qemu-devel; +Cc: Dean Nelson, Marcelo Tosatti, Huang Ying

[-- Attachment #1: mce-save-restore --]
[-- Type: text/plain, Size: 2911 bytes --]

Port qemu-kvm's

commit 1bab5d11545d8de5facf46c28630085a2f9651ae
Author: Huang Ying <ying.huang@intel.com>
Date:   Wed Mar 3 16:52:46 2010 +0800

    Add savevm/loadvm support for MCE
    
    MCE registers are saved into/loaded from CPUState in
    kvm_arch_save/load_regs. To simulate the MCG_STATUS clearing upon
    reset, MSR_MCG_STATUS is set to 0 for KVM_PUT_RESET_STATE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: qemu/target-i386/kvm.c
===================================================================
--- qemu.orig/target-i386/kvm.c
+++ qemu/target-i386/kvm.c
@@ -774,7 +774,7 @@ static int kvm_put_msrs(CPUState *env, i
         struct kvm_msr_entry entries[100];
     } msr_data;
     struct kvm_msr_entry *msrs = msr_data.entries;
-    int n = 0;
+    int i, n = 0;
 
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
@@ -794,6 +794,18 @@ static int kvm_put_msrs(CPUState *env, i
                           env->system_time_msr);
         kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
     }
+#ifdef KVM_CAP_MCE
+    if (env->mcg_cap) {
+        if (level == KVM_PUT_RESET_STATE)
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
+        else if (level == KVM_PUT_FULL_STATE) {
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
+            kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
+            for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++)
+                kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
+        }
+    }
+#endif
 
     msr_data.info.nmsrs = n;
 
@@ -1001,6 +1013,15 @@ static int kvm_get_msrs(CPUState *env)
     msrs[n++].index = MSR_KVM_SYSTEM_TIME;
     msrs[n++].index = MSR_KVM_WALL_CLOCK;
 
+#ifdef KVM_CAP_MCE
+    if (env->mcg_cap) {
+        msrs[n++].index = MSR_MCG_STATUS;
+        msrs[n++].index = MSR_MCG_CTL;
+        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++)
+            msrs[n++].index = MSR_MC0_CTL + i;
+    }
+#endif
+
     msr_data.info.nmsrs = n;
     ret = kvm_vcpu_ioctl(env, KVM_GET_MSRS, &msr_data);
     if (ret < 0)
@@ -1043,6 +1064,22 @@ static int kvm_get_msrs(CPUState *env)
         case MSR_KVM_WALL_CLOCK:
             env->wall_clock_msr = msrs[i].data;
             break;
+#ifdef KVM_CAP_MCE
+        case MSR_MCG_STATUS:
+            env->mcg_status = msrs[i].data;
+            break;
+        case MSR_MCG_CTL:
+            env->mcg_ctl = msrs[i].data;
+            break;
+#endif
+        default:
+#ifdef KVM_CAP_MCE
+            if (msrs[i].index >= MSR_MC0_CTL &&
+                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
+                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
+                break;
+            }
+#endif
         }
     }
 

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 0/8] port qemu-kvm's MCE support (v3)
  2010-10-11 18:31     ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-14 10:25       ` Avi Kivity
  -1 siblings, 0 replies; 93+ messages in thread
From: Avi Kivity @ 2010-10-14 10:25 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, qemu-devel, Huang Ying, Dean Nelson

  On 10/11/2010 08:31 PM, Marcelo Tosatti wrote:
> Port qemu-kvm's KVM MCE (Machine Check Exception) handling to qemu. It
> allows qemu to propagate MCEs to the guest.
>
> v2:
> - rename do_qemu_ram_addr_from_host.
> - fix kvm_on_sigbus/kvm_on_sigbus_vcpu naming.
> - fix bank register restoration (Dean Nelson).
>
> v3:
> - condition MCE generation on MCG_SER_P bit (Huang Ying).
>

I only see patches 1 and 4 from v2, and this cover letter from v3.  
Please repost.

Also, if the patchset ends up with qemu-kvm master being different from 
uq/master in this area, please post the corresponding qemu-kvm master 
patches.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch 0/8] port qemu-kvm's MCE support (v3)
@ 2010-10-14 10:25       ` Avi Kivity
  0 siblings, 0 replies; 93+ messages in thread
From: Avi Kivity @ 2010-10-14 10:25 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Dean Nelson, qemu-devel, kvm, Huang Ying

  On 10/11/2010 08:31 PM, Marcelo Tosatti wrote:
> Port qemu-kvm's KVM MCE (Machine Check Exception) handling to qemu. It
> allows qemu to propagate MCEs to the guest.
>
> v2:
> - rename do_qemu_ram_addr_from_host.
> - fix kvm_on_sigbus/kvm_on_sigbus_vcpu naming.
> - fix bank register restoration (Dean Nelson).
>
> v3:
> - condition MCE generation on MCG_SER_P bit (Huang Ying).
>

I only see patches 1 and 4 from v2, and this cover letter from v3.  
Please repost.

Also, if the patchset ends up with qemu-kvm master being different from 
uq/master in this area, please post the corresponding qemu-kvm master 
patches.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 0/8] port qemu-kvm's MCE support (v3)
  2010-10-14 10:25       ` [Qemu-devel] " Avi Kivity
@ 2010-10-14 16:21         ` Marcelo Tosatti
  -1 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-14 16:21 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm, qemu-devel, Huang Ying, Dean Nelson

On Thu, Oct 14, 2010 at 12:25:34PM +0200, Avi Kivity wrote:
>  On 10/11/2010 08:31 PM, Marcelo Tosatti wrote:
> >Port qemu-kvm's KVM MCE (Machine Check Exception) handling to qemu. It
> >allows qemu to propagate MCEs to the guest.
> >
> >v2:
> >- rename do_qemu_ram_addr_from_host.
> >- fix kvm_on_sigbus/kvm_on_sigbus_vcpu naming.
> >- fix bank register restoration (Dean Nelson).
> >
> >v3:
> >- condition MCE generation on MCG_SER_P bit (Huang Ying).
> >
> 
> I only see patches 1 and 4 from v2, and this cover letter from v3.
> Please repost.

Done.

> Also, if the patchset ends up with qemu-kvm master being different
> from uq/master in this area, please post the corresponding qemu-kvm
> master patches.

Nope. I'll fix it up on the next qemu merge.


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch 0/8] port qemu-kvm's MCE support (v3)
@ 2010-10-14 16:21         ` Marcelo Tosatti
  0 siblings, 0 replies; 93+ messages in thread
From: Marcelo Tosatti @ 2010-10-14 16:21 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Dean Nelson, qemu-devel, kvm, Huang Ying

On Thu, Oct 14, 2010 at 12:25:34PM +0200, Avi Kivity wrote:
>  On 10/11/2010 08:31 PM, Marcelo Tosatti wrote:
> >Port qemu-kvm's KVM MCE (Machine Check Exception) handling to qemu. It
> >allows qemu to propagate MCEs to the guest.
> >
> >v2:
> >- rename do_qemu_ram_addr_from_host.
> >- fix kvm_on_sigbus/kvm_on_sigbus_vcpu naming.
> >- fix bank register restoration (Dean Nelson).
> >
> >v3:
> >- condition MCE generation on MCG_SER_P bit (Huang Ying).
> >
> 
> I only see patches 1 and 4 from v2, and this cover letter from v3.
> Please repost.

Done.

> Also, if the patchset ends up with qemu-kvm master being different
> from uq/master in this area, please post the corresponding qemu-kvm
> master patches.

Nope. I'll fix it up on the next qemu merge.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 0/8] port qemu-kvm's MCE support (v3 resend)
  2010-10-11 18:31     ` [Qemu-devel] " Marcelo Tosatti
@ 2010-10-17  9:32       ` Avi Kivity
  -1 siblings, 0 replies; 93+ messages in thread
From: Avi Kivity @ 2010-10-17  9:32 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, qemu-devel, Huang Ying, Dean Nelson

  On 10/11/2010 08:31 PM, Marcelo Tosatti wrote:
> Port qemu-kvm's KVM MCE (Machine Check Exception) handling to qemu. It
> allows qemu to propagate MCEs to the guest.
>
> v2:
> - rename do_qemu_ram_addr_from_host.
> - fix kvm_on_sigbus/kvm_on_sigbus_vcpu naming.
> - fix bank register restoration (Dean Nelson).
>
> v3:
> - condition MCE generation on MCG_SER_P bit (Huang Ying).
>
>

Thanks, applied to uq/master.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [Qemu-devel] Re: [patch 0/8] port qemu-kvm's MCE support (v3 resend)
@ 2010-10-17  9:32       ` Avi Kivity
  0 siblings, 0 replies; 93+ messages in thread
From: Avi Kivity @ 2010-10-17  9:32 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Dean Nelson, qemu-devel, kvm, Huang Ying

  On 10/11/2010 08:31 PM, Marcelo Tosatti wrote:
> Port qemu-kvm's KVM MCE (Machine Check Exception) handling to qemu. It
> allows qemu to propagate MCEs to the guest.
>
> v2:
> - rename do_qemu_ram_addr_from_host.
> - fix kvm_on_sigbus/kvm_on_sigbus_vcpu naming.
> - fix bank register restoration (Dean Nelson).
>
> v3:
> - condition MCE generation on MCG_SER_P bit (Huang Ying).
>
>

Thanks, applied to uq/master.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 93+ messages in thread

end of thread, other threads:[~2010-10-17  9:32 UTC | newest]

Thread overview: 93+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-10-04 18:54 [patch uq/master 0/8] port qemu-kvm's MCE support Marcelo Tosatti
2010-10-04 18:54 ` [Qemu-devel] " Marcelo Tosatti
2010-10-04 18:54 ` [patch uq/master 1/8] signalfd compatibility Marcelo Tosatti
2010-10-04 18:54   ` [Qemu-devel] " Marcelo Tosatti
2010-10-04 18:54 ` [patch uq/master 2/8] iothread: use signalfd Marcelo Tosatti
2010-10-04 18:54   ` [Qemu-devel] " Marcelo Tosatti
2010-10-04 18:54 ` [patch uq/master 3/8] Expose thread_id in info cpus Marcelo Tosatti
2010-10-04 18:54   ` [Qemu-devel] " Marcelo Tosatti
2010-10-04 18:54 ` [patch uq/master 4/8] kvm: x86: add mce support Marcelo Tosatti
2010-10-04 18:54   ` [Qemu-devel] " Marcelo Tosatti
2010-10-04 18:54 ` [patch uq/master 5/8] Export qemu_ram_addr_from_host Marcelo Tosatti
2010-10-04 18:54   ` [Qemu-devel] " Marcelo Tosatti
2010-10-05 12:57   ` Anthony Liguori
2010-10-05 12:57     ` [Qemu-devel] " Anthony Liguori
2010-10-05 20:13     ` Marcelo Tosatti
2010-10-05 20:13       ` [Qemu-devel] " Marcelo Tosatti
2010-10-05 20:48       ` Anthony Liguori
2010-10-05 20:48         ` [Qemu-devel] " Anthony Liguori
2010-10-04 18:54 ` [patch uq/master 6/8] Add RAM -> physical addr mapping in MCE simulation Marcelo Tosatti
2010-10-04 18:54   ` [Qemu-devel] " Marcelo Tosatti
2010-10-04 18:54 ` [patch uq/master 7/8] MCE: Relay UCR MCE to guest Marcelo Tosatti
2010-10-04 18:54   ` [Qemu-devel] " Marcelo Tosatti
2010-10-06  1:10   ` Hidetoshi Seto
2010-10-06  1:10     ` [Qemu-devel] " Hidetoshi Seto
2010-10-06 16:02     ` Marcelo Tosatti
2010-10-06 16:02       ` [Qemu-devel] " Marcelo Tosatti
2010-10-06  1:58   ` Hidetoshi Seto
2010-10-06  1:58     ` [Qemu-devel] " Hidetoshi Seto
2010-10-06 16:05     ` Marcelo Tosatti
2010-10-06 16:05       ` [Qemu-devel] " Marcelo Tosatti
2010-10-06 18:10       ` Dean Nelson
2010-10-06 18:10         ` [Qemu-devel] " Dean Nelson
2010-10-07  3:41         ` Hidetoshi Seto
2010-10-07  3:41           ` [Qemu-devel] " Hidetoshi Seto
2010-10-07 15:23           ` Dean Nelson
2010-10-07 15:23             ` [Qemu-devel] " Dean Nelson
2010-10-08  3:15           ` Huang Ying
2010-10-08  3:15             ` [Qemu-devel] " Huang Ying
2010-10-08  5:54             ` Hidetoshi Seto
2010-10-08  5:54               ` [Qemu-devel] " Hidetoshi Seto
2010-10-08 12:02             ` Dean Nelson
2010-10-08 12:02               ` [Qemu-devel] " Dean Nelson
2010-10-08  2:50       ` Huang Ying
2010-10-08  2:50         ` [Qemu-devel] " Huang Ying
2010-10-04 18:54 ` [patch uq/master 8/8] Add savevm/loadvm support for MCE Marcelo Tosatti
2010-10-04 18:54   ` [Qemu-devel] " Marcelo Tosatti
2010-10-05 16:31 ` [Qemu-devel] [patch uq/master 0/8] port qemu-kvm's MCE support Andreas Färber
2010-10-05 18:58   ` Chris Wright
2010-10-05 20:24     ` Marcelo Tosatti
2010-10-06 17:34 ` [patch uq/master 0/8] port qemu-kvm's MCE support (v2) Marcelo Tosatti
2010-10-06 17:34   ` [Qemu-devel] " Marcelo Tosatti
2010-10-06 17:34   ` [patch uq/master 1/8] signalfd compatibility Marcelo Tosatti
2010-10-06 17:34     ` [Qemu-devel] " Marcelo Tosatti
2010-10-06 17:34   ` [patch uq/master 2/8] iothread: use signalfd Marcelo Tosatti
2010-10-06 17:34     ` [Qemu-devel] " Marcelo Tosatti
2010-10-06 17:34   ` [patch uq/master 3/8] Expose thread_id in info cpus Marcelo Tosatti
2010-10-06 17:34     ` [Qemu-devel] " Marcelo Tosatti
2010-10-06 17:34   ` [patch uq/master 4/8] kvm: x86: add mce support Marcelo Tosatti
2010-10-06 17:34     ` [Qemu-devel] " Marcelo Tosatti
2010-10-06 19:32     ` Anthony Liguori
2010-10-06 19:32       ` [Qemu-devel] " Anthony Liguori
2010-10-06 17:34   ` [patch uq/master 5/8] Export qemu_ram_addr_from_host Marcelo Tosatti
2010-10-06 17:34     ` [Qemu-devel] " Marcelo Tosatti
2010-10-06 17:34   ` [patch uq/master 6/8] Add RAM -> physical addr mapping in MCE simulation Marcelo Tosatti
2010-10-06 17:34     ` [Qemu-devel] " Marcelo Tosatti
2010-10-06 17:34   ` [patch uq/master 7/8] MCE: Relay UCR MCE to guest Marcelo Tosatti
2010-10-06 17:34     ` [Qemu-devel] " Marcelo Tosatti
2010-10-06 17:34   ` [patch uq/master 8/8] Add savevm/loadvm support for MCE Marcelo Tosatti
2010-10-06 17:34     ` [Qemu-devel] " Marcelo Tosatti
2010-10-11 18:31   ` [patch 0/8] port qemu-kvm's MCE support (v3) Marcelo Tosatti
2010-10-11 18:31     ` [Qemu-devel] " Marcelo Tosatti
2010-10-11 18:31     ` [patch 1/8] signalfd compatibility Marcelo Tosatti
2010-10-11 18:31       ` [Qemu-devel] " Marcelo Tosatti
2010-10-11 18:31     ` [patch 2/8] iothread: use signalfd Marcelo Tosatti
2010-10-11 18:31       ` [Qemu-devel] " Marcelo Tosatti
2010-10-11 18:31     ` [patch 3/8] Expose thread_id in info cpus Marcelo Tosatti
2010-10-11 18:31       ` [Qemu-devel] " Marcelo Tosatti
2010-10-11 18:31     ` [patch 4/8] kvm: x86: add mce support Marcelo Tosatti
2010-10-11 18:31       ` [Qemu-devel] " Marcelo Tosatti
2010-10-11 18:31     ` [patch 5/8] Export qemu_ram_addr_from_host Marcelo Tosatti
2010-10-11 18:31       ` [Qemu-devel] " Marcelo Tosatti
2010-10-11 18:31     ` [patch 6/8] Add RAM -> physical addr mapping in MCE simulation Marcelo Tosatti
2010-10-11 18:31       ` [Qemu-devel] " Marcelo Tosatti
2010-10-11 18:31     ` [patch 7/8] MCE: Relay UCR MCE to guest Marcelo Tosatti
2010-10-11 18:31       ` [Qemu-devel] " Marcelo Tosatti
2010-10-11 18:31     ` [patch 8/8] Add savevm/loadvm support for MCE Marcelo Tosatti
2010-10-11 18:31       ` [Qemu-devel] " Marcelo Tosatti
2010-10-14 10:25     ` [patch 0/8] port qemu-kvm's MCE support (v3) Avi Kivity
2010-10-14 10:25       ` [Qemu-devel] " Avi Kivity
2010-10-14 16:21       ` Marcelo Tosatti
2010-10-14 16:21         ` [Qemu-devel] " Marcelo Tosatti
2010-10-17  9:32     ` [patch 0/8] port qemu-kvm's MCE support (v3 resend) Avi Kivity
2010-10-17  9:32       ` [Qemu-devel] " Avi Kivity

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.