* [PATCH v2 1/8] kvm tools: Use correct value for user signal base
@ 2011-05-30  8:30 Sasha Levin
  2011-05-30  8:30 ` [PATCH v2 2/8] kvm tools: Remove wrong global definition of kvm Sasha Levin
                   ` (7 more replies)
  0 siblings, 8 replies; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  8:30 UTC (permalink / raw)
  To: penberg; +Cc: kvm, mingo, asias.hejun, gorcunov, prasadjoshi124, Sasha Levin

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 tools/kvm/include/kvm/kvm.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h
index f951f2d..6a17362 100644
--- a/tools/kvm/include/kvm/kvm.h
+++ b/tools/kvm/include/kvm/kvm.h
@@ -11,7 +11,7 @@
 #define KVM_32BIT_GAP_SIZE	(512 << 20)
 #define KVM_32BIT_GAP_START	((1ULL << 32) - KVM_32BIT_GAP_SIZE)
 
-#define SIGKVMEXIT		(SIGUSR1 + 2)
+#define SIGKVMEXIT		(SIGRTMIN + 0)
 
 struct kvm {
 	int			sys_fd;		/* For system ioctls(), i.e. /dev/kvm */
-- 
1.7.5.rc3


* [PATCH v2 2/8] kvm tools: Remove wrong global definition of kvm
  2011-05-30  8:30 [PATCH v2 1/8] kvm tools: Use correct value for user signal base Sasha Levin
@ 2011-05-30  8:30 ` Sasha Levin
  2011-05-30  8:38   ` Ingo Molnar
  2011-05-30  8:30 ` [PATCH v2 3/8] kvm tools: Allow pausing guests Sasha Levin
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  8:30 UTC (permalink / raw)
  To: penberg; +Cc: kvm, mingo, asias.hejun, gorcunov, prasadjoshi124, Sasha Levin

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 tools/kvm/include/kvm/mptable.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/tools/kvm/include/kvm/mptable.h b/tools/kvm/include/kvm/mptable.h
index 3c8362d..8557ae8 100644
--- a/tools/kvm/include/kvm/mptable.h
+++ b/tools/kvm/include/kvm/mptable.h
@@ -1,7 +1,7 @@
 #ifndef KVM_MPTABLE_H_
 #define KVM_MPTABLE_H_
 
-struct kvm kvm;
+struct kvm;
 
 void mptable_setup(struct kvm *kvm, unsigned int ncpus);
 
-- 
1.7.5.rc3


* [PATCH v2 3/8] kvm tools: Allow pausing guests
  2011-05-30  8:30 [PATCH v2 1/8] kvm tools: Use correct value for user signal base Sasha Levin
  2011-05-30  8:30 ` [PATCH v2 2/8] kvm tools: Remove wrong global definition of kvm Sasha Levin
@ 2011-05-30  8:30 ` Sasha Levin
  2011-05-30  8:39   ` Ingo Molnar
  2011-05-30  8:30 ` [PATCH v2 4/8] kvm tools: Pause/resume guest using SIGUSR2 Sasha Levin
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  8:30 UTC (permalink / raw)
  To: penberg; +Cc: kvm, mingo, asias.hejun, gorcunov, prasadjoshi124, Sasha Levin

Allow pausing and unpausing guests running on the host.
Pausing a guest means that none of its VCPU threads run
KVM_RUN until the guest is unpaused.
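
For illustration, the new entry points are meant to be used in
pause/resume pairs from a non-VCPU thread - a sketch:

	kvm__pause();
	/* no VCPU thread is inside KVM_RUN at this point */
	/* ... safely update state that the VCPUs read ... */
	kvm__continue();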

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 tools/kvm/include/kvm/kvm-cpu.h |    1 +
 tools/kvm/include/kvm/kvm.h     |    4 +++
 tools/kvm/kvm-cpu.c             |   20 +++++++++++---
 tools/kvm/kvm-run.c             |    4 +-
 tools/kvm/kvm.c                 |   54 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/tools/kvm/include/kvm/kvm-cpu.h b/tools/kvm/include/kvm/kvm-cpu.h
index b2b6fce..4d99246 100644
--- a/tools/kvm/include/kvm/kvm-cpu.h
+++ b/tools/kvm/include/kvm/kvm-cpu.h
@@ -23,6 +23,7 @@ struct kvm_cpu {
 	struct kvm_msrs		*msrs;		/* dynamically allocated */
 
 	u8			is_running;
+	u8			paused;
 };
 
 struct kvm_cpu *kvm_cpu__init(struct kvm *kvm, unsigned long cpu_id);
diff --git a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h
index 6a17362..d22a849 100644
--- a/tools/kvm/include/kvm/kvm.h
+++ b/tools/kvm/include/kvm/kvm.h
@@ -12,6 +12,7 @@
 #define KVM_32BIT_GAP_START	((1ULL << 32) - KVM_32BIT_GAP_SIZE)
 
 #define SIGKVMEXIT		(SIGRTMIN + 0)
+#define SIGKVMPAUSE		(SIGRTMIN + 1)
 
 struct kvm {
 	int			sys_fd;		/* For system ioctls(), i.e. /dev/kvm */
@@ -50,6 +51,9 @@ bool kvm__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int s
 bool kvm__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write);
 bool kvm__register_mmio(u64 phys_addr, u64 phys_addr_len, void (*kvm_mmio_callback_fn)(u64 addr, u8 *data, u32 len, u8 is_write));
 bool kvm__deregister_mmio(u64 phys_addr);
+void kvm__pause(void);
+void kvm__continue(void);
+void kvm__notify_paused(void);
 
 /*
  * Debugging
diff --git a/tools/kvm/kvm-cpu.c b/tools/kvm/kvm-cpu.c
index de0591f..be0528b 100644
--- a/tools/kvm/kvm-cpu.c
+++ b/tools/kvm/kvm-cpu.c
@@ -383,11 +383,15 @@ void kvm_cpu__run(struct kvm_cpu *vcpu)
 		die_perror("KVM_RUN failed");
 }
 
-static void kvm_cpu_exit_handler(int signum)
+static void kvm_cpu_signal_handler(int signum)
 {
-	if (current_kvm_cpu->is_running) {
-		current_kvm_cpu->is_running = false;
-		pthread_kill(pthread_self(), SIGKVMEXIT);
+	if (signum == SIGKVMEXIT) {
+		if (current_kvm_cpu->is_running) {
+			current_kvm_cpu->is_running = false;
+			pthread_kill(pthread_self(), SIGKVMEXIT);
+		}
+	} else if (signum == SIGKVMPAUSE) {
+		current_kvm_cpu->paused = 1;
 	}
 }
 
@@ -400,12 +404,18 @@ int kvm_cpu__start(struct kvm_cpu *cpu)
 
 	pthread_sigmask(SIG_BLOCK, &sigset, NULL);
 
-	signal(SIGKVMEXIT, kvm_cpu_exit_handler);
+	signal(SIGKVMEXIT, kvm_cpu_signal_handler);
+	signal(SIGKVMPAUSE, kvm_cpu_signal_handler);
 
 	kvm_cpu__setup_cpuid(cpu);
 	kvm_cpu__reset_vcpu(cpu);
 
 	for (;;) {
+		if (cpu->paused) {
+			kvm__notify_paused();
+			cpu->paused = 0;
+		}
+
 		kvm_cpu__run(cpu);
 
 		switch (cpu->kvm_run->exit_reason) {
diff --git a/tools/kvm/kvm-run.c b/tools/kvm/kvm-run.c
index 48b8e70..761ac0d 100644
--- a/tools/kvm/kvm-run.c
+++ b/tools/kvm/kvm-run.c
@@ -47,8 +47,8 @@
 #define MIN_RAM_SIZE_MB		(64ULL)
 #define MIN_RAM_SIZE_BYTE	(MIN_RAM_SIZE_MB << MB_SHIFT)
 
-static struct kvm *kvm;
-static struct kvm_cpu *kvm_cpus[KVM_NR_CPUS];
+struct kvm *kvm;
+struct kvm_cpu *kvm_cpus[KVM_NR_CPUS];
 __thread struct kvm_cpu *current_kvm_cpu;
 
 static u64 ram_size;
diff --git a/tools/kvm/kvm.c b/tools/kvm/kvm.c
index 1d756e0..54e3203 100644
--- a/tools/kvm/kvm.c
+++ b/tools/kvm/kvm.c
@@ -6,6 +6,8 @@
 #include "kvm/interrupt.h"
 #include "kvm/mptable.h"
 #include "kvm/util.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm-cpu.h"
 
 #include <linux/kvm.h>
 
@@ -25,6 +27,7 @@
 #include <stdio.h>
 #include <fcntl.h>
 #include <time.h>
+#include <sys/eventfd.h>
 
 #define DEFINE_KVM_EXIT_REASON(reason) [reason] = #reason
 
@@ -68,6 +71,11 @@ struct {
 	{ DEFINE_KVM_EXT(KVM_CAP_EXT_CPUID) },
 };
 
+extern struct kvm *kvm;
+extern struct kvm_cpu *kvm_cpus[KVM_NR_CPUS];
+static int pause_event;
+static DEFINE_MUTEX(pause_lock);
+
 static bool kvm__supports_extension(struct kvm *kvm, unsigned int extension)
 {
 	int ret;
@@ -575,3 +583,49 @@ void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size)
 				  p[n + 4], p[n + 5], p[n + 6], p[n + 7]);
 	}
 }
+
+void kvm__pause(void)
+{
+	int i, paused_vcpus = 0;
+
+	/* Check if the guest is running */
+	if (!kvm_cpus[0] || kvm_cpus[0]->thread == 0)
+		return;
+
+	mutex_lock(&pause_lock);
+
+	pause_event = eventfd(0, 0);
+	if (pause_event < 0)
+		die("Failed creating pause notification event");
+	for (i = 0; i < kvm->nrcpus; i++)
+		pthread_kill(kvm_cpus[i]->thread, SIGKVMPAUSE);
+
+	while (paused_vcpus < kvm->nrcpus) {
+		u64 cur_read;
+
+		if (read(pause_event, &cur_read, sizeof(cur_read)) < 0)
+			die("Failed reading pause event");
+		paused_vcpus += cur_read;
+	}
+	close(pause_event);
+}
+
+void kvm__continue(void)
+{
+	/* Check if the guest is running */
+	if (!kvm_cpus[0] || kvm_cpus[0]->thread == 0)
+		return;
+
+	mutex_unlock(&pause_lock);
+}
+
+void kvm__notify_paused(void)
+{
+	u64 p = 1;
+
+	if (write(pause_event, &p, sizeof(p)) < 0)
+		die("Failed notifying of paused VCPU.");
+
+	mutex_lock(&pause_lock);
+	mutex_unlock(&pause_lock);
+}
-- 
1.7.5.rc3


* [PATCH v2 4/8] kvm tools: Pause/resume guest using SIGUSR2
  2011-05-30  8:30 [PATCH v2 1/8] kvm tools: Use correct value for user signal base Sasha Levin
  2011-05-30  8:30 ` [PATCH v2 2/8] kvm tools: Remove wrong global definition of kvm Sasha Levin
  2011-05-30  8:30 ` [PATCH v2 3/8] kvm tools: Allow pausing guests Sasha Levin
@ 2011-05-30  8:30 ` Sasha Levin
  2011-05-30  8:41   ` Ingo Molnar
  2011-05-30  8:30 ` [PATCH v2 5/8] kvm tools: Add a brlock Sasha Levin
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  8:30 UTC (permalink / raw)
  To: penberg; +Cc: kvm, mingo, asias.hejun, gorcunov, prasadjoshi124, Sasha Levin

Make SIGUSR2 pause/resume a guest; this makes it easy to test
pausing a guest.
It can be tested from the command line with 'kill -USR2 $(pidof kvm)'.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 tools/kvm/kvm-run.c |   15 +++++++++++++++
 1 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/tools/kvm/kvm-run.c b/tools/kvm/kvm-run.c
index 761ac0d..49cc0aa 100644
--- a/tools/kvm/kvm-run.c
+++ b/tools/kvm/kvm-run.c
@@ -165,6 +165,20 @@ static void handle_sigusr1(int sig)
 	mb();
 }
 
+/* Pause/resume the guest using SIGUSR2 */
+static int is_paused;
+
+static void handle_sigusr2(int sig)
+{
+	if (is_paused)
+		kvm__continue();
+	else
+		kvm__pause();
+
+	is_paused = !is_paused;
+	pr_info("Guest %s\n", is_paused?"paused":"resumed");
+}
+
 static void handle_sigquit(int sig)
 {
 	int i;
@@ -422,6 +436,7 @@ int kvm_cmd_run(int argc, const char **argv, const char *prefix)
 	signal(SIGALRM, handle_sigalrm);
 	signal(SIGQUIT, handle_sigquit);
 	signal(SIGUSR1, handle_sigusr1);
+	signal(SIGUSR2, handle_sigusr2);
 
 	nr_online_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 
-- 
1.7.5.rc3


* [PATCH v2 5/8] kvm tools: Add a brlock
  2011-05-30  8:30 [PATCH v2 1/8] kvm tools: Use correct value for user signal base Sasha Levin
                   ` (2 preceding siblings ...)
  2011-05-30  8:30 ` [PATCH v2 4/8] kvm tools: Pause/resume guest using SIGUSR2 Sasha Levin
@ 2011-05-30  8:30 ` Sasha Levin
  2011-05-30  8:30 ` [PATCH v2 6/8] kvm tools: Add rwlock wrapper Sasha Levin
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  8:30 UTC (permalink / raw)
  To: penberg; +Cc: kvm, mingo, asias.hejun, gorcunov, prasadjoshi124, Sasha Levin

brlock is a lock which is very cheap for reads, but very expensive
for writes.
This lock will be used when updates are very rare and reads are
common.
This lock is currently implemented by stopping the guest while
performing the updates. We assume that the only threads which
read from the locked data are VCPU threads, and that the writer
is never a VCPU thread.
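
For illustration, intended usage looks like this (a sketch; the
protected data is hypothetical):

	/* VCPU thread, hot path - the read side is nearly free */
	br_read_lock();
	/* ... look up and use the shared data ... */
	br_read_unlock();

	/* non-VCPU thread, rare update - stops the guest */
	br_write_lock();
	/* ... modify the shared data ... */
	br_write_unlock();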

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 tools/kvm/include/kvm/brlock.h |   25 +++++++++++++++++++++++++
 1 files changed, 25 insertions(+), 0 deletions(-)
 create mode 100644 tools/kvm/include/kvm/brlock.h

diff --git a/tools/kvm/include/kvm/brlock.h b/tools/kvm/include/kvm/brlock.h
new file mode 100644
index 0000000..1863166
--- /dev/null
+++ b/tools/kvm/include/kvm/brlock.h
@@ -0,0 +1,25 @@
+#ifndef KVM__BRLOCK_H
+#define KVM__BRLOCK_H
+
+#include "kvm/kvm.h"
+#include "kvm/barrier.h"
+
+/*
+ * brlock is a lock which is very cheap for reads, but very expensive
+ * for writes.
+ * This lock will be used when updates are very rare and reads are common.
+ * This lock is currently implemented by stopping the guest while
+ * performing the updates. We assume that the only threads which read from
+ * the locked data are VCPU threads, and that the writer is never a VCPU thread.
+ */
+
+#ifndef barrier
+#define barrier()		__asm__ __volatile__("": : :"memory")
+#endif
+
+#define br_read_lock()		barrier()
+#define br_read_unlock()	barrier()
+
+#define br_write_lock()		kvm__pause()
+#define br_write_unlock()	kvm__continue()
+#endif
-- 
1.7.5.rc3


* [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30  8:30 [PATCH v2 1/8] kvm tools: Use correct value for user signal base Sasha Levin
                   ` (3 preceding siblings ...)
  2011-05-30  8:30 ` [PATCH v2 5/8] kvm tools: Add a brlock Sasha Levin
@ 2011-05-30  8:30 ` Sasha Levin
  2011-05-30  8:43   ` Ingo Molnar
  2011-05-30  8:30 ` [PATCH v2 7/8] kvm tools: Add debug mode to brlock Sasha Levin
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  8:30 UTC (permalink / raw)
  To: penberg; +Cc: kvm, mingo, asias.hejun, gorcunov, prasadjoshi124, Sasha Levin

Adds an rwlock wrapper which, like the mutex wrapper, makes rwlock
calls similar to their kernel counterparts.
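
For illustration, usage mirrors the kernel API (a sketch; 'demo_sem'
and the critical sections are made up):

	static DECLARE_RWSEM(demo_sem);

	static void demo_reader(void)
	{
		down_read(&demo_sem);
		/* ... read the shared state ... */
		up_read(&demo_sem);
	}

	static void demo_writer(void)
	{
		down_write(&demo_sem);
		/* ... update the shared state ... */
		up_write(&demo_sem);
	}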

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 tools/kvm/include/kvm/rwsem.h |   39 +++++++++++++++++++++++++++++++++++++++
 1 files changed, 39 insertions(+), 0 deletions(-)
 create mode 100644 tools/kvm/include/kvm/rwsem.h

diff --git a/tools/kvm/include/kvm/rwsem.h b/tools/kvm/include/kvm/rwsem.h
new file mode 100644
index 0000000..75a22f8
--- /dev/null
+++ b/tools/kvm/include/kvm/rwsem.h
@@ -0,0 +1,39 @@
+#ifndef KVM__RWSEM_H
+#define KVM__RWSEM_H
+
+#include <pthread.h>
+
+#include "kvm/util.h"
+
+/*
+ * Kernel-alike rwsem API - to make it easier for kernel developers
+ * to write user-space code! :-)
+ */
+
+#define DECLARE_RWSEM(sem) pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER
+
+static inline void down_read(pthread_rwlock_t *rwsem)
+{
+	if (pthread_rwlock_rdlock(rwsem) != 0)
+		die("unexpected pthread_rwlock_rdlock() failure!");
+}
+
+static inline void down_write(pthread_rwlock_t *rwsem)
+{
+	if (pthread_rwlock_wrlock(rwsem) != 0)
+		die("unexpected pthread_rwlock_wrlock() failure!");
+}
+
+static inline void up_read(pthread_rwlock_t *rwsem)
+{
+	if (pthread_rwlock_unlock(rwsem) != 0)
+		die("unexpected pthread_rwlock_unlock() failure!");
+}
+
+static inline void up_write(pthread_rwlock_t *rwsem)
+{
+	if (pthread_rwlock_unlock(rwsem) != 0)
+		die("unexpected pthread_rwlock_unlock() failure!");
+}
+
+#endif /* KVM__RWSEM_H */
-- 
1.7.5.rc3


* [PATCH v2 7/8] kvm tools: Add debug mode to brlock
  2011-05-30  8:30 [PATCH v2 1/8] kvm tools: Use correct value for user signal base Sasha Levin
                   ` (4 preceding siblings ...)
  2011-05-30  8:30 ` [PATCH v2 6/8] kvm tools: Add rwlock wrapper Sasha Levin
@ 2011-05-30  8:30 ` Sasha Levin
  2011-05-30  8:30 ` [PATCH v2 8/8] kvm tools: Use brlock in MMIO and IOPORT Sasha Levin
  2011-05-30  8:35 ` [PATCH v2 1/8] kvm tools: Use correct value for user signal base Ingo Molnar
  7 siblings, 0 replies; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  8:30 UTC (permalink / raw)
  To: penberg; +Cc: kvm, mingo, asias.hejun, gorcunov, prasadjoshi124, Sasha Levin

Adds a debug mode which allows switching the brlock into
a big rwlock.
This can be used to verify we don't end up with a BKL kind
of lock with the current brlock implementation.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 tools/kvm/include/kvm/brlock.h |   16 ++++++++++++++++
 1 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/tools/kvm/include/kvm/brlock.h b/tools/kvm/include/kvm/brlock.h
index 1863166..4cc0521 100644
--- a/tools/kvm/include/kvm/brlock.h
+++ b/tools/kvm/include/kvm/brlock.h
@@ -17,9 +17,25 @@
 #define barrier()		__asm__ __volatile__("": : :"memory")
 #endif
 
+#ifdef KVM_BRLOCK_DEBUG
+
+#include "kvm/rwsem.h"
+
+DECLARE_RWSEM(brlock_sem);
+
+#define br_read_lock()		down_read(&brlock_sem);
+#define br_read_unlock()	up_read(&brlock_sem);
+
+#define br_write_lock()		down_write(&brlock_sem);
+#define br_write_unlock()	up_write(&brlock_sem);
+
+#else
+
 #define br_read_lock()		barrier()
 #define br_read_unlock()	barrier()
 
 #define br_write_lock()		kvm__pause()
 #define br_write_unlock()	kvm__continue()
 #endif
+
+#endif
-- 
1.7.5.rc3


* [PATCH v2 8/8] kvm tools: Use brlock in MMIO and IOPORT
  2011-05-30  8:30 [PATCH v2 1/8] kvm tools: Use correct value for user signal base Sasha Levin
                   ` (5 preceding siblings ...)
  2011-05-30  8:30 ` [PATCH v2 7/8] kvm tools: Add debug mode to brlock Sasha Levin
@ 2011-05-30  8:30 ` Sasha Levin
  2011-05-30  8:47   ` Ingo Molnar
  2011-05-30  8:35 ` [PATCH v2 1/8] kvm tools: Use correct value for user signal base Ingo Molnar
  7 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  8:30 UTC (permalink / raw)
  To: penberg; +Cc: kvm, mingo, asias.hejun, gorcunov, prasadjoshi124, Sasha Levin

Use brlock to protect mmio and ioport modules and make them
update-safe.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 tools/kvm/ioport.c |   10 +++++++++-
 tools/kvm/mmio.c   |   17 +++++++++++++++--
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/tools/kvm/ioport.c b/tools/kvm/ioport.c
index d0a1aa8..e00fb59 100644
--- a/tools/kvm/ioport.c
+++ b/tools/kvm/ioport.c
@@ -2,7 +2,7 @@
 
 #include "kvm/kvm.h"
 #include "kvm/util.h"
-
+#include "kvm/brlock.h"
 #include "kvm/rbtree-interval.h"
 #include "kvm/mutex.h"
 
@@ -84,6 +84,7 @@ u16 ioport__register(u16 port, struct ioport_operations *ops, int count, void *p
 {
 	struct ioport *entry;
 
+	br_write_lock();
 	if (port == IOPORT_EMPTY)
 		port = ioport__find_free_port();
 
@@ -105,6 +106,8 @@ u16 ioport__register(u16 port, struct ioport_operations *ops, int count, void *p
 
 	ioport_insert(&ioport_tree, entry);
 
+	br_write_unlock();
+
 	return port;
 }
 
@@ -127,6 +130,7 @@ bool kvm__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int s
 	bool ret = false;
 	struct ioport *entry;
 
+	br_read_lock();
 	entry = ioport_search(&ioport_tree, port);
 	if (!entry)
 		goto error;
@@ -141,11 +145,15 @@ bool kvm__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int s
 			ret = ops->io_out(entry, kvm, port, data, size, count);
 	}
 
+	br_read_unlock();
+
 	if (!ret)
 		goto error;
 
 	return true;
 error:
+	br_read_unlock();
+
 	if (ioport_debug)
 		ioport_error(port, data, direction, size, count);
 
diff --git a/tools/kvm/mmio.c b/tools/kvm/mmio.c
index ef986bf..a57abeb 100644
--- a/tools/kvm/mmio.c
+++ b/tools/kvm/mmio.c
@@ -1,5 +1,6 @@
 #include "kvm/kvm.h"
 #include "kvm/rbtree-interval.h"
+#include "kvm/brlock.h"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -55,41 +56,53 @@ static const char *to_direction(u8 is_write)
 bool kvm__register_mmio(u64 phys_addr, u64 phys_addr_len, void (*kvm_mmio_callback_fn)(u64 addr, u8 *data, u32 len, u8 is_write))
 {
 	struct mmio_mapping *mmio;
+	int ret;
 
 	mmio = malloc(sizeof(*mmio));
 	if (mmio == NULL)
 		return false;
 
+	br_write_lock();
 	*mmio = (struct mmio_mapping) {
 		.node = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
 		.kvm_mmio_callback_fn = kvm_mmio_callback_fn,
 	};
 
-	return mmio_insert(&mmio_tree, mmio);
+	ret = mmio_insert(&mmio_tree, mmio);
+	br_write_unlock();
+
+	return ret;
 }
 
 bool kvm__deregister_mmio(u64 phys_addr)
 {
 	struct mmio_mapping *mmio;
 
+	br_write_lock();
 	mmio = mmio_search_single(&mmio_tree, phys_addr);
 	if (mmio == NULL)
 		return false;
 
 	rb_int_erase(&mmio_tree, &mmio->node);
+	br_write_unlock();
+
 	free(mmio);
 	return true;
 }
 
 bool kvm__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write)
 {
-	struct mmio_mapping *mmio = mmio_search(&mmio_tree, phys_addr, len);
+	struct mmio_mapping *mmio;
+
+	br_read_lock();
+	mmio = mmio_search(&mmio_tree, phys_addr, len);
 
 	if (mmio)
 		mmio->kvm_mmio_callback_fn(phys_addr, data, len, is_write);
 	else
 		fprintf(stderr, "Warning: Ignoring MMIO %s at %016llx (length %u)\n",
 			to_direction(is_write), phys_addr, len);
+	br_read_unlock();
 
 	return true;
 }
-- 
1.7.5.rc3


* Re: [PATCH v2 1/8] kvm tools: Use correct value for user signal base
  2011-05-30  8:30 [PATCH v2 1/8] kvm tools: Use correct value for user signal base Sasha Levin
                   ` (6 preceding siblings ...)
  2011-05-30  8:30 ` [PATCH v2 8/8] kvm tools: Use brlock in MMIO and IOPORT Sasha Levin
@ 2011-05-30  8:35 ` Ingo Molnar
  2011-05-30  8:40   ` Sasha Levin
  7 siblings, 1 reply; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30  8:35 UTC (permalink / raw)
  To: Sasha Levin; +Cc: penberg, kvm, asias.hejun, gorcunov, prasadjoshi124


* Sasha Levin <levinsasha928@gmail.com> wrote:

> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> ---
>  tools/kvm/include/kvm/kvm.h |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h
> index f951f2d..6a17362 100644
> --- a/tools/kvm/include/kvm/kvm.h
> +++ b/tools/kvm/include/kvm/kvm.h
> @@ -11,7 +11,7 @@
>  #define KVM_32BIT_GAP_SIZE	(512 << 20)
>  #define KVM_32BIT_GAP_START	((1ULL << 32) - KVM_32BIT_GAP_SIZE)
>  
> -#define SIGKVMEXIT		(SIGUSR1 + 2)
> +#define SIGKVMEXIT		(SIGRTMIN + 0)

I think the changelog is missing essential information like what the 
effects of this bug were in practice. ('none' is a good answer as 
well.)

Thanks,

	Ingo

* Re: [PATCH v2 2/8] kvm tools: Remove wrong global definition of kvm
  2011-05-30  8:30 ` [PATCH v2 2/8] kvm tools: Remove wrong global definition of kvm Sasha Levin
@ 2011-05-30  8:38   ` Ingo Molnar
  2011-05-30  8:59     ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30  8:38 UTC (permalink / raw)
  To: Sasha Levin; +Cc: penberg, kvm, asias.hejun, gorcunov, prasadjoshi124


* Sasha Levin <levinsasha928@gmail.com> wrote:

> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> ---
>  tools/kvm/include/kvm/mptable.h |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/tools/kvm/include/kvm/mptable.h b/tools/kvm/include/kvm/mptable.h
> index 3c8362d..8557ae8 100644
> --- a/tools/kvm/include/kvm/mptable.h
> +++ b/tools/kvm/include/kvm/mptable.h
> @@ -1,7 +1,7 @@
>  #ifndef KVM_MPTABLE_H_
>  #define KVM_MPTABLE_H_
>  
> -struct kvm kvm;
> +struct kvm;

heh, that was a funny typo.

Btw., could we do a 'make test' run that includes every header file 
in a standalone .c file and checks whether that builds without 
warnings/errors? They are enumerated in the Makefile already.

That would check tools/kvm/ headers for sanity, type encapsulation and
self-sufficiency and would prevent the kind of header-dependencies
spaghetti mess the kernel got into after ~20 years of randomly added 
kernel dependencies.

Thanks,

	Ingo

* Re: [PATCH v2 3/8] kvm tools: Allow pausing guests
  2011-05-30  8:30 ` [PATCH v2 3/8] kvm tools: Allow pausing guests Sasha Levin
@ 2011-05-30  8:39   ` Ingo Molnar
  0 siblings, 0 replies; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30  8:39 UTC (permalink / raw)
  To: Sasha Levin; +Cc: penberg, kvm, asias.hejun, gorcunov, prasadjoshi124


* Sasha Levin <levinsasha928@gmail.com> wrote:

> Allow pausing and unpausing guests running on the host.
> Pausing a guest means that none of its VCPU threads run
> KVM_RUN until the guest is unpaused.

When adding new APIs please mention them in the changelog. Just 
listing them is enough:

 void kvm__pause(void);
 void kvm__continue(void);
 void kvm__notify_paused(void);

Also, standardizing on the 'Add APIs to ...' prefix for API additions 
would make the shortlog more obvious as well.

Thanks,

	Ingo

* Re: [PATCH v2 1/8] kvm tools: Use correct value for user signal base
  2011-05-30  8:35 ` [PATCH v2 1/8] kvm tools: Use correct value for user signal base Ingo Molnar
@ 2011-05-30  8:40   ` Sasha Levin
  2011-05-30  8:49     ` Ingo Molnar
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  8:40 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: penberg, kvm, asias.hejun, gorcunov, prasadjoshi124

On Mon, 2011-05-30 at 10:35 +0200, Ingo Molnar wrote:
> * Sasha Levin <levinsasha928@gmail.com> wrote:
> 
> > Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> > ---
> >  tools/kvm/include/kvm/kvm.h |    2 +-
> >  1 files changed, 1 insertions(+), 1 deletions(-)
> > 
> > diff --git a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h
> > index f951f2d..6a17362 100644
> > --- a/tools/kvm/include/kvm/kvm.h
> > +++ b/tools/kvm/include/kvm/kvm.h
> > @@ -11,7 +11,7 @@
> >  #define KVM_32BIT_GAP_SIZE	(512 << 20)
> >  #define KVM_32BIT_GAP_START	((1ULL << 32) - KVM_32BIT_GAP_SIZE)
> >  
> > -#define SIGKVMEXIT		(SIGUSR1 + 2)
> > +#define SIGKVMEXIT		(SIGRTMIN + 0)
> 
> I think the changelog is missing essential information like what the 
> effects of this bug were in practice. ('none' is a good answer as 
> well.)

None - it made us exit when receiving SIGUSR2, which wasn't in use at
that time (now it's used to pause the guest - that's how I found it).
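
For reference, the collision is easy to see from the signal numbers.
A minimal standalone program (a sketch assuming x86 Linux with glibc,
where SIGUSR1 is 10 and SIGUSR2 is 12):

	#include <signal.h>
	#include <stdio.h>

	int main(void)
	{
		/* SIGUSR1 + 2 == SIGUSR2, so the old SIGKVMEXIT aliased
		 * SIGUSR2; SIGRTMIN lies outside the standard signals. */
		printf("SIGUSR1 + 2 = %d, SIGUSR2 = %d, SIGRTMIN = %d\n",
		       SIGUSR1 + 2, SIGUSR2, SIGRTMIN);
		return 0;
	}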

-- 

Sasha.


* Re: [PATCH v2 4/8] kvm tools: Pause/resume guest using SIGUSR2
  2011-05-30  8:30 ` [PATCH v2 4/8] kvm tools: Pause/resume guest using SIGUSR2 Sasha Levin
@ 2011-05-30  8:41   ` Ingo Molnar
  0 siblings, 0 replies; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30  8:41 UTC (permalink / raw)
  To: Sasha Levin; +Cc: penberg, kvm, asias.hejun, gorcunov, prasadjoshi124


* Sasha Levin <levinsasha928@gmail.com> wrote:

> Make SIGUSR2 pause/resume a guest; this makes it easy to test
> pausing a guest.
> It can be tested from the command line with 'kill -USR2 $(pidof kvm)'.

Please add locking tests to 'make test' as well (as i think Pekka also
suggested).

Developers will otherwise readily forget how to test locking 
correctness.

Thanks,

	Ingo

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30  8:30 ` [PATCH v2 6/8] kvm tools: Add rwlock wrapper Sasha Levin
@ 2011-05-30  8:43   ` Ingo Molnar
  2011-05-30  9:29     ` Pekka Enberg
  0 siblings, 1 reply; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30  8:43 UTC (permalink / raw)
  To: Sasha Levin; +Cc: penberg, kvm, asias.hejun, gorcunov, prasadjoshi124


* Sasha Levin <levinsasha928@gmail.com> wrote:

> Adds an rwlock wrapper which, like the mutex wrapper, makes rwlock
> calls similar to their kernel counterparts.

Testing idea: for example 'make test locking' could do the bzImage 
test-bootup, with rwlocks built instead of the brlocks?

Pekka might have more ideas about how a good locking test-suite 
should be done, as JATO has one, right?

Thanks,

	Ingo

* Re: [PATCH v2 8/8] kvm tools: Use brlock in MMIO and IOPORT
  2011-05-30  8:30 ` [PATCH v2 8/8] kvm tools: Use brlock in MMIO and IOPORT Sasha Levin
@ 2011-05-30  8:47   ` Ingo Molnar
  2011-05-30  8:56     ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30  8:47 UTC (permalink / raw)
  To: Sasha Levin; +Cc: penberg, kvm, asias.hejun, gorcunov, prasadjoshi124


* Sasha Levin <levinsasha928@gmail.com> wrote:

> @@ -55,41 +56,53 @@ static const char *to_direction(u8 is_write)
>  bool kvm__register_mmio(u64 phys_addr, u64 phys_addr_len, void (*kvm_mmio_callback_fn)(u64 addr, u8 *data, u32 len, u8 is_write))
>  {
>  	struct mmio_mapping *mmio;
> +	int ret;
>  
>  	mmio = malloc(sizeof(*mmio));
>  	if (mmio == NULL)
>  		return false;
>  
> +	br_write_lock();
>  	*mmio = (struct mmio_mapping) {
>  		.node = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
>  		.kvm_mmio_callback_fn = kvm_mmio_callback_fn,
>  	};

The initialization here does not use any global state AFAICS so it 
does not need the write lock, right?

>  
> -	return mmio_insert(&mmio_tree, mmio);
> +	ret = mmio_insert(&mmio_tree, mmio);
> +	br_write_unlock();

Shouldn't mmio_insert() thus have the write_lock()/unlock() sequence?

>  bool kvm__deregister_mmio(u64 phys_addr)
>  {
>  	struct mmio_mapping *mmio;
>  
> +	br_write_lock();
>  	mmio = mmio_search_single(&mmio_tree, phys_addr);
>  	if (mmio == NULL)
>  		return false;

Here we leak the write lock!
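
A balanced version would drop the lock on the error path too - a
sketch:

	br_write_lock();
	mmio = mmio_search_single(&mmio_tree, phys_addr);
	if (mmio == NULL) {
		br_write_unlock();
		return false;
	}

	rb_int_erase(&mmio_tree, &mmio->node);
	br_write_unlock();

	free(mmio);
	return true;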

>  bool kvm__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write)
>  {
> -	struct mmio_mapping *mmio = mmio_search(&mmio_tree, phys_addr, len);
> +	struct mmio_mapping *mmio;
> +
> +	br_read_lock();
> +	mmio = mmio_search(&mmio_tree, phys_addr, len);
>  
>  	if (mmio)
>  		mmio->kvm_mmio_callback_fn(phys_addr, data, len, is_write);
>  	else
>  		fprintf(stderr, "Warning: Ignoring MMIO %s at %016llx (length %u)\n",
>  			to_direction(is_write), phys_addr, len);
> +	br_read_unlock();
>  
>  	return true;
>  }

Yummie, scalability here we come! :-)

Thanks,

	Ingo

* Re: [PATCH v2 1/8] kvm tools: Use correct value for user signal base
  2011-05-30  8:40   ` Sasha Levin
@ 2011-05-30  8:49     ` Ingo Molnar
  0 siblings, 0 replies; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30  8:49 UTC (permalink / raw)
  To: Sasha Levin; +Cc: penberg, kvm, asias.hejun, gorcunov, prasadjoshi124


* Sasha Levin <levinsasha928@gmail.com> wrote:

> On Mon, 2011-05-30 at 10:35 +0200, Ingo Molnar wrote:
> > * Sasha Levin <levinsasha928@gmail.com> wrote:
> > 
> > > Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> > > ---
> > >  tools/kvm/include/kvm/kvm.h |    2 +-
> > >  1 files changed, 1 insertions(+), 1 deletions(-)
> > > 
> > > diff --git a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h
> > > index f951f2d..6a17362 100644
> > > --- a/tools/kvm/include/kvm/kvm.h
> > > +++ b/tools/kvm/include/kvm/kvm.h
> > > @@ -11,7 +11,7 @@
> > >  #define KVM_32BIT_GAP_SIZE	(512 << 20)
> > >  #define KVM_32BIT_GAP_START	((1ULL << 32) - KVM_32BIT_GAP_SIZE)
> > >  
> > > -#define SIGKVMEXIT		(SIGUSR1 + 2)
> > > +#define SIGKVMEXIT		(SIGRTMIN + 0)
> > 
> > I think the changelog is missing essential information like what the 
> > effects of this bug were in practice. ('none' is a good answer as 
> > well.)
> 
> None - it made us exit when receiving SIGUSR2, which wasn't in use at
> that time (now it's used to pause the guest - that's how I found it).

So the bug did have an effect after all - worth mentioning that in 
the changelog; fixes like this do not usually happen in a vacuum! :-)

Thanks,

	Ingo

* Re: [PATCH v2 8/8] kvm tools: Use brlock in MMIO and IOPORT
  2011-05-30  8:47   ` Ingo Molnar
@ 2011-05-30  8:56     ` Sasha Levin
  0 siblings, 0 replies; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  8:56 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: penberg, kvm, asias.hejun, gorcunov, prasadjoshi124

On Mon, 2011-05-30 at 10:47 +0200, Ingo Molnar wrote:
> * Sasha Levin <levinsasha928@gmail.com> wrote:
> 
> > @@ -55,41 +56,53 @@ static const char *to_direction(u8 is_write)
> >  bool kvm__register_mmio(u64 phys_addr, u64 phys_addr_len, void (*kvm_mmio_callback_fn)(u64 addr, u8 *data, u32 len, u8 is_write))
> >  {
> >  	struct mmio_mapping *mmio;
> > +	int ret;
> >  
> >  	mmio = malloc(sizeof(*mmio));
> >  	if (mmio == NULL)
> >  		return false;
> >  
> > +	br_write_lock();
> >  	*mmio = (struct mmio_mapping) {
> >  		.node = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
> >  		.kvm_mmio_callback_fn = kvm_mmio_callback_fn,
> >  	};
> 
> The initialization here does not use any global state AFAICS so it 
> does not need the write lock, right?
> 
> >  
> > -	return mmio_insert(&mmio_tree, mmio);
> > +	ret = mmio_insert(&mmio_tree, mmio);
> > +	br_write_unlock();
> 
> Shouldn't mmio_insert() thus have the write_lock()/unlock() sequence?

Yes, the lock can be reduced to just the insert.
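
I.e. initialize the private allocation outside the lock and wrap only
the tree update - a sketch:

	*mmio = (struct mmio_mapping) {
		.node = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
		.kvm_mmio_callback_fn = kvm_mmio_callback_fn,
	};

	br_write_lock();
	ret = mmio_insert(&mmio_tree, mmio);
	br_write_unlock();

	return ret;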

> >  bool kvm__deregister_mmio(u64 phys_addr)
> >  {
> >  	struct mmio_mapping *mmio;
> >  
> > +	br_write_lock();
> >  	mmio = mmio_search_single(&mmio_tree, phys_addr);
> >  	if (mmio == NULL)
> >  		return false;
> 
> Here we leak the write lock!
> 
> >  bool kvm__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write)
> >  {
> > -	struct mmio_mapping *mmio = mmio_search(&mmio_tree, phys_addr, len);
> > +	struct mmio_mapping *mmio;
> > +
> > +	br_read_lock();
> > +	mmio = mmio_search(&mmio_tree, phys_addr, len);
> >  
> >  	if (mmio)
> >  		mmio->kvm_mmio_callback_fn(phys_addr, data, len, is_write);
> >  	else
> >  		fprintf(stderr, "Warning: Ignoring MMIO %s at %016llx (length %u)\n",
> >  			to_direction(is_write), phys_addr, len);
> > +	br_read_unlock();
> >  
> >  	return true;
> >  }
> 
> Yummie, scalability here we come! :-)
> 
> Thanks,
> 
> 	Ingo

-- 

Sasha.


* Re: [PATCH v2 2/8] kvm tools: Remove wrong global definition of kvm
  2011-05-30  8:38   ` Ingo Molnar
@ 2011-05-30  8:59     ` Sasha Levin
  0 siblings, 0 replies; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  8:59 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: penberg, kvm, asias.hejun, gorcunov, prasadjoshi124

On Mon, 2011-05-30 at 10:38 +0200, Ingo Molnar wrote:
> * Sasha Levin <levinsasha928@gmail.com> wrote:
> 
> > Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> > ---
> >  tools/kvm/include/kvm/mptable.h |    2 +-
> >  1 files changed, 1 insertions(+), 1 deletions(-)
> > 
> > diff --git a/tools/kvm/include/kvm/mptable.h b/tools/kvm/include/kvm/mptable.h
> > index 3c8362d..8557ae8 100644
> > --- a/tools/kvm/include/kvm/mptable.h
> > +++ b/tools/kvm/include/kvm/mptable.h
> > @@ -1,7 +1,7 @@
> >  #ifndef KVM_MPTABLE_H_
> >  #define KVM_MPTABLE_H_
> >  
> > -struct kvm kvm;
> > +struct kvm;
> 
> heh, that was a funny typo.
> 
> Btw., could we do a 'make test' run that includes every header file 
> in a standalone .c file and checks whether that builds without 
> warnings/errors? They are enumerated in the Makefile already.

They're not - we need to get it done first.

> That would check tools/kvm/ headers for sanity, type encapsulation and
> self-sufficiency and would prevent the kind of header-dependencies
> spaghetti mess the kernel got into after ~20 years of randomly added 
> kernel dependencies.
> 
> Thanks,
> 
> 	Ingo

-- 

Sasha.


* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30  8:43   ` Ingo Molnar
@ 2011-05-30  9:29     ` Pekka Enberg
  2011-05-30  9:34       ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Pekka Enberg @ 2011-05-30  9:29 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Sasha Levin, kvm, asias.hejun, gorcunov, prasadjoshi124,
	Paul E. McKenney

On Mon, May 30, 2011 at 11:43 AM, Ingo Molnar <mingo@elte.hu> wrote:
> Testing idea: for example 'make test locking' could do the bzImage
> test-bootup, with rwlocks built instead of the brlocks?
>
> Pekka might have more ideas about how a good locking test-suite
> should be done, as JATO has one, right?

The current users of brlocks won't actually cause the guest to be
paused under load. That's the part I worry about breaking. So again,
the test case can be as simple as firing up 100-1000 threads where most
of them take the read lock while a few race to take the
write lock.
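
Something along these lines would do (an untested sketch - it assumes
the KVM_BRLOCK_DEBUG rwsem-backed variant so that plain pthreads can
take the read side; the thread and iteration counts are arbitrary):

	#include <pthread.h>

	#include "kvm/brlock.h"	/* built with KVM_BRLOCK_DEBUG */

	#define NR_READERS	256
	#define NR_WRITERS	4

	static void *reader_fn(void *arg)
	{
		int i;

		for (i = 0; i < 100000; i++) {
			br_read_lock();
			/* short read-side critical section */
			br_read_unlock();
		}
		return NULL;
	}

	static void *writer_fn(void *arg)
	{
		int i;

		for (i = 0; i < 1000; i++) {
			br_write_lock();
			/* rare update */
			br_write_unlock();
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t t[NR_READERS + NR_WRITERS];
		int i;

		for (i = 0; i < NR_READERS; i++)
			pthread_create(&t[i], NULL, reader_fn, NULL);
		for (; i < NR_READERS + NR_WRITERS; i++)
			pthread_create(&t[i], NULL, writer_fn, NULL);

		/* a broken implementation shows up as a hang here */
		for (i = 0; i < NR_READERS + NR_WRITERS; i++)
			pthread_join(t[i], NULL);

		return 0;
	}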

It's usually pretty easy to make a buggy suspend/locking implementation
deadlock when run with more than one physical core.

                        Pekka

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30  9:29     ` Pekka Enberg
@ 2011-05-30  9:34       ` Sasha Levin
  2011-05-30  9:40         ` Pekka Enberg
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  9:34 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Ingo Molnar, kvm, asias.hejun, gorcunov, prasadjoshi124,
	Paul E. McKenney

On Mon, 2011-05-30 at 12:29 +0300, Pekka Enberg wrote:
> On Mon, May 30, 2011 at 11:43 AM, Ingo Molnar <mingo@elte.hu> wrote:
> > Testing idea: for example 'make test locking' could do the bzImage
> > test-bootup, with rwlocks built instead of the brlocks?
> >
> > Pekka might have more ideas about how a good locking test-suite
> > should be done, as JATO has one, right?
> 
> The current users of brlocks won't actually cause the guest to be
> paused under load. That's the part I worry about breaking. So again,
> the test case can be as simple as firing up 100-1000 threads where most
> of them take the read lock while a few race to take the
> write lock.

It would mean we need that many VCPUs: current br_read_lock() doesn't
really do anything, which means that running these tests with dummy
threads won't work.

> It's usually pretty easy to make a buggy suspend/locking implementation
> deadlock when run with more than one physical core.
> 
>                         Pekka

-- 

Sasha.


* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30  9:34       ` Sasha Levin
@ 2011-05-30  9:40         ` Pekka Enberg
  2011-05-30  9:46           ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Pekka Enberg @ 2011-05-30  9:40 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Ingo Molnar, kvm, asias.hejun, gorcunov, prasadjoshi124,
	Paul E. McKenney

Hi Sasha,

On Mon, May 30, 2011 at 12:34 PM, Sasha Levin <levinsasha928@gmail.com> wrote:
> It would mean we need that many VCPUs: current br_read_lock() doesn't
> really do anything, which means that running these tests with dummy
> threads won't work.

Heh, sure they're doing something - they're burning CPU. You assume
that br_write_unlock() will actually resume all of them but I'd really
like to see a reproducible test case for that. ;-)

Do you think it's a bad idea? I don't quite see why - as I've said
before, this is something we learned the hard way when we implemented
our own locking primitives and stop-the-world for Jato. Those are just
so damn easy to break accidentally so having some safety net is
definitely worth it.

Also, didn't Paul suggest some debugging magic to detect errors?

                       Pekka

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30  9:40         ` Pekka Enberg
@ 2011-05-30  9:46           ` Sasha Levin
  2011-05-30  9:48             ` Pekka Enberg
  2011-05-30  9:56             ` Ingo Molnar
  0 siblings, 2 replies; 72+ messages in thread
From: Sasha Levin @ 2011-05-30  9:46 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Ingo Molnar, kvm, asias.hejun, gorcunov, prasadjoshi124,
	Paul E. McKenney

On Mon, 2011-05-30 at 12:40 +0300, Pekka Enberg wrote:
> Hi Sasha,
> 
> On Mon, May 30, 2011 at 12:34 PM, Sasha Levin <levinsasha928@gmail.com> wrote:
> > It would mean we need that many VCPUs: current br_read_lock() doesn't
> > really do anything, which means that running these tests with dummy
> > threads won't work.
> 
> Heh, sure they're doing something - they're burning CPU. You assume
> that br_write_unlock() will actually resume all of them but I'd really
> like to see a reproducible test case for that. ;-)

I agree with what you're saying: whether br_write_lock() makes
all VCPU threads stop and br_write_unlock() makes all VCPU threads
resume should be tested.

I'm just saying that we're limited to as many VCPU threads as we can
create. br_read_lock() won't do anything on a non-VCPU thread, which
makes it impossible to test it on non-VCPUs.

> Do you think it's a bad idea? I don't quite see why - as I've said
> before, this is something we learned the hard way when we implemented
> our own locking primitives and stop-the-world for Jato. Those are just
> so damn easy to break accidentally so having some safety net is
> definitely worth it.
> 
> Also, didn't Paul suggest some debugging magic to detect errors?
> 
>                        Pekka

-- 

Sasha.


* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30  9:46           ` Sasha Levin
@ 2011-05-30  9:48             ` Pekka Enberg
  2011-05-30  9:54               ` Ingo Molnar
  2011-05-30  9:56             ` Ingo Molnar
  1 sibling, 1 reply; 72+ messages in thread
From: Pekka Enberg @ 2011-05-30  9:48 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Ingo Molnar, kvm, asias.hejun, gorcunov, prasadjoshi124,
	Paul E. McKenney, Avi Kivity

On Mon, May 30, 2011 at 12:46 PM, Sasha Levin <levinsasha928@gmail.com> wrote:
> I'm just saying that we're limited to as many VCPU threads as we can
> create. br_read_lock() won't do anything on a non-VCPU thread, which
> makes it impossible to test it on non-VCPUs.

It sure would be useful to be able to fire up 4096 VCPUs... ;-)

But let's start out with 64 VCPUs, OK? It's better than nothing.

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30  9:48             ` Pekka Enberg
@ 2011-05-30  9:54               ` Ingo Molnar
  2011-05-30 11:11                 ` Takuya Yoshikawa
  0 siblings, 1 reply; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30  9:54 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Sasha Levin, kvm, asias.hejun, gorcunov, prasadjoshi124,
	Paul E. McKenney, Avi Kivity


* Pekka Enberg <penberg@kernel.org> wrote:

> On Mon, May 30, 2011 at 12:46 PM, Sasha Levin <levinsasha928@gmail.com> wrote:
> > I'm just saying that we're limited to as many VCPU threads as we can
> > create. br_read_lock() won't do anything on a non-VCPU thread, which
> > makes it impossible to test it on non-VCPUs.
> 
> It sure would be useful to be able to fire up 4096 VCPUs... ;-)
> 
> But let's start out with 64 VCPUs, OK? It's better than nothing.

In practice 64 VCPUs ought to be able to find a good deal of bugs 
already.

Thanks,

	Ingo

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30  9:46           ` Sasha Levin
  2011-05-30  9:48             ` Pekka Enberg
@ 2011-05-30  9:56             ` Ingo Molnar
  2011-05-30 10:05               ` Sasha Levin
  1 sibling, 1 reply; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30  9:56 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	Paul E. McKenney


* Sasha Levin <levinsasha928@gmail.com> wrote:

> I'm just saying that we're limited to as many VCPU threads as we 
> can create. br_read_lock() won't do anything on a non-VCPU thread, 
> which makes it impossible to test it on non-VCPUs.

btw., i wondered about that limit - don't we want to fix it?

I mean, there's no fundamental reason why brlocks should do 'nothing' 
in worker threads. In fact it's a subtle breakage waiting AFAICS.

We should have enumeration for all threads that kvm starts, and that 
we can use for a generic pause/resume facility. Can you see anything 
that prevents that model?

Thanks,

	Ingo

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30  9:56             ` Ingo Molnar
@ 2011-05-30 10:05               ` Sasha Levin
  2011-05-30 10:13                 ` Ingo Molnar
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-05-30 10:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	Paul E. McKenney

On Mon, 2011-05-30 at 11:56 +0200, Ingo Molnar wrote:
> * Sasha Levin <levinsasha928@gmail.com> wrote:
> 
> > I'm just saying that we're limited to as many VCPU threads as we 
> > can create. br_read_lock() won't do anything on a non-VCPU thread, 
> > which makes it impossible to test it on non-VCPUs.
> 
> btw., i wondered about that limit - don't we want to fix it?
> 
> I mean, there's no fundamental reason why brlocks should do 'nothing' 
> in worker threads. In fact it's a subtle breakage waiting AFAICS.
> 
Can they do anything useful without locking? I think we should work on
integrating an RCU and changing brlocks to use that instead of focusing
too much on the current implementation.
This will also fix that limit you don't like :)

> We should have enumeration for all threads that kvm starts, and that 
> we can use for a generic pause/resume facility. Can you see anything 
> that prevents that model?
> 
There's a difference between how you would pause a VCPU thread as opposed
to a non-VCPU thread; other than that - no. We could use a thread
manager.

> Thanks,
> 
> 	Ingo

-- 

Sasha.


* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 10:05               ` Sasha Levin
@ 2011-05-30 10:13                 ` Ingo Molnar
  2011-05-30 10:22                   ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30 10:13 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	Paul E. McKenney


* Sasha Levin <levinsasha928@gmail.com> wrote:

> On Mon, 2011-05-30 at 11:56 +0200, Ingo Molnar wrote:
> > * Sasha Levin <levinsasha928@gmail.com> wrote:
> > 
> > > I'm just saying that we're limited to as many VCPU threads as we 
> > > can create. br_read_lock() won't do anything on a non-VCPU thread, 
> > > which makes it impossible to test it on non-VCPUs.
> > 
> > btw., i wondered about that limit - don't we want to fix it?
> > 
> > I mean, there's no fundamental reason why brlocks should do 'nothing' 
> > in worker threads. In fact it's a subtle breakage waiting AFAICS.
> 
> Can they do anything useful without locking? I think we should work 
> on integrating an RCU and changing brlocks to use that instead of 
> focusing too much on the current implementation.

What do you mean 'without locking'? If a worker thread uses a 
br_read_lock() then that will be 'locking'. It should map to a real 
read_lock() in the rwlock debug case, etc.

> This will also fix that limit you don't like :)

I'd prefer brlocks to more complex solutions in cases where the write 
path is very infrequent!

So we don't want to keep brlocks intentionally crippled.

Currently we should at minimum BUG_ON() if br_read_lock() or 
br_write_lock() is called in a worker thread context, right?

> > We should have enumeration for all threads that kvm starts, and 
> > that we can use for a generic pause/resume facility. Can you see 
> > anything that prevents that model?
> 
> There's a difference between how you would pause a VCPU thread as 
> opposed to a non-VCPU thread; other than that - no. We could use a 
> thread manager.

Exactly, which would be useful for other purposes as well :-)

This only involves the write path so it is not an issue.

Thanks,

	Ingo

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 10:13                 ` Ingo Molnar
@ 2011-05-30 10:22                   ` Sasha Levin
  2011-05-30 10:30                     ` Ingo Molnar
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-05-30 10:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	Paul E. McKenney

On Mon, 2011-05-30 at 12:13 +0200, Ingo Molnar wrote:
> * Sasha Levin <levinsasha928@gmail.com> wrote:
> 
> > On Mon, 2011-05-30 at 11:56 +0200, Ingo Molnar wrote:
> > > * Sasha Levin <levinsasha928@gmail.com> wrote:
> > > 
> > > > I'm just saying that we're limited to as many VCPU threads as we 
> > > > can create. br_read_lock() won't do anything on a non-VCPU thread, 
> > > > which makes it impossible to test it on non-VCPUs.
> > > 
> > > btw., i wondered about that limit - don't we want to fix it?
> > > 
> > > I mean, there's no fundamental reason why brlocks should do 'nothing' 
> > > in worker threads. In fact it's a subtle breakage waiting AFAICS.
> > 
> > Can they do anything useful without locking? I think we should work 
> > on integrating an RCU and changing brlocks to use that instead of 
> > focusing too much on the current implementation.
> 
> What do you mean 'without locking'? If a worker thread uses a 
> br_read_lock() then that will be 'locking'. It should map to a real 
> read_lock() in the rwlock debug case, etc.
> 
I meant without locking anything within br_read_lock(), because we
wanted to keep the read path lock-free.

> > This will also fix that limit you don't like :)
> 
> I'd prefer brlocks to more complex solutions in cases where the write 
> path is very infrequent!
> 
> So we don't want to keep brlocks intentionally crippled.
> 
Do you see brlock as a global lock that will pause the entire guest (not
just VCPUs - anything except the calling thread)?

> Currently we should at minimum BUG_ON() if br_read_lock() or 
> br_write_lock() is called in a worker thread context, right?
> 
> > > We should have enumeration for all threads that kvm starts, and 
> > > that we can use for a generic pause/resume facility. Can you see 
> > > anything that prevents that model?
> > 
> > Theres a difference between how you would pause a VCPU thread as 
> > opposed to a non-VCPU thread, other than that - no. We could use a 
> > thread manager.
> 
> Exactly, which would be useful for other purposes as well :-)
> 
> This only involves the write path so it is not an issue.
> 
> Thanks,
> 
> 	Ingo

-- 

Sasha.


* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 10:22                   ` Sasha Levin
@ 2011-05-30 10:30                     ` Ingo Molnar
  2011-05-30 10:41                       ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30 10:30 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	Paul E. McKenney


* Sasha Levin <levinsasha928@gmail.com> wrote:

> On Mon, 2011-05-30 at 12:13 +0200, Ingo Molnar wrote:
> > * Sasha Levin <levinsasha928@gmail.com> wrote:
> > 
> > > On Mon, 2011-05-30 at 11:56 +0200, Ingo Molnar wrote:
> > > > * Sasha Levin <levinsasha928@gmail.com> wrote:
> > > > 
> > > > > I'm just saying that we're limited to as many VCPU threads as we 
> > > > > can create. br_read_lock() won't do anything on a non-VCPU thread, 
> > > > > which makes it impossible to test it on non-VCPUs.
> > > > 
> > > > btw., i wondered about that limit - don't we want to fix it?
> > > > 
> > > > I mean, there's no fundamental reason why brlocks should do 'nothing' 
> > > > in worker threads. In fact it's a subtle breakage waiting AFAICS.
> > > 
> > > Can they do anything useful without locking? I think we should work 
> > > on integrating an RCU and changing brlocks to use that instead of 
> > > focusing too much on the current implementation.
> > 
> > What do you mean 'without locking'? If a worker thread uses a 
> > br_read_lock() then that will be 'locking'. It should map to a real 
> > read_lock() in the rwlock debug case, etc.
> > 
> I meant without locking anything within br_read_lock(), because we
> wanted to keep the read path lock-free.

oh, so it's not recursive.

Sane enough - might be worth adding:

	br_is_read_locked(&lock)

and a debug check for that into br_read_lock():

	BUG_ON(br_is_read_locked(&lock));

> > > This will also fix that limit you don't like :)
> > 
> > I'd prefer brlocks to more complex solutions in cases where the write 
> > path is very infrequent!
> > 
> > So we don't want to keep brlocks intentionally crippled.
> 
> Do you see brlock as a global lock that will pause the entire guest 
> (not just VCPUs - anything except the calling thread)?

Yeah, that's how such brlocks work - life has to stop when there are 
write modifications going on.

There should be a mutex around br_write_lock() itself, to make sure 
two br_write_lock() attempts cannot deadlock each other, but other 
than that it should be pretty straightforward and robust.
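
I.e. something like this (a sketch; the mutex name is made up):

	static DEFINE_MUTEX(brlock_writer_mutex);

	#define br_write_lock()		do {			\
			mutex_lock(&brlock_writer_mutex);	\
			kvm__pause();				\
		} while (0)

	#define br_write_unlock()	do {			\
			kvm__continue();			\
			mutex_unlock(&brlock_writer_mutex);	\
		} while (0)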

And note that such a pause/suspend thing might be helpful to do a 
*real* host driven suspend feature in the future: stop all vcpus, all 
worker threads, save state to disk and exit?

Thanks,

	Ingo

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 10:30                     ` Ingo Molnar
@ 2011-05-30 10:41                       ` Sasha Levin
  0 siblings, 0 replies; 72+ messages in thread
From: Sasha Levin @ 2011-05-30 10:41 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	Paul E. McKenney

On Mon, 2011-05-30 at 12:30 +0200, Ingo Molnar wrote:
> * Sasha Levin <levinsasha928@gmail.com> wrote:
> 
> > On Mon, 2011-05-30 at 12:13 +0200, Ingo Molnar wrote:
> > > * Sasha Levin <levinsasha928@gmail.com> wrote:
> > > 
> > > > On Mon, 2011-05-30 at 11:56 +0200, Ingo Molnar wrote:
> > > > > * Sasha Levin <levinsasha928@gmail.com> wrote:
> > > > > 
> > > > > > I'm just saying that we're limited to as many VCPU threads as we 
> > > > > > can create. br_read_lock() won't do anything on a non-VCPU thread, 
> > > > > > which makes it impossible to test it on non-VCPUs.
> > > > > 
> > > > > btw., i wondered about that limit - don't we want to fix it?
> > > > > 
> > > > > I mean, there's no fundamental reason why brlocks should do 'nothing' 
> > > > > in worker threads. In fact it's a subtle breakage waiting AFAICS.
> > > > 
> > > > Can they do anything useful without locking? I think we should work 
> > > > on integrating an RCU and changing brlocks to use that instead of 
> > > > focusing too much on the current implementation.
> > > 
> > > What do you mean 'without locking'? If a worker thread uses a 
> > > br_read_lock() then that will be 'locking'. It should map to a real 
> > > read_lock() in the rwlock debug case, etc.
> > > 
> > I meant without locking anything within br_read_lock(), because we
> > wanted to keep the read path lock-free.
> 
> oh, so it's not recursive.
> 
> Sane enough - might be worth adding:
> 
> 	br_is_read_locked(&lock)
> 
> and a debug check for that into br_read_lock():
> 
> 	BUG_ON(br_is_read_locked(&lock));
> 
> > > > This will also fix that limit you don't like :)
> > > 
> > > I'd prefer brlocks to more complex solutions in cases where the write 
> > > path is very infrequent!
> > > 
> > > So we don't want to keep brlocks intentionally crippled.
> > 
> > Do you see brlock as a global lock that will pause the entire guest 
> > (not just VCPUs - anything except the calling thread)?
> 
> Yeah, that's how such brlocks work - life has to stop when there are 
> write modifications going on.
> 
> There should be a mutex around br_write_lock() itself, to make sure 
> two br_write_lock() attempts cannot deadlock each other, but other 
> than that it should be pretty straightforward and robust.
> 
Yes, it'll need to wait for a thread manager to be added so it can be
un-crippled.

> And note that such a pause/suspend thing might be helpful to do a 
> *real* host driven suspend feature in the future: stop all vcpus, all 
> worker threads, save state to disk and exit?
> 
> Thanks,
> 
> 	Ingo

-- 

Sasha.


* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30  9:54               ` Ingo Molnar
@ 2011-05-30 11:11                 ` Takuya Yoshikawa
  2011-05-30 11:12                   ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Takuya Yoshikawa @ 2011-05-30 11:11 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Sasha Levin, kvm, asias.hejun, gorcunov,
	prasadjoshi124, Paul E. McKenney, Avi Kivity, takuya.yoshikawa

On Mon, 30 May 2011 11:54:51 +0200
Ingo Molnar <mingo@elte.hu> wrote:

> 
> * Pekka Enberg <penberg@kernel.org> wrote:
> 
> > On Mon, May 30, 2011 at 12:46 PM, Sasha Levin <levinsasha928@gmail.com> wrote:
> > > I'm just saying that we're limited to as many VCPU threads as we can
> > > create. br_read_lock() won't do anything on a non-VCPU thread, which
> > > makes it impossible to test it on non-VCPUs.
> > 
> > It sure would be useful to be able to fire up 4096 VCPUs... ;-)
> > 
> > But let's start out with 64 VCPUs, OK? It's better than nothing.
> 
> In practice 64 VCPUs ought to be able to find a good deal of bugs 
> already.

Does kvm tools allow users to start more VCPUs than the actual cores?

IIRC, qemu rejects such settings.

  Takuya

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 11:11                 ` Takuya Yoshikawa
@ 2011-05-30 11:12                   ` Sasha Levin
  2011-05-30 11:26                     ` Takuya Yoshikawa
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-05-30 11:12 UTC (permalink / raw)
  To: Takuya Yoshikawa
  Cc: Ingo Molnar, Pekka Enberg, kvm, asias.hejun, gorcunov,
	prasadjoshi124, Paul E. McKenney, Avi Kivity, takuya.yoshikawa

On Mon, 2011-05-30 at 20:11 +0900, Takuya Yoshikawa wrote:
> On Mon, 30 May 2011 11:54:51 +0200
> Ingo Molnar <mingo@elte.hu> wrote:
> 
> > 
> > * Pekka Enberg <penberg@kernel.org> wrote:
> > 
> > > On Mon, May 30, 2011 at 12:46 PM, Sasha Levin <levinsasha928@gmail.com> wrote:
> > > > I'm just saying that we're limited to as many VCPU threads as we can
> > > > create. br_read_lock() won't do anything on a non-VCPU thread, which
> > > > makes it impossible to test it on non-VCPUs.
> > > 
> > > It sure would be useful to be able to fire up 4096 VCPUs... ;-)
> > > 
> > > But let's start out with 64 VCPUs, OK? It's better than nothing.
> > 
> > In practice 64 VCPUs ought to be able to find a good deal of bugs 
> > already.
> 
> Does kvm tools allow users to start more VCPUs than there are physical cores?
> 
Yes.

> IIRC, qemu rejects such settings.
> 
qemu also allows having more VCPUs than cores.

>   Takuya

-- 

Sasha.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 11:12                   ` Sasha Levin
@ 2011-05-30 11:26                     ` Takuya Yoshikawa
  2011-05-30 11:39                       ` Avi Kivity
  0 siblings, 1 reply; 72+ messages in thread
From: Takuya Yoshikawa @ 2011-05-30 11:26 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Ingo Molnar, Pekka Enberg, kvm, asias.hejun, gorcunov,
	prasadjoshi124, Paul E. McKenney, Avi Kivity, takuya.yoshikawa

On Mon, 30 May 2011 14:12:34 +0300
Sasha Levin <levinsasha928@gmail.com> wrote:

> On Mon, 2011-05-30 at 20:11 +0900, Takuya Yoshikawa wrote:
> > On Mon, 30 May 2011 11:54:51 +0200
> > Ingo Molnar <mingo@elte.hu> wrote:
> > 
> > > 
> > > * Pekka Enberg <penberg@kernel.org> wrote:
> > > 
> > > > On Mon, May 30, 2011 at 12:46 PM, Sasha Levin <levinsasha928@gmail.com> wrote:
> > > > > I'm just saying that we're limited to as many VCPU threads as we can
> > > > > create. br_read_lock() won't do anything on a non-VCPU thread, which
> > > > > makes it impossible to test it on non-VCPUs.
> > > > 
> > > > It sure would be useful to be able to fire up 4096 VCPUs... ;-)
> > > > 
> > > > But let's start out with 64 VCPUs, OK? It's better than nothing.
> > > 
> > > In practice 64 VCPUs ought to be able to find a good deal of bugs 
> > > already.
> > 
> > Does kvm tools allow users to start more VCPUs than there are physical cores?
> > 
> Yes.
> 
> > IIRC, qemu rejects such settings.
> > 
> qemu also allows having more VCPUs than cores.

I have to check again, then :) Thank you!
I will try both with many VCPUs.

  Takuya

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 11:26                     ` Takuya Yoshikawa
@ 2011-05-30 11:39                       ` Avi Kivity
  2011-05-30 11:49                         ` Ingo Molnar
  0 siblings, 1 reply; 72+ messages in thread
From: Avi Kivity @ 2011-05-30 11:39 UTC (permalink / raw)
  To: Takuya Yoshikawa
  Cc: Sasha Levin, Ingo Molnar, Pekka Enberg, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On 05/30/2011 02:26 PM, Takuya Yoshikawa wrote:
> >  >
> >  qemu also allows having more VCPUs than cores.
>
> I have to check again, then :) Thank you!
> I will try both with many VCPUs.

Note, with cpu overcommit the results are going to be bad.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 11:39                       ` Avi Kivity
@ 2011-05-30 11:49                         ` Ingo Molnar
  2011-05-30 11:55                           ` Pekka Enberg
                                             ` (2 more replies)
  0 siblings, 3 replies; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30 11:49 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Takuya Yoshikawa, Sasha Levin, Pekka Enberg, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa


* Avi Kivity <avi@redhat.com> wrote:

> On 05/30/2011 02:26 PM, Takuya Yoshikawa wrote:
> >>  >
> >>  qemu also allows having more VCPUs than cores.
> >
> >I have to check again, then :) Thank you!
> >I will try both with many VCPUs.
> 
> Note, with cpu overcommit the results are going to be bad.

And that is good: if pushed hard enough it will trigger exciting (or 
obscure) bugs in the guest kernel much faster than if there's no 
overcommit, so it's rather useful for testing. (We had that 
surprising experience with -rt.)

Also, such simulation would be very obviously useful if you get 
bugreports about 1024 or 4096 CPUs, like i do sometimes! :-) [*]

Thanks,

	Ingo

[*] Would be nice if tools/kvm/ had a debug option to simulate 'lots
    of RAM' as well somehow - perhaps by not pre-initializing it and
    somehow catching all-zeroes pages and keeping them all zeroes and
    shared? It would obviously OOM after some time but would allow me
    to at least boot a fair deal of userspace. The motivation is that
    i have recently received a 1 TB RAM bugreport. 1 TB of RAM mapped 
    with 2MB mappings should still be able to boot to shell on a 32 
    GB RAM testbox of mine, and survive there for some time. We could
    even do some kernel modifications to make this kind of simulation 
    easier.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 11:49                         ` Ingo Molnar
@ 2011-05-30 11:55                           ` Pekka Enberg
  2011-05-30 11:58                             ` Sasha Levin
  2011-05-30 12:04                           ` Avi Kivity
  2011-05-30 14:16                           ` Takuya Yoshikawa
  2 siblings, 1 reply; 72+ messages in thread
From: Pekka Enberg @ 2011-05-30 11:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Avi Kivity, Takuya Yoshikawa, Sasha Levin, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On Mon, May 30, 2011 at 2:49 PM, Ingo Molnar <mingo@elte.hu> wrote:
> [*] Would be nice if tools/kvm/ had a debug option to simulate 'lots
>    of RAM' as well somehow - perhaps by not pre-initializing it and
>    somehow catching all-zeroes pages and keeping them all zeroes and
>    shared? It would obviously OOM after some time but would allow me
>    to at least boot a fair deal of userspace. The motivation is that
>    i have recently received a 1 TB RAM bugreport. 1 TB of RAM mapped
>    with 2MB mappings should still be able to boot to shell on a 32
>    GB RAM testbox of mine, and survive there for some time. We could
>    even do some kernel modifications to make this kind of simulation
>    easier.

Doesn't our mmap() overcommit work for you? IIRC, Sasha booted guests
with huge amounts of overcommitted RAM.

                             Pekka

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 11:55                           ` Pekka Enberg
@ 2011-05-30 11:58                             ` Sasha Levin
  2011-05-30 12:20                               ` Ingo Molnar
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-05-30 11:58 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Ingo Molnar, Avi Kivity, Takuya Yoshikawa, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On Mon, 2011-05-30 at 14:55 +0300, Pekka Enberg wrote:
> On Mon, May 30, 2011 at 2:49 PM, Ingo Molnar <mingo@elte.hu> wrote:
> > [*] Would be nice if tools/kvm/ had a debug option to simulate 'lots
> >    of RAM' as well somehow - perhaps by not pre-initializing it and
> >    somehow catching all-zeroes pages and keeping them all zeroes and
> >    shared? It would obviously OOM after some time but would allow me
> >    to at least boot a fair deal of userspace. The motivation is that
> >    i have recently received a 1 TB RAM bugreport. 1 TB of RAM mapped
> >    with 2MB mappings should still be able to boot to shell on a 32
> >    GB RAM testbox of mine, and survive there for some time. We could
> >    even do some kernel modifications to make this kind of simulation
> >    easier.
> 
> Doesn't our mmap() overcommit work for you? IIRC, Sasha booted guests
> with huge amounts of overcommitted RAM.

It should. I can (easily) boot a 64GB guest on my 4GB laptop. We fail at
>64GB due to an issue with our mappings (I haven't investigated it yet)
- not due to running out of memory.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 11:49                         ` Ingo Molnar
  2011-05-30 11:55                           ` Pekka Enberg
@ 2011-05-30 12:04                           ` Avi Kivity
  2011-05-30 12:36                             ` Ingo Molnar
  2011-05-30 14:16                           ` Takuya Yoshikawa
  2 siblings, 1 reply; 72+ messages in thread
From: Avi Kivity @ 2011-05-30 12:04 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Takuya Yoshikawa, Sasha Levin, Pekka Enberg, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On 05/30/2011 02:49 PM, Ingo Molnar wrote:
> * Avi Kivity<avi@redhat.com>  wrote:
>
> >  On 05/30/2011 02:26 PM, Takuya Yoshikawa wrote:
> >  >>   >
> >  >>   qemu also allows having more VCPUs than cores.
> >  >
> >  >I have to check again, then :) Thank you!
> >  >I will try both with many VCPUs.
> >
> >  Note, with cpu overcommit the results are going to be bad.
>
> And that is good: if pushed hard enough it will trigger exciting (or
> obscure) bugs in the guest kernel much faster than if there's no
> >overcommit, so it's rather useful for testing. (We had that
> surprising experience with -rt.)
>
> Also, such simulation would be very obviously useful if you get
> bugreports about 1024 or 4096 CPUs, like i do sometimes! :-) [*]

I'll be surprised if 1024 cpus actually boot on a reasonable machine.  
Without PLE support, any busy wait (like smp_call_function_single()) 
turns into a delay the length of a scheduler time slice (or CFS's 
unfairness measure, I forget what it's called) - 3 or 4 orders of magnitude larger.  
Even with PLE, it's significantly slower, plus 1-2 orders of magnitude 
loss from the overcommit itself.

> [*] Would be nice if tools/kvm/ had a debug option to simulate 'lots
>      of RAM' as well somehow - perhaps by not pre-initializing it and
>      somehow catching all-zeroes pages and keeping them all zeroes and
>      shared? It would obviously OOM after some time but would allow me
>      to at least boot a fair deal of userspace. The motivation is that
>      i have recently received a 1 TB RAM bugreport. 1 TB of RAM mapped
>      with 2MB mappings should still be able to boot to shell on a 32
>      GB RAM testbox of mine, and survive there for some time. We could
>      even do some kernel modifications to make this kind of simulation
>      easier.

This should work fine - I just booted a 128GB guest on a 4GB host.  Just 
set /proc/sys/vm/overcommit_memory to 1 (and disable transparent hugepages, 
though the kernel mostly allocates contiguous memory).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 11:58                             ` Sasha Levin
@ 2011-05-30 12:20                               ` Ingo Molnar
  2011-05-30 12:22                                 ` Sasha Levin
  2011-05-30 12:23                                 ` Avi Kivity
  0 siblings, 2 replies; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30 12:20 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Pekka Enberg, Avi Kivity, Takuya Yoshikawa, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa


* Sasha Levin <levinsasha928@gmail.com> wrote:

> On Mon, 2011-05-30 at 14:55 +0300, Pekka Enberg wrote:
> > On Mon, May 30, 2011 at 2:49 PM, Ingo Molnar <mingo@elte.hu> wrote:
> > > [*] Would be nice if tools/kvm/ had a debug option to simulate 'lots
> > >    of RAM' as well somehow - perhaps by not pre-initializing it and
> > >    somehow catching all-zeroes pages and keeping them all zeroes and
> > >    shared? It would obviously OOM after some time but would allow me
> > >    to at least boot a fair deal of userspace. The motivation is that
> > >    i have recently received a 1 TB RAM bugreport. 1 TB of RAM mapped
> > >    with 2MB mappings should still be able to boot to shell on a 32
> > >    GB RAM testbox of mine, and survive there for some time. We could
> > >    even do some kernel modifications to make this kind of simulation
> > >    easier.
> > 
> > Doesn't our mmap() overcommit work for you? IIRC, Sasha booted guests
> > with huge amounts of overcommitted RAM.
> 
> It should. I can (easily) boot a 64GB guest on my 4GB laptop. We fail at
> >64GB due to an issue with our mappings (I haven't investigated it yet)
> - not due to running out of memory.

Oh, that's neat. Btw., i think there should be an option to actually 
*force* blatant overcommit: if i typo the option i'd like the tool to 
tell me that i'm probably stupid and refuse to run with that (and 
suggest the force-overcommit option), instead of trying to swap itself 
to death!

How does this work in practice - i thought we memset() all of RAM 
during guest kernel bootup. That might have changed with the memblock 
allocator ... So the guest kernel does not touch all of that 64 GB of 
RAM, so your box won't OOM straight away?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:20                               ` Ingo Molnar
@ 2011-05-30 12:22                                 ` Sasha Levin
  2011-05-30 12:25                                   ` Avi Kivity
  2011-05-30 12:23                                 ` Avi Kivity
  1 sibling, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-05-30 12:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Avi Kivity, Takuya Yoshikawa, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On Mon, 2011-05-30 at 14:20 +0200, Ingo Molnar wrote:
> * Sasha Levin <levinsasha928@gmail.com> wrote:
> 
> > On Mon, 2011-05-30 at 14:55 +0300, Pekka Enberg wrote:
> > > On Mon, May 30, 2011 at 2:49 PM, Ingo Molnar <mingo@elte.hu> wrote:
> > > > [*] Would be nice if tools/kvm/ had a debug option to simulate 'lots
> > > >    of RAM' as well somehow - perhaps by not pre-initializing it and
> > > >    somehow catching all-zeroes pages and keeping them all zeroes and
> > > >    shared? It would obviously OOM after some time but would allow me
> > > >    to at least boot a fair deal of userspace. The motivation is that
> > > >    i have recently received a 1 TB RAM bugreport. 1 TB of RAM mapped
> > > >    with 2MB mappings should still be able to boot to shell on a 32
> > > >    GB RAM testbox of mine, and survive there for some time. We could
> > > >    even do some kernel modifications to make this kind of simulation
> > > >    easier.
> > > 
> > > Doesn't our mmap() overcommit work for you? IIRC, Sasha booted guests
> > > with huge amounts of overcommitted RAM.
> > 
> > It should. I can (easily) boot a 64GB guest on my 4GB laptop. We fail at
> > >64GB due to an issue with our mappings (I haven't investigated it yet)
> > - not due to running out of memory.
> 
> Oh, that's neat. Btw., i think there should be an option to actually 
> *force* blatant overcommit: if i typo the option i'd like the tool to 
> tell me that i'm probably stupid and refuse to run with that (and 
> suggest the force-overcommit option), instead of trying to swap itself 
> to death!

How much is a 'blatant' overcommit? KVM easily handles 32x the host's RAM.

> How does this work in practice - i thought we memset() all of RAM 
> during guest kernel bootup. That might have changed with the memblock 
> allocator ... So the guest kernel does not touch all of that 64 GB of 
> RAM, so your box won't OOM straight away?

We simply mmap() the guest RAM with MAP_NORESERVE.
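
Roughly like this (a simplified sketch, not the tree's actual code -
the wrapper function is illustrative):

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>

	/* MAP_NORESERVE commits no swap space up front, so a 64GB
	 * guest only costs the pages it actually touches. */
	static void *guest_ram_alloc(size_t ram_size)
	{
		void *ram = mmap(NULL, ram_size, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
				 -1, 0);
		if (ram == MAP_FAILED) {
			perror("mmap");
			exit(1);
		}
		return ram;
	}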

-- 

Sasha.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:20                               ` Ingo Molnar
  2011-05-30 12:22                                 ` Sasha Levin
@ 2011-05-30 12:23                                 ` Avi Kivity
  2011-05-30 12:30                                   ` Pekka Enberg
  2011-05-30 14:10                                   ` Ingo Molnar
  1 sibling, 2 replies; 72+ messages in thread
From: Avi Kivity @ 2011-05-30 12:23 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Sasha Levin, Pekka Enberg, Takuya Yoshikawa, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On 05/30/2011 03:20 PM, Ingo Molnar wrote:
> How does this work in practice - i thought we memset() all of RAM
> during guest kernel bootup. That might have changed with the memblock
> allocator ... So the guest kernel does not touch all of that 64 GB of
> RAM, so your box won't OOM straight away?

IIRC there never was a memset() of all RAM, at least since kvm started 
booting Linux.  Windows has a zeroing thread which causes all of RAM to 
be committed shortly after boot, though.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:22                                 ` Sasha Levin
@ 2011-05-30 12:25                                   ` Avi Kivity
  0 siblings, 0 replies; 72+ messages in thread
From: Avi Kivity @ 2011-05-30 12:25 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Ingo Molnar, Pekka Enberg, Takuya Yoshikawa, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On 05/30/2011 03:22 PM, Sasha Levin wrote:
> >
> >  Oh, that's neat. Btw., i think there should be an option to actually
> >  *force* blatant overcommit: if i typo the option i'd like the tool to
> >  tell me that i'm probably stupid and refuse to run with that (and
> >  suggest the force-overcommit option), instead of trying to swap itself 
> >  to death!
>
> How much is a 'blatant' overcommit? KVM easily handles 32x the host's RAM.
>

That's close to the limit.  Most of the memory would be taken by the 
struct page array, which is about guest_ram/64 (one 64-byte struct page 
per 4KB guest page).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:23                                 ` Avi Kivity
@ 2011-05-30 12:30                                   ` Pekka Enberg
  2011-05-30 12:32                                     ` Avi Kivity
  2011-05-30 14:10                                   ` Ingo Molnar
  1 sibling, 1 reply; 72+ messages in thread
From: Pekka Enberg @ 2011-05-30 12:30 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Ingo Molnar, Sasha Levin, Takuya Yoshikawa, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On Mon, May 30, 2011 at 3:23 PM, Avi Kivity <avi@redhat.com> wrote:
> IIRC there never was a memset() of all RAM, at least since kvm started
> booting Linux.  Windows has a zeroing thread which causes all of RAM to be
> committed shortly after boot, though.

Yup, I haven't heard of such a thing either. AFAIK, it's all done lazily
by the page allocator.

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:30                                   ` Pekka Enberg
@ 2011-05-30 12:32                                     ` Avi Kivity
  0 siblings, 0 replies; 72+ messages in thread
From: Avi Kivity @ 2011-05-30 12:32 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Ingo Molnar, Sasha Levin, Takuya Yoshikawa, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On 05/30/2011 03:30 PM, Pekka Enberg wrote:
> On Mon, May 30, 2011 at 3:23 PM, Avi Kivity<avi@redhat.com>  wrote:
> >  IIRC there never was a memset() of all RAM, at least since kvm started
> >  booting Linux.  Windows has a zeroing thread which causes all of RAM to be
> >  committed shortly after boot, though.
>
> Yup, I haven't heard of such thing either. AFAIK, it's all done lazily
> by the page allocator.

Note that in these days of dma engines a zeroing thread makes some 
sense.  During a kernel build I see quite a bit of time spent in 
clear_page_c() or something.  It would be nice to have the dma engine 
prepare some clear pages so we could skip it.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:04                           ` Avi Kivity
@ 2011-05-30 12:36                             ` Ingo Molnar
  2011-05-30 12:44                               ` Avi Kivity
                                                 ` (3 more replies)
  0 siblings, 4 replies; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30 12:36 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Takuya Yoshikawa, Sasha Levin, Pekka Enberg, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa


* Avi Kivity <avi@redhat.com> wrote:

> On 05/30/2011 02:49 PM, Ingo Molnar wrote:
> >* Avi Kivity<avi@redhat.com>  wrote:
> >
> >>  On 05/30/2011 02:26 PM, Takuya Yoshikawa wrote:
> >>  >>   >
> >>  >>   qemu also allows having more VCPUs than cores.
> >>  >
> >>  >I have to check again, then :) Thank you!
> >>  >I will try both with many VCPUs.
> >>
> >>  Note, with cpu overcommit the results are going to be bad.
> >
> >And that is good: if pushed hard enough it will trigger exciting (or
> >obscure) bugs in the guest kernel much faster than if there's no
> >overcommit, so it's rather useful for testing. (We had that
> >surprising experience with -rt.)
> >
> >Also, such simulation would be very obviously useful if you get
> >bugreports about 1024 or 4096 CPUs, like i do sometimes! :-) [*]
> 
> I'll be surprised if 1024 cpus actually boot on a reasonable 
> machine.  Without PLE support, any busy wait (like 
> smp_call_function_single()) turns into a delay the length scheduler 
> time slice (or CFS's unfairness measure, I forget how it's called) 
> - 3 or 4 orders of magnitude larger.  Even with PLE, it's 
> significantly slower, plus 1-2 orders of magnitude loss from the 
> overcommit itself.

You are probably right about 1024 CPUs.

Right now i can produce something similar to it: 42 vcpus on a single 
CPU:

	$ taskset 1 kvm run --cpus 42

And that hangs early on during bootup, around:

[    0.236000] Disabled fast string operations
[    0.242000]  #4
[    0.270000] Disabled fast string operations
[    0.275000]  #5
[    0.317000] Disabled fast string operations
[    0.322000]  #6
[    0.352000] Disabled fast string operations
[    0.358000]  #7
[    0.414000] Disabled fast string operations

The threads seem to be livelocked:

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND                                    
22227 mingo     20   0  471g  95m  904 R 12.9  0.8   0:06.39 kvm                                        
22230 mingo     20   0  471g  95m  904 R 12.9  0.8   0:06.36 kvm                                        
22226 mingo     20   0  471g  95m  904 R 11.9  0.8   0:07.04 kvm                                        
22228 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.38 kvm                                        
22229 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.37 kvm                                        
22231 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.37 kvm                                        
22232 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.36 kvm                                        
22233 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.33 kvm                                        
    7 root      -2  19     0    0    0 S  2.0  0.0   1:12.53 rcuc0             

with no apparent progress being made.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:36                             ` Ingo Molnar
@ 2011-05-30 12:44                               ` Avi Kivity
  2011-05-30 12:46                                 ` Pekka Enberg
  2011-05-30 13:05                               ` Sasha Levin
                                                 ` (2 subsequent siblings)
  3 siblings, 1 reply; 72+ messages in thread
From: Avi Kivity @ 2011-05-30 12:44 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Takuya Yoshikawa, Sasha Levin, Pekka Enberg, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On 05/30/2011 03:36 PM, Ingo Molnar wrote:
> You are probably right about 1024 CPUs.
>
> Right now i can produce something similar to it: 42 vcpus on a single
> CPU:
>
> 	$ taskset 1 kvm run --cpus 42
>
> And that hangs early on during bootup, around:
>
> [    0.236000] Disabled fast string operations
> [    0.242000]  #4
> [    0.270000] Disabled fast string operations
> [    0.275000]  #5
> [    0.317000] Disabled fast string operations
> [    0.322000]  #6
> [    0.352000] Disabled fast string operations
> [    0.358000]  #7
> [    0.414000] Disabled fast string operations
>
> The threads seem to be livelocked:
>
>    PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
> 22227 mingo     20   0  471g  95m  904 R 12.9  0.8   0:06.39 kvm
> 22230 mingo     20   0  471g  95m  904 R 12.9  0.8   0:06.36 kvm
> 22226 mingo     20   0  471g  95m  904 R 11.9  0.8   0:07.04 kvm
> 22228 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.38 kvm
> 22229 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.37 kvm
> 22231 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.37 kvm
> 22232 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.36 kvm
> 22233 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.33 kvm
>      7 root      -2  19     0    0    0 S  2.0  0.0   1:12.53 rcuc0
>
> with no apparent progress being made.
>

Well, strangely, 42-on-1 booted for me in qemu, and only got 
ridiculously slow in initrd.  Three minutes and it's still booting.

I don't really see how tools/kvm could cause this slowness, though.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:44                               ` Avi Kivity
@ 2011-05-30 12:46                                 ` Pekka Enberg
  2011-05-30 12:48                                   ` Avi Kivity
  0 siblings, 1 reply; 72+ messages in thread
From: Pekka Enberg @ 2011-05-30 12:46 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Ingo Molnar, Takuya Yoshikawa, Sasha Levin, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On Mon, May 30, 2011 at 3:44 PM, Avi Kivity <avi@redhat.com> wrote:
> Well, strangely, 42-on-1 booted for me in qemu, and only got ridiculously
> slow in initrd.  Three minutes and it's still booting.
>
> I don't really see how tools/kvm could cause this slowness, though.

We have the SIGALRM thing for the serial console that Qemu probably doesn't have?

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:46                                 ` Pekka Enberg
@ 2011-05-30 12:48                                   ` Avi Kivity
  0 siblings, 0 replies; 72+ messages in thread
From: Avi Kivity @ 2011-05-30 12:48 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Ingo Molnar, Takuya Yoshikawa, Sasha Levin, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On 05/30/2011 03:46 PM, Pekka Enberg wrote:
> On Mon, May 30, 2011 at 3:44 PM, Avi Kivity<avi@redhat.com>  wrote:
> >  Well, strangely, 42-on-1 booted for me in qemu, and only got ridiculously
> >  slow in initrd.  Three minutes and it's still booting.
> >
> >  I don't really see how tools/kvm could cause this slowness, though.
>
> We have the SIGALRM thing for serial console that Qemu probably doesn't have?

qemu has tons of timers.  We don't let signals get delivered though, we 
queue them.  I doubt that's the issue, perf will tell.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:36                             ` Ingo Molnar
  2011-05-30 12:44                               ` Avi Kivity
@ 2011-05-30 13:05                               ` Sasha Levin
  2011-06-03  7:27                               ` Sasha Levin
  2011-06-05 12:12                               ` Avi Kivity
  3 siblings, 0 replies; 72+ messages in thread
From: Sasha Levin @ 2011-05-30 13:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Avi Kivity, Takuya Yoshikawa, Pekka Enberg, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On Mon, 2011-05-30 at 14:36 +0200, Ingo Molnar wrote:
> * Avi Kivity <avi@redhat.com> wrote:
> 
> > On 05/30/2011 02:49 PM, Ingo Molnar wrote:
> > >* Avi Kivity<avi@redhat.com>  wrote:
> > >
> > >>  On 05/30/2011 02:26 PM, Takuya Yoshikawa wrote:
> > >>  >>   >
> > >>  >>   qemu also allows having more VCPUs than cores.
> > >>  >
> > >>  >I have to check again, then :) Thank you!
> > >>  >I will try both with many VCPUs.
> > >>
> > >>  Note, with cpu overcommit the results are going to be bad.
> > >
> > >And that is good: if pushed hard enough it will trigger exciting (or
> > >obscure) bugs in the guest kernel much faster than if there's no
> > >overcommit, so it's rather useful for testing. (We had that
> > >surprising experience with -rt.)
> > >
> > >Also, such simulation would be very obviously useful if you get
> > >bugreports about 1024 or 4096 CPUs, like i do sometimes! :-) [*]
> > 
> > I'll be surprised if 1024 cpus actually boot on a reasonable 
> > machine.  Without PLE support, any busy wait (like 
> > smp_call_function_single()) turns into a delay the length scheduler 
> > time slice (or CFS's unfairness measure, I forget how it's called) 
> > - 3 or 4 orders of magnitude larger.  Even with PLE, it's 
> > significantly slower, plus 1-2 orders of magnitude loss from the 
> > overcommit itself.
> 
> You are probably right about 1024 CPUs.
> 
> Right now i can produce something similar to it: 42 vcpus on a single 
> CPU:
> 
> 	$ taskset 1 kvm run --cpus 42
> 
> And that hangs early on during bootup, around:
> 
> [    0.236000] Disabled fast string operations
> [    0.242000]  #4
> [    0.270000] Disabled fast string operations
> [    0.275000]  #5
> [    0.317000] Disabled fast string operations
> [    0.322000]  #6
> [    0.352000] Disabled fast string operations
> [    0.358000]  #7
> [    0.414000] Disabled fast string operations
> 
> The threads seem to be livelocked:
> 
>   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND                                    
> 22227 mingo     20   0  471g  95m  904 R 12.9  0.8   0:06.39 kvm                                        
> 22230 mingo     20   0  471g  95m  904 R 12.9  0.8   0:06.36 kvm                                        
> 22226 mingo     20   0  471g  95m  904 R 11.9  0.8   0:07.04 kvm                                        
> 22228 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.38 kvm                                        
> 22229 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.37 kvm                                        
> 22231 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.37 kvm                                        
> 22232 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.36 kvm                                        
> 22233 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.33 kvm                                        
>     7 root      -2  19     0    0    0 S  2.0  0.0   1:12.53 rcuc0             
> 
> with no apparent progress being made.

I've noticed the same issue when using a 2.6.39 kernel. Try doing the
same with the 2.6.37 kernel we have in the tree.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:23                                 ` Avi Kivity
  2011-05-30 12:30                                   ` Pekka Enberg
@ 2011-05-30 14:10                                   ` Ingo Molnar
  2011-05-30 14:30                                     ` Avi Kivity
  1 sibling, 1 reply; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30 14:10 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Pekka Enberg, Takuya Yoshikawa, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa


* Avi Kivity <avi@redhat.com> wrote:

> On 05/30/2011 03:20 PM, Ingo Molnar wrote:
> >How does this work in practice - i thought we memset() all of RAM
> >during guest kernel bootup. That might have changed with the memblock
> >allocator ... So the guest kernel does not touch all of that 64 GB of
> >RAM, so your box won't OOM straight away?
> 
> IIRC there never was a memset() of all RAM, at least since kvm 
> started booting Linux. [...]

Hm, bootmem used to do a memset().

> [...]  Windows has a zeroing thread which causes all of RAM to be 
> committed shortly after boot, though.

heh, maybe they read lkml and copied my ancient idea:

  http://people.redhat.com/mingo/clearpage-patches/clearpage-2.3.18-J1

An earlier version had a 'zerod' (page zeroing kernel thread).

This was one of my more stupid ideas btw.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 11:49                         ` Ingo Molnar
  2011-05-30 11:55                           ` Pekka Enberg
  2011-05-30 12:04                           ` Avi Kivity
@ 2011-05-30 14:16                           ` Takuya Yoshikawa
  2 siblings, 0 replies; 72+ messages in thread
From: Takuya Yoshikawa @ 2011-05-30 14:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Avi Kivity, Takuya Yoshikawa, Sasha Levin, Pekka Enberg, kvm,
	asias.hejun, gorcunov, prasadjoshi124, Paul E. McKenney

On Mon, 30 May 2011 13:49:49 +0200
Ingo Molnar <mingo@elte.hu> wrote:

> > Note, with cpu overcommit the results are going to be bad.
> 
> And that is good: if pushed hard enough it will trigger exciting (or 
> obscure) bugs in the guest kernel much faster than if there's no 
> overcommit, so it's rather useful for testing. (We had that 
> surprising experience with -rt.)
> 
> Also, such simulation would be very obviously useful if you get 
> bugreports about 1024 or 4096 CPUs, like i do sometimes! :-) [*]
> 

Yes, that is exactly what I wanted to see.
A 16-core server is the best I can use now, and usually I only use a
4-core desktop!

  Takuya

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 14:10                                   ` Ingo Molnar
@ 2011-05-30 14:30                                     ` Avi Kivity
  2011-05-30 14:43                                       ` Ingo Molnar
  0 siblings, 1 reply; 72+ messages in thread
From: Avi Kivity @ 2011-05-30 14:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Sasha Levin, Pekka Enberg, Takuya Yoshikawa, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On 05/30/2011 05:10 PM, Ingo Molnar wrote:
> >  [...]  Windows has a zeroing thread which causes all of RAM to be
> >  committed shortly after boot, though.
>
> heh, maybe they read lkml and copied my ancient idea:
>
>    http://people.redhat.com/mingo/clearpage-patches/clearpage-2.3.18-J1
>
> An earlier version had a 'zerod' (page zeroing kernel thread).
>
> This was one of my more stupid ideas btw.
>

I think that with a dma engine it makes sense.  We've got an extra 
resource, why not utilize it in the background?  Some workloads generate 
a lot of demand for zero pages.

I agree that using the cpu to clear memory is not a good idea, it just 
causes cache pollution.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 14:30                                     ` Avi Kivity
@ 2011-05-30 14:43                                       ` Ingo Molnar
  2011-05-30 14:50                                         ` Avi Kivity
  0 siblings, 1 reply; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30 14:43 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Pekka Enberg, Takuya Yoshikawa, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa


* Avi Kivity <avi@redhat.com> wrote:

> On 05/30/2011 05:10 PM, Ingo Molnar wrote:
> >>  [...]  Windows has a zeroing thread which causes all of RAM to be
> >>  committed shortly after boot, though.
> >
> > heh, maybe they read lkml and copied my ancient idea:
> >
> >   http://people.redhat.com/mingo/clearpage-patches/clearpage-2.3.18-J1
> >
> > An earlier version had a 'zerod' (page zeroing kernel thread).
> >
> > This was one of my more stupid ideas btw.
> >
> 
> I think that with a dma engine it makes sense.  We've got an extra 
> resource, why not utilize it in the background?  Some workloads 
> generate a lot of demand for zero pages.

It was one of the early ideas to use DMA engines to clear memory 
slowly in the background.

> I agree that using the cpu to clear memory is not a good idea, it 
> just causes cache pollution.

Yeah, but even cache-neutral clearing (either driven from the CPU 
from the idle thread or by a DMA engine) is not a particularly good 
idea, because it uses up a finite resource: memory bandwidth. Can we 
create 'idle' DMA transactions - ones that never get in the way of 
real DMA transactions?

Also, a profile of a typical kernel build shows:

     0.69% cc1 [kernel.kallsyms] [k] clear_page_c
     0.49% cc1 [kernel.kallsyms] [k] page_fault

So while we could improve it, the question is, can we do this without 
accidentally slowing things down by more than 0.69%? And kernel 
builds are a pretty clear_page_c() intense workload.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 14:43                                       ` Ingo Molnar
@ 2011-05-30 14:50                                         ` Avi Kivity
  2011-05-30 19:32                                           ` Ingo Molnar
  0 siblings, 1 reply; 72+ messages in thread
From: Avi Kivity @ 2011-05-30 14:50 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Sasha Levin, Pekka Enberg, Takuya Yoshikawa, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On 05/30/2011 05:43 PM, Ingo Molnar wrote:
> >  I agree that using the cpu to clear memory is not a good idea, it
> >  just causes cache pollution.
>
> Yeah, but even cache-neutral clearing (either driven from the CPU
> from the idle thread or by a DMA engine) is not a particularly good
> idea, because it uses up a finite resource: memory bandwidth.

I think that on modern machines it's not such an issue, at least 
compared to cpu time.

> Can we
> create 'idle' DMA transactions - ones that never get in the way of
> real DMA transactions?

Not to my knowledge.

> Also, a profile of a typical kernel build shows:
>
>       0.69% cc1 [kernel.kallsyms] [k] clear_page_c
>       0.49% cc1 [kernel.kallsyms] [k] page_fault
>
> So while we could improve it, the question is, can we do this without
> accidentally slowing things down by more than 0.69%? And kernel
> builds are a pretty clear_page_c() intense workload.

I usually see much higher clear_page_c, but I'm using a pretty old 
system for most of my testing.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 14:50                                         ` Avi Kivity
@ 2011-05-30 19:32                                           ` Ingo Molnar
  0 siblings, 0 replies; 72+ messages in thread
From: Ingo Molnar @ 2011-05-30 19:32 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Pekka Enberg, Takuya Yoshikawa, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa


* Avi Kivity <avi@redhat.com> wrote:

> > Also, a profile of a typical kernel build shows:
> >
> >      0.69% cc1 [kernel.kallsyms] [k] clear_page_c
> >      0.49% cc1 [kernel.kallsyms] [k] page_fault
> >
> > So while we could improve it, the question is, can we do this without
> > accidentally slowing things down by more than 0.69%? And kernel
> > builds are a pretty clear_page_c() intense workload.
> 
> I usually see much higher clear_page_c, but I'm using a pretty old 
> system for most of my testing.

Well, clear_page_c() is still the most expensive kernel function.

I'd suggest using a modern system *and* using precise PEBS profiling 
via perf record -e cycles:pp - otherwise it's easy to see profiling 
artifacts, especially in and around short functions like 
clear_page_c().

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:36                             ` Ingo Molnar
  2011-05-30 12:44                               ` Avi Kivity
  2011-05-30 13:05                               ` Sasha Levin
@ 2011-06-03  7:27                               ` Sasha Levin
  2011-06-03  7:34                                 ` Ingo Molnar
  2011-06-05 12:12                               ` Avi Kivity
  3 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-06-03  7:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Avi Kivity, Takuya Yoshikawa, Pekka Enberg, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On Mon, 2011-05-30 at 14:36 +0200, Ingo Molnar wrote:
> Right now i can produce something similar to it: 42 vcpus on a single 
> CPU:
> 
> 	$ taskset 1 kvm run --cpus 42
> 
> And that hangs early on during bootup, around:
> 
> [    0.236000] Disabled fast string operations
> [    0.242000]  #4
> [    0.270000] Disabled fast string operations
> [    0.275000]  #5
> [    0.317000] Disabled fast string operations
> [    0.322000]  #6
> [    0.352000] Disabled fast string operations
> [    0.358000]  #7
> [    0.414000] Disabled fast string operations
> 
> The threads seem to be livelocked:
> 
>   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND                                    
> 22227 mingo     20   0  471g  95m  904 R 12.9  0.8   0:06.39 kvm                                        
> 22230 mingo     20   0  471g  95m  904 R 12.9  0.8   0:06.36 kvm                                        
> 22226 mingo     20   0  471g  95m  904 R 11.9  0.8   0:07.04 kvm                                        
> 22228 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.38 kvm                                        
> 22229 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.37 kvm                                        
> 22231 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.37 kvm                                        
> 22232 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.36 kvm                                        
> 22233 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.33 kvm                                        
>     7 root      -2  19     0    0    0 S  2.0  0.0   1:12.53 rcuc0             
> 
> with no apparent progress being made.

Since it's something that worked in 2.6.37, I've looked into it to find
what might have caused this issue.

I've bisected guest kernels and found that the problem starts with:

a26ac2455ffcf3be5c6ef92bc6df7182700f2114 is the first bad commit
commit a26ac2455ffcf3be5c6ef92bc6df7182700f2114
Author: Paul E. McKenney <paul.mckenney@linaro.org>
Date:   Wed Jan 12 14:10:23 2011 -0800

    rcu: move TREE_RCU from softirq to kthread

Ingo, could you confirm that the problem goes away for you when you use
an earlier commit?
-- 

Sasha.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-03  7:27                               ` Sasha Levin
@ 2011-06-03  7:34                                 ` Ingo Molnar
  2011-06-03  7:54                                   ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Ingo Molnar @ 2011-06-03  7:34 UTC (permalink / raw)
  To: Sasha Levin, Yinghai Lu
  Cc: Avi Kivity, Takuya Yoshikawa, Pekka Enberg, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa


* Sasha Levin <levinsasha928@gmail.com> wrote:

> > with no apparent progress being made.
> 
> Since it's something that worked in 2.6.37, I've looked into it to 
> find what might have caused this issue.
> 
> I've bisected guest kernels and found that the problem starts with:
> 
> a26ac2455ffcf3be5c6ef92bc6df7182700f2114 is the first bad commit
> commit a26ac2455ffcf3be5c6ef92bc6df7182700f2114
> Author: Paul E. McKenney <paul.mckenney@linaro.org>
> Date:   Wed Jan 12 14:10:23 2011 -0800
> 
>     rcu: move TREE_RCU from softirq to kthread
> 
> Ingo, could you confirm that the problem goes away for you when you 
> use an earlier commit?

testing will have to wait, but there's a recent upstream fix:

  d72bce0e67e8: rcu: Cure load woes

That *might* perhaps address this problem too.

If not then this appears to be some sort of RCU related livelock with 
brutally overcommitted vcpus. On native this would show up too, in a 
less drastic form, as a spurious bootup delay.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-03  7:34                                 ` Ingo Molnar
@ 2011-06-03  7:54                                   ` Sasha Levin
  2011-06-03 19:31                                     ` Paul E. McKenney
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-06-03  7:54 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Yinghai Lu, Avi Kivity, Takuya Yoshikawa, Pekka Enberg, kvm,
	asias.hejun, gorcunov, prasadjoshi124, Paul E. McKenney,
	takuya.yoshikawa

On Fri, 2011-06-03 at 09:34 +0200, Ingo Molnar wrote:
> * Sasha Levin <levinsasha928@gmail.com> wrote:
> 
> > > with no apparent progress being made.
> > 
> > Since it's something that worked in 2.6.37, I've looked into it to 
> > find what might have caused this issue.
> > 
> > I've bisected guest kernels and found that the problem starts with:
> > 
> > a26ac2455ffcf3be5c6ef92bc6df7182700f2114 is the first bad commit
> > commit a26ac2455ffcf3be5c6ef92bc6df7182700f2114
> > Author: Paul E. McKenney <paul.mckenney@linaro.org>
> > Date:   Wed Jan 12 14:10:23 2011 -0800
> > 
> >     rcu: move TREE_RCU from softirq to kthread
> > 
> > Ingo, could you confirm that the problem goes away for you when you 
> > use an earlier commit?
> 
> testing will have to wait, but there's a recent upstream fix:
> 
>   d72bce0e67e8: rcu: Cure load woes
> 
> That *might* perhaps address this problem too.
> 
I've re-tested with Linus's current git, the problem is still there.

> If not then this appears to be some sort of RCU related livelock with 
> brutally overcommitted vcpus. On native this would show up too, in a 
> less drastic form, as a spurious bootup delay.

I don't think it was overcommitted by *that* much. With that commit it
usually hangs at 20-40 vcpus, while without it I can go up to 255.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-03  7:54                                   ` Sasha Levin
@ 2011-06-03 19:31                                     ` Paul E. McKenney
  2011-06-03 19:56                                       ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Paul E. McKenney @ 2011-06-03 19:31 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

On Fri, Jun 03, 2011 at 10:54:19AM +0300, Sasha Levin wrote:
> On Fri, 2011-06-03 at 09:34 +0200, Ingo Molnar wrote:
> > * Sasha Levin <levinsasha928@gmail.com> wrote:
> > 
> > > > with no apparent progress being made.
> > > 
> > > Since it's something that worked in 2.6.37, I've looked into it to 
> > > find what might have caused this issue.
> > > 
> > > I've bisected guest kernels and found that the problem starts with:
> > > 
> > > a26ac2455ffcf3be5c6ef92bc6df7182700f2114 is the first bad commit
> > > commit a26ac2455ffcf3be5c6ef92bc6df7182700f2114
> > > Author: Paul E. McKenney <paul.mckenney@linaro.org>
> > > Date:   Wed Jan 12 14:10:23 2011 -0800
> > > 
> > >     rcu: move TREE_RCU from softirq to kthread
> > > 
> > > Ingo, could you confirm that the problem goes away for you when you 
> > > use an earlier commit?
> > 
> > testing will have to wait, but there's a recent upstream fix:
> > 
> >   d72bce0e67e8: rcu: Cure load woes
> > 
> > That *might* perhaps address this problem too.
> > 
> I've re-tested with Linus's current git, the problem is still there.
> 
> > If not then this appears to be some sort of RCU related livelock with 
> > brutally overcommitted vcpus. On native this would show up too, in a 
> > less drastic form, as a spurious bootup delay.
> 
> I don't think it was overcommitted by *that* much. With that commit it
> usually hangs at 20-40 vcpus, while without it I can go up to 255.

Here is a diagnostic patch, untested.  It assumes that your system
has only a few CPUs (maybe 8-16) and that timers are still running.
It dumps out some RCU state if grace periods extend for more than
a few seconds.

To activate it, call rcu_diag_timer_start() from process context.
To stop it, call rcu_diag_timer_stop(), also from process context.

Thoughts?

							Thanx, Paul

------------------------------------------------------------------------

rcu: diagnostic check of kthread state
    
Not-signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 99f9aa7..489ea1b 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -80,6 +80,8 @@ extern void call_rcu_sched(struct rcu_head *head,
 extern void synchronize_sched(void);
 extern void rcu_barrier_bh(void);
 extern void rcu_barrier_sched(void);
+extern void rcu_diag_timer_start(void);
+extern void rcu_diag_timer_stop(void);
 
 static inline void __rcu_read_lock_bh(void)
 {
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 89419ff..bb61574 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -2423,3 +2423,48 @@ void __init rcu_init(void)
 }
 
 #include "rcutree_plugin.h"
+
+/* Diagnostic code for boot-time hangs observed in early 3.0 days. */
+
+static int rcu_diag_timer_must_stop;
+static struct timer_list rcu_diag_timer;
+#define RCU_DIAG_TIMER_PERIOD	(10 * HZ)
+
+static void rcu_diag_timer_handler(unsigned long unused)
+{
+	int cpu;
+
+	if (rcu_diag_timer_must_stop)
+		return;
+
+	if (ULONG_CMP_GE(jiffies,
+			rcu_sched_state.gp_start + RCU_DIAG_TIMER_PERIOD))
+		for_each_online_cpu(cpu) {
+			printk(KERN_ALERT "rcu_diag: rcuc%d %u/%u/%d ",
+			       cpu,
+			       per_cpu(rcu_cpu_kthread_status, cpu),
+			       per_cpu(rcu_cpu_kthread_loops, cpu),
+			       per_cpu(rcu_cpu_has_work, cpu));
+			sched_show_task(current);
+		}
+
+	if (rcu_diag_timer_must_stop)
+		return;
+	mod_timer(&rcu_diag_timer, RCU_DIAG_TIMER_PERIOD + jiffies);
+}
+
+void rcu_diag_timer_start(void)
+{
+	rcu_diag_timer_must_stop = 0;
+	setup_timer(&rcu_diag_timer,
+		    rcu_diag_timer_handler, (unsigned long) NULL);
+	mod_timer(&rcu_diag_timer, RCU_DIAG_TIMER_PERIOD + jiffies);
+}
+EXPORT_SYMBOL_GPL(rcu_diag_timer_start);
+
+void rcu_diag_timer_stop(void)
+{
+	rcu_diag_timer_must_stop = 1;
+	del_timer(&rcu_diag_timer);
+}
+EXPORT_SYMBOL_GPL(rcu_diag_timer_stop);

^ permalink raw reply related	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-03 19:31                                     ` Paul E. McKenney
@ 2011-06-03 19:56                                       ` Sasha Levin
  2011-06-03 20:22                                         ` Paul E. McKenney
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-06-03 19:56 UTC (permalink / raw)
  To: paulmck
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

On Fri, 2011-06-03 at 12:31 -0700, Paul E. McKenney wrote:
> On Fri, Jun 03, 2011 at 10:54:19AM +0300, Sasha Levin wrote:
> > On Fri, 2011-06-03 at 09:34 +0200, Ingo Molnar wrote:
> > > * Sasha Levin <levinsasha928@gmail.com> wrote:
> > > 
> > > > > with no apparent progress being made.
> > > > 
> > > > Since it's something that worked in 2.6.37, I've looked into it to 
> > > > find what might have caused this issue.
> > > > 
> > > > I've bisected guest kernels and found that the problem starts with:
> > > > 
> > > > a26ac2455ffcf3be5c6ef92bc6df7182700f2114 is the first bad commit
> > > > commit a26ac2455ffcf3be5c6ef92bc6df7182700f2114
> > > > Author: Paul E. McKenney <paul.mckenney@linaro.org>
> > > > Date:   Wed Jan 12 14:10:23 2011 -0800
> > > > 
> > > >     rcu: move TREE_RCU from softirq to kthread
> > > > 
> > > > Ingo, could you confirm that the problem goes away for you when you 
> > > > use an earlier commit?
> > > 
> > > testing will have to wait, but there's a recent upstream fix:
> > > 
> > >   d72bce0e67e8: rcu: Cure load woes
> > > 
> > > That *might* perhaps address this problem too.
> > > 
> > I've re-tested with Linus's current git, the problem is still there.
> > 
> > > If not then this appears to be some sort of RCU related livelock with 
> > > brutally overcommitted vcpus. On native this would show up too, in a 
> > > less drastic form, as a spurious bootup delay.
> > 
> > I don't think it was overcommitted by *that* much. With that commit it
> > usually hangs at 20-40 vcpus, while without it I can go up to 255.
> 
> Here is a diagnostic patch, untested.  It assumes that your system
> has only a few CPUs (maybe 8-16) and that timers are still running.
> It dumps out some RCU state if grace periods extend for more than
> a few seconds.
> 
> To activate it, call rcu_diag_timer_start() from process context.
> To stop it, call rcu_diag_timer_stop(), also from process context.

Since the hang happens in the guest kernel very early during boot, I can't
call rcu_diag_timer_start(). What would be a good place to put the
_start() code instead?

-- 

Sasha.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-03 19:56                                       ` Sasha Levin
@ 2011-06-03 20:22                                         ` Paul E. McKenney
  2011-06-03 21:03                                           ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Paul E. McKenney @ 2011-06-03 20:22 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

On Fri, Jun 03, 2011 at 10:56:20PM +0300, Sasha Levin wrote:
> [...]
> 
> Since the hang happens in the guest kernel very early during boot, I can't
> call rcu_diag_timer_start(). What would be a good place to put the
> _start() call instead?

Assuming that the failure occurs in the host OS rather than in the guest
OS, I suggest placing rcu_diag_timer_start() in the host code that starts
up the guest.

On the other hand, if the failure is occurring in the guest OS, then
I suggest placing the call to rcu_diag_timer_start() just after timer
initialization -- that is, assuming that interrupts are enabled at the
time of the failure.  If interrupts are not yet enabled at the time of
the failure, color me clueless.
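
For instance, a minimal sketch of the guest-side placement (assuming the
diagnostic patch is applied to the guest tree; the existing body of
init_timers() in kernel/timer.c is elided):

	void __init init_timers(void)
	{
		/* ... existing timer setup ... */
		open_softirq(TIMER_SOFTIRQ, run_timer_softirq);

		/* interrupts/timers are now set up, so start the RCU
		 * grace-period diagnostics */
		rcu_diag_timer_start();
	}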

							Thanx, Paul

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-03 20:22                                         ` Paul E. McKenney
@ 2011-06-03 21:03                                           ` Sasha Levin
  2011-06-03 21:20                                             ` Paul E. McKenney
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-06-03 21:03 UTC (permalink / raw)
  To: paulmck
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

On Fri, 2011-06-03 at 13:22 -0700, Paul E. McKenney wrote:
> [...]
> 
> Assuming that the failure occurs in the host OS rather than in the guest
> OS, I suggest placing rcu_diag_timer_start() in the host code that starts
> up the guest.
> 
> On the other hand, if the failure is occurring in the guest OS, then
> I suggest placing the call to rcu_diag_timer_start() just after timer
> initialization -- that is, assuming that interrupts are enabled at the
> time of the failure.  If interrupts are not yet enabled at the time of
> the failure, color me clueless.

It happens in the guest OS; the bisection was done on a guest kernel.

I've placed the rcu debug _start() right at the end of init_timers(),
which runs before the hang, but I'm not seeing any output from the
debug function.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-03 21:03                                           ` Sasha Levin
@ 2011-06-03 21:20                                             ` Paul E. McKenney
  2011-06-03 22:54                                               ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Paul E. McKenney @ 2011-06-03 21:20 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

On Sat, Jun 04, 2011 at 12:03:59AM +0300, Sasha Levin wrote:
> [...]
> 
> It happens in the guest OS; the bisection was done on a guest kernel.
> 
> I've placed the rcu debug _start() right at the end of init_timers(),
> which runs before the hang, but I'm not seeing any output from the
> debug function.

Sounds like the hang is happening before timers start?  Are scheduling
clock interrupts happening in the guest, as in scheduler_tick()?  If so,
I could just as easily put the debug there.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-03 21:20                                             ` Paul E. McKenney
@ 2011-06-03 22:54                                               ` Sasha Levin
  2011-06-03 23:05                                                 ` Paul E. McKenney
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-06-03 22:54 UTC (permalink / raw)
  To: paulmck
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

On Fri, 2011-06-03 at 14:20 -0700, Paul E. McKenney wrote:
> [...]
> 
> Sounds like the hang is happening before timers start?  Are scheduling
> clock interrupts happening in the guest, as in scheduler_tick()?  If so,
> I could just as easily put the debug there.

Yes, scheduler_tick() is running before the hang.
Should I just call the handler function directly from there?

-- 

Sasha.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-03 22:54                                               ` Sasha Levin
@ 2011-06-03 23:05                                                 ` Paul E. McKenney
  2011-06-04  6:26                                                   ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Paul E. McKenney @ 2011-06-03 23:05 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

On Sat, Jun 04, 2011 at 01:54:45AM +0300, Sasha Levin wrote:
> [...]
> 
> Yes, scheduler_tick() is running before the hang.
> Should I just call the handler function directly from there?

I suggest calling it no more than once every few seconds, but yes.
(Otherwise you will get a very full dmesg.)
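
Something along these lines, as a sketch only -- rcu_diag_handler() here
is a stand-in for whatever the diagnostic patch actually names its timer
handler:

	void scheduler_tick(void)
	{
		/* shared across CPUs without locking: racy, but good
		 * enough for a diagnostic rate limit */
		static unsigned long next_diag;

		if (time_after(jiffies, next_diag)) {
			next_diag = jiffies + 5 * HZ;	/* at most once per ~5s */
			rcu_diag_handler(0);	/* timer handlers take an unsigned long */
		}

		/* ... existing scheduler_tick() body ... */
	}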

							Thanx, Paul

^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-03 23:05                                                 ` Paul E. McKenney
@ 2011-06-04  6:26                                                   ` Sasha Levin
  2011-06-04 16:30                                                     ` Paul E. McKenney
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-06-04  6:26 UTC (permalink / raw)
  To: paulmck
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

On Fri, 2011-06-03 at 16:05 -0700, Paul E. McKenney wrote:
> [...]
> 
> I suggest calling it no more than once every few seconds, but yes.
> (Otherwise you will get a very full dmesg.)

Here is the last group of printk() output; it basically just repeats
itself once it's stuck:

KVM setup async PF for cpu 19
 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc17 2/46/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc18 2/23/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc19 0/0/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
[    0.527044]  #20
 0000000000000000
Call Trace:
rcu_diag: rcuc3 0/0/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc4 2/358/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc5 2/340/1 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc6 2/321/1 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc7 2/301/1 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc8 2/277/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc9 2/255/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc10 2/226/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc11 2/214/1 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc12 2/193/1 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc13 2/172/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc14 2/146/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc15 2/130/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc16 2/107/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc17 2/86/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc18 2/62/1 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
rcu_diag: rcuc19 2/39/0 
kworker/0:1     R  running task        0     0     14 0x00000008
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:


-- 

Sasha.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-04  6:26                                                   ` Sasha Levin
@ 2011-06-04 16:30                                                     ` Paul E. McKenney
  2011-06-14 22:26                                                       ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Paul E. McKenney @ 2011-06-04 16:30 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

On Sat, Jun 04, 2011 at 09:26:15AM +0300, Sasha Levin wrote:
> [...]
> 
> Here is the last group of printk() output; it basically just repeats
> itself once it's stuck:
> 
> KVM setup async PF for cpu 19
> [...]
> rcu_diag: rcuc5 2/340/1 

OK, this one (RCU's per-CPU kthread for CPU 5) has work to do (this
is the "1" after the final "/").  Does rcuc5 show up later in the
output?

							Thanx, Paul


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-05-30 12:36                             ` Ingo Molnar
                                                 ` (2 preceding siblings ...)
  2011-06-03  7:27                               ` Sasha Levin
@ 2011-06-05 12:12                               ` Avi Kivity
  3 siblings, 0 replies; 72+ messages in thread
From: Avi Kivity @ 2011-06-05 12:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Takuya Yoshikawa, Sasha Levin, Pekka Enberg, kvm, asias.hejun,
	gorcunov, prasadjoshi124, Paul E. McKenney, takuya.yoshikawa

On 05/30/2011 03:36 PM, Ingo Molnar wrote:
> You are probably right about 1024 CPUs.
>
> Right now i can produce something similar to it: 42 vcpus on a single
> CPU:
> 
> 	$ taskset 1 kvm run --cpus 42
> 
> And that hangs early on during bootup, around:
>
> [    0.236000] Disabled fast string operations
> [    0.242000]  #4
> [    0.270000] Disabled fast string operations
> [    0.275000]  #5
> [    0.317000] Disabled fast string operations
> [    0.322000]  #6
> [    0.352000] Disabled fast string operations
> [    0.358000]  #7
> [    0.414000] Disabled fast string operations
>
> The threads seem to be livelocked:
>
>    PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
> 22227 mingo     20   0  471g  95m  904 R 12.9  0.8   0:06.39 kvm
> 22230 mingo     20   0  471g  95m  904 R 12.9  0.8   0:06.36 kvm
> 22226 mingo     20   0  471g  95m  904 R 11.9  0.8   0:07.04 kvm
> 22228 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.38 kvm
> 22229 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.37 kvm
> 22231 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.37 kvm
> 22232 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.36 kvm
> 22233 mingo     20   0  471g  95m  904 R 11.9  0.8   0:06.33 kvm
>      7 root      -2  19     0    0    0 S  2.0  0.0   1:12.53 rcuc0
>
> with no apparent progress being made.
>


Does your host kernel include 8fa2206821953 ("KVM: make guest mode entry 
to be rcu quiescent state")?

I'm guessing it does, and that it doesn't matter anyway (you have a lot 
of ordinary host context switches), but it doesn't hurt to check.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-04 16:30                                                     ` Paul E. McKenney
@ 2011-06-14 22:26                                                       ` Sasha Levin
  2011-06-14 23:42                                                         ` Paul E. McKenney
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-06-14 22:26 UTC (permalink / raw)
  To: paulmck
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

Sorry for the delay on this.

On Sat, 2011-06-04 at 09:30 -0700, Paul E. McKenney wrote:
> [...]
> > rcu_diag: rcuc5 2/340/1 
> 
> OK, this one (RCU's per-CPU kthread for CPU 5) has work to do (this
> is the "1" after the final "/").  Does rcuc5 show up later in the
> output?
> 

The output grinds to a stop; I've pasted the last part of the prints -
there are no more prints at all past that point.


-- 

Sasha.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-14 22:26                                                       ` Sasha Levin
@ 2011-06-14 23:42                                                         ` Paul E. McKenney
  2011-06-15  1:25                                                           ` Sasha Levin
  0 siblings, 1 reply; 72+ messages in thread
From: Paul E. McKenney @ 2011-06-14 23:42 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

On Tue, Jun 14, 2011 at 06:26:01PM -0400, Sasha Levin wrote:
> Sorry for the delay on this.

Actually, you might have had excellent timing on this one.

Could you please try Shaohua Li's patch at:

	https://lkml.org/lkml/2011/6/14/20

This patch makes RCU much less prone to causing massive scheduler lock
contention, which might well be the cause of the problems that you
are seeing.

							Thanx, Paul

> On Sat, 2011-06-04 at 09:30 -0700, Paul E. McKenney wrote:
> > On Sat, Jun 04, 2011 at 09:26:15AM +0300, Sasha Levin wrote:
> > > On Fri, 2011-06-03 at 16:05 -0700, Paul E. McKenney wrote:
> > > > On Sat, Jun 04, 2011 at 01:54:45AM +0300, Sasha Levin wrote:
> > > > > On Fri, 2011-06-03 at 14:20 -0700, Paul E. McKenney wrote:
> > > > > > On Sat, Jun 04, 2011 at 12:03:59AM +0300, Sasha Levin wrote:
> > > > > > > On Fri, 2011-06-03 at 13:22 -0700, Paul E. McKenney wrote:
> > > > > > > > On Fri, Jun 03, 2011 at 10:56:20PM +0300, Sasha Levin wrote:
> > > > > > > > > On Fri, 2011-06-03 at 12:31 -0700, Paul E. McKenney wrote:
> > > > > > > > > > On Fri, Jun 03, 2011 at 10:54:19AM +0300, Sasha Levin wrote:
> > > > > > > > > > > On Fri, 2011-06-03 at 09:34 +0200, Ingo Molnar wrote:
> > > > > > > > > > > > * Sasha Levin <levinsasha928@gmail.com> wrote:
> > > > > > > > > > > > 
> > > > > > > > > > > > > > with no apparent progress being made.
> > > > > > > > > > > > > 
> > > > > > > > > > > > > Since it's something that worked in 2.6.37, I've looked into it to 
> > > > > > > > > > > > > find what might have caused this issue.
> > > > > > > > > > > > > 
> > > > > > > > > > > > > I've bisected guest kernels and found that the problem starts with:
> > > > > > > > > > > > > 
> > > > > > > > > > > > > a26ac2455ffcf3be5c6ef92bc6df7182700f2114 is the first bad commit
> > > > > > > > > > > > > commit a26ac2455ffcf3be5c6ef92bc6df7182700f2114
> > > > > > > > > > > > > Author: Paul E. McKenney <paul.mckenney@linaro.org>
> > > > > > > > > > > > > Date:   Wed Jan 12 14:10:23 2011 -0800
> > > > > > > > > > > > > 
> > > > > > > > > > > > >     rcu: move TREE_RCU from softirq to kthread
> > > > > > > > > > > > > 
> > > > > > > > > > > > > Ingo, could you confirm that the problem goes away for you when you 
> > > > > > > > > > > > > use an earlier commit?
> > > > > > > > > > > > 
> > > > > > > > > > > > testing will have to wait, but there's a recent upstream fix:
> > > > > > > > > > > > 
> > > > > > > > > > > >   d72bce0e67e8: rcu: Cure load woes
> > > > > > > > > > > > 
> > > > > > > > > > > > That *might* perhaps address this problem too.
> > > > > > > > > > > > 
> > > > > > > > > > > I've re-tested with Linus's current git, the problem is still there.
> > > > > > > > > > > 
> > > > > > > > > > > > If not then this appears to be some sort of RCU related livelock with 
> > > > > > > > > > > > brutally overcommitted vcpus. On native this would show up too, in a 
> > > > > > > > > > > > less drastic form, as a spurious bootup delay.
> > > > > > > > > > > 
> > > > > > > > > > > I don't think it was overcommitted by *that* much. With that commit it
> > > > > > > > > > > usually hangs at 20-40 vcpus, while without it I can go up to 255.
> > > > > > > > > > 
> > > > > > > > > > Here is a diagnostic patch, untested.  It assumes that your system
> > > > > > > > > > has only a few CPUs (maybe 8-16) and that timers are still running.
> > > > > > > > > > It dumps out some RCU state if grace periods extend for more than
> > > > > > > > > > a few seconds.
> > > > > > > > > > 
> > > > > > > > > > To activate it, call rcu_diag_timer_start() from process context.
> > > > > > > > > > To stop it, call rcu_diag_timer_stop(), also from process context.
> > > > > > > > > 
> > > > > > > > > Since the hang happens in guest kernel very early during boot, I can't
> > > > > > > > > call rcu_diag_timer_start(). What would be a good place to put the
> > > > > > > > > _start() code instead?
> > > > > > > > 
> > > > > > > > Assuming that the failure occurs in the host OS rather than in the guest
> > > > > > > > OS, I suggest placing rcu_diag_timer_start() in the host code that starts
> > > > > > > > up the guest.
> > > > > > > > 
> > > > > > > > On the other hand, if the failure is occurring in the guest OS, then
> > > > > > > > I suggest placing the call to rcu_diag_timer_start() just after timer
> > > > > > > > initialization -- that is, assuming that interrupts are enabled at the
> > > > > > > > time of the failure.  If interrupts are not yet enabled at the time of
> > > > > > > > the failure, color me clueless.
> > > > > > > 
> > > > > > > It happens in the guest OS, the bisection was done on a guest kernel.
> > > > > > > 
> > > > > > > I've placed the rcu debug _start() right at the end of init_timers(),
> > > > > > > which happens before the hang, but I'm not seeing any output from the
> > > > > > > debug function.
> > > > > > 
> > > > > > Sounds like the hang is happening before timers start?  Are scheduling
> > > > > > clock interrupts happening in the guest, as in scheduler_tick()?  If so,
> > > > > > I could just as easily put the debug there.
> > > > > 
> > > > > Yes, scheduler_tick() is running before the hang.
> > > > > Should I just call the handler function directly from there?
> > > > 
> > > > I suggest calling it no more than once every few seconds, but yes.
> > > > (Otherwise you will get a very full dmesg.)
> > > 
> > > Here is the last group of printk(), basically it just repeats itself
> > > when it's stuck:
> > > 
> > > KVM setup async PF for cpu 19
> > >  0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc17 2/46/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc18 2/23/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc19 0/0/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > [    0.527044]  #20
> > >  0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc3 0/0/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc4 2/358/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc5 2/340/1 
> > 
> > OK, this one (RCU's per-CPU kthread for CPU 5) has work to do (this
> > is the "1" after the final "/").  Does rcuc5 show up later in the
> > output?
> > 
> 
> The output grinds to a halt; I've pasted the last part of the output, and
> there are no more prints at all past this one.
> 
> > 							Thanx, Paul
> > 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc6 2/321/1 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc7 2/301/1 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc8 2/277/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc9 2/255/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc10 2/226/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc11 2/214/1 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc12 2/193/1 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc13 2/172/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc14 2/146/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc15 2/130/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc16 2/107/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc17 2/86/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc18 2/62/1 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > rcu_diag: rcuc19 2/39/0 
> > > kworker/0:1     R  running task        0     0     14 0x00000008
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > >  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> > > Call Trace:
> > > 
> > > 
> > > -- 
> > > 
> > > Sasha.
> > > 
> 
> -- 
> 
> Sasha.
> 

^ permalink raw reply	[flat|nested] 72+ messages in thread
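
For readers reconstructing the experiment above: the diagnostic flow Paul and
Sasha discuss (a periodic dump started and stopped from process context, or a
rate-limited call from scheduler_tick() when the hang precedes timer setup)
would look roughly like the sketch below. Only the rcu_diag_timer_start() and
rcu_diag_timer_stop() names come from the thread; the dump body, the
five-second period, and the tick hook are assumptions, since the actual
diagnostic patch is not reproduced here.

/*
 * Minimal sketch (not the actual patch from this thread): a periodic
 * diagnostic dump with start/stop entry points callable from process
 * context, plus a rate-limited variant that can be driven from
 * scheduler_tick() when the hang occurs before timers are running.
 */
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

#define RCU_DIAG_PERIOD	(5 * HZ)	/* dump every few seconds */

static struct timer_list rcu_diag_timer;

/* Print whatever RCU state is of interest; the real patch dumped the
 * per-CPU rcuc kthread states seen in the logs above. */
static void rcu_diag_print(void)
{
	printk(KERN_INFO "rcu_diag: alive at %lu\n", jiffies);
}

static void rcu_diag_timer_func(unsigned long unused)
{
	rcu_diag_print();
	/* Self-rearm so the dump repeats while the stall lasts. */
	mod_timer(&rcu_diag_timer, jiffies + RCU_DIAG_PERIOD);
}

/* Call from process context once timers are initialized. */
void rcu_diag_timer_start(void)
{
	setup_timer(&rcu_diag_timer, rcu_diag_timer_func, 0);
	mod_timer(&rcu_diag_timer, jiffies + RCU_DIAG_PERIOD);
}

/* Call from process context, after rcu_diag_timer_start(). */
void rcu_diag_timer_stop(void)
{
	del_timer_sync(&rcu_diag_timer);
}

/*
 * Alternative for very early boot: call this from scheduler_tick(),
 * rate-limited per Paul's "no more than once every few seconds"
 * advice so that dmesg is not flooded.  The unsynchronized static is
 * racy across CPUs, which is tolerable for a debug-only dump.
 */
void rcu_diag_tick_hook(void)
{
	static unsigned long next_dump = INITIAL_JIFFIES;

	if (time_after(jiffies, next_dump)) {
		next_dump = jiffies + RCU_DIAG_PERIOD;
		rcu_diag_print();
	}
}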

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-14 23:42                                                         ` Paul E. McKenney
@ 2011-06-15  1:25                                                           ` Sasha Levin
  2011-06-15  4:22                                                             ` Paul E. McKenney
  0 siblings, 1 reply; 72+ messages in thread
From: Sasha Levin @ 2011-06-15  1:25 UTC (permalink / raw)
  To: paulmck
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

On Tue, 2011-06-14 at 16:42 -0700, Paul E. McKenney wrote:
> On Tue, Jun 14, 2011 at 06:26:01PM -0400, Sasha Levin wrote:
> > Sorry for the delay on this.
> 
> Actually, you might have had excellent timing on this one.
> 
> Could you please try Shaohua Li's patch at:
> 
> 	https://lkml.org/lkml/2011/6/14/20
> 
> This patch makes RCU much less prone to causing massive scheduler lock
> contention, which might well be the cause of the problems that you
> are seeing.

This patch solved the hang issue I was seeing. kvm tools now boots 60
vcpus easily with a 3.0 kernel.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 72+ messages in thread

* Re: [PATCH v2 6/8] kvm tools: Add rwlock wrapper
  2011-06-15  1:25                                                           ` Sasha Levin
@ 2011-06-15  4:22                                                             ` Paul E. McKenney
  0 siblings, 0 replies; 72+ messages in thread
From: Paul E. McKenney @ 2011-06-15  4:22 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Ingo Molnar, Yinghai Lu, Avi Kivity, Takuya Yoshikawa,
	Pekka Enberg, kvm, asias.hejun, gorcunov, prasadjoshi124,
	takuya.yoshikawa

On Tue, Jun 14, 2011 at 09:25:22PM -0400, Sasha Levin wrote:
> On Tue, 2011-06-14 at 16:42 -0700, Paul E. McKenney wrote:
> > On Tue, Jun 14, 2011 at 06:26:01PM -0400, Sasha Levin wrote:
> > > Sorry for the delay on this.
> > 
> > Actually, you might have had excellent timing on this one.
> > 
> > Could you please try Shaohua Li's patch at:
> > 
> > 	https://lkml.org/lkml/2011/6/14/20
> > 
> > This patch makes RCU much less prone to causing massive scheduler lock
> > contention, which might well be the cause of the problems that you
> > are seeing.
> 
> This patch solved the hang issue I was seeing. kvm tools now boots 60
> vcpus easily with a 3.0 kernel.

Thank you, Sasha -- I have added your Tested-by.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 72+ messages in thread

end of thread

Thread overview: 72+ messages
2011-05-30  8:30 [PATCH v2 1/8] kvm tools: Use correct value for user signal base Sasha Levin
2011-05-30  8:30 ` [PATCH v2 2/8] kvm tools: Remove wrong global definition of kvm Sasha Levin
2011-05-30  8:38   ` Ingo Molnar
2011-05-30  8:59     ` Sasha Levin
2011-05-30  8:30 ` [PATCH v2 3/8] kvm tools: Allow pausing guests Sasha Levin
2011-05-30  8:39   ` Ingo Molnar
2011-05-30  8:30 ` [PATCH v2 4/8] kvm tools: Pause/resume guest using SIGUSR2 Sasha Levin
2011-05-30  8:41   ` Ingo Molnar
2011-05-30  8:30 ` [PATCH v2 5/8] kvm tools: Add a brlock Sasha Levin
2011-05-30  8:30 ` [PATCH v2 6/8] kvm tools: Add rwlock wrapper Sasha Levin
2011-05-30  8:43   ` Ingo Molnar
2011-05-30  9:29     ` Pekka Enberg
2011-05-30  9:34       ` Sasha Levin
2011-05-30  9:40         ` Pekka Enberg
2011-05-30  9:46           ` Sasha Levin
2011-05-30  9:48             ` Pekka Enberg
2011-05-30  9:54               ` Ingo Molnar
2011-05-30 11:11                 ` Takuya Yoshikawa
2011-05-30 11:12                   ` Sasha Levin
2011-05-30 11:26                     ` Takuya Yoshikawa
2011-05-30 11:39                       ` Avi Kivity
2011-05-30 11:49                         ` Ingo Molnar
2011-05-30 11:55                           ` Pekka Enberg
2011-05-30 11:58                             ` Sasha Levin
2011-05-30 12:20                               ` Ingo Molnar
2011-05-30 12:22                                 ` Sasha Levin
2011-05-30 12:25                                   ` Avi Kivity
2011-05-30 12:23                                 ` Avi Kivity
2011-05-30 12:30                                   ` Pekka Enberg
2011-05-30 12:32                                     ` Avi Kivity
2011-05-30 14:10                                   ` Ingo Molnar
2011-05-30 14:30                                     ` Avi Kivity
2011-05-30 14:43                                       ` Ingo Molnar
2011-05-30 14:50                                         ` Avi Kivity
2011-05-30 19:32                                           ` Ingo Molnar
2011-05-30 12:04                           ` Avi Kivity
2011-05-30 12:36                             ` Ingo Molnar
2011-05-30 12:44                               ` Avi Kivity
2011-05-30 12:46                                 ` Pekka Enberg
2011-05-30 12:48                                   ` Avi Kivity
2011-05-30 13:05                               ` Sasha Levin
2011-06-03  7:27                               ` Sasha Levin
2011-06-03  7:34                                 ` Ingo Molnar
2011-06-03  7:54                                   ` Sasha Levin
2011-06-03 19:31                                     ` Paul E. McKenney
2011-06-03 19:56                                       ` Sasha Levin
2011-06-03 20:22                                         ` Paul E. McKenney
2011-06-03 21:03                                           ` Sasha Levin
2011-06-03 21:20                                             ` Paul E. McKenney
2011-06-03 22:54                                               ` Sasha Levin
2011-06-03 23:05                                                 ` Paul E. McKenney
2011-06-04  6:26                                                   ` Sasha Levin
2011-06-04 16:30                                                     ` Paul E. McKenney
2011-06-14 22:26                                                       ` Sasha Levin
2011-06-14 23:42                                                         ` Paul E. McKenney
2011-06-15  1:25                                                           ` Sasha Levin
2011-06-15  4:22                                                             ` Paul E. McKenney
2011-06-05 12:12                               ` Avi Kivity
2011-05-30 14:16                           ` Takuya Yoshikawa
2011-05-30  9:56             ` Ingo Molnar
2011-05-30 10:05               ` Sasha Levin
2011-05-30 10:13                 ` Ingo Molnar
2011-05-30 10:22                   ` Sasha Levin
2011-05-30 10:30                     ` Ingo Molnar
2011-05-30 10:41                       ` Sasha Levin
2011-05-30  8:30 ` [PATCH v2 7/8] kvm tools: Add debug mode to brlock Sasha Levin
2011-05-30  8:30 ` [PATCH v2 8/8] kvm tools: Use brlock in MMIO and IOPORT Sasha Levin
2011-05-30  8:47   ` Ingo Molnar
2011-05-30  8:56     ` Sasha Levin
2011-05-30  8:35 ` [PATCH v2 1/8] kvm tools: Use correct value for user signal base Ingo Molnar
2011-05-30  8:40   ` Sasha Levin
2011-05-30  8:49     ` Ingo Molnar
