All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-06-28  6:29 ` Bharat Bhushan
  0 siblings, 0 replies; 29+ messages in thread
From: Bharat Bhushan @ 2012-06-28  6:17 UTC (permalink / raw)
  To: kvm-ppc, kvm, agraf; +Cc: Bharat Bhushan

This patch adds the watchdog emulation in KVM. The watchdog
emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
The kernel timer are used for watchdog emulation and emulates
h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU
if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how
it is configured.

Signed-off-by: Liu Yu <yu.liu@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
---
 arch/powerpc/include/asm/kvm_host.h  |    2 +
 arch/powerpc/include/asm/kvm_ppc.h   |    3 +
 arch/powerpc/include/asm/reg_booke.h |    2 +
 arch/powerpc/kvm/44x.c               |    1 +
 arch/powerpc/kvm/booke.c             |  125 ++++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/booke_emulate.c     |    6 ++-
 arch/powerpc/kvm/powerpc.c           |   25 ++++++-
 include/linux/kvm.h                  |    2 +
 8 files changed, 162 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 50ea12f..a9e5aed 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -467,6 +467,7 @@ struct kvm_vcpu_arch {
 	ulong fault_esr;
 	ulong queued_dear;
 	ulong queued_esr;
+	struct timer_list wdt_timer;
 	u32 tlbcfg[4];
 	u32 mmucfg;
 	u32 epr;
@@ -482,6 +483,7 @@ struct kvm_vcpu_arch {
 	u8 osi_needed;
 	u8 osi_enabled;
 	u8 papr_enabled;
+	u8 watchdog_enable;
 	u8 sane;
 	u8 cpu_type;
 	u8 hcall_needed;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0124937..e5cf4b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(unsigned long data);
+extern void kvmppc_watchdog_func(unsigned long data);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 
 /* Core-specific hooks */
@@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                        struct kvm_interrupt *irq);
 extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
                                          struct kvm_interrupt *irq);
+extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   unsigned int op, int *advance);
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2d916c4..7efb3f5 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -538,6 +538,8 @@
 #define FP_2_21		3		/* 2^21 clocks */
 #define TCR_FIE		0x00800000	/* FIT Interrupt Enable */
 #define TCR_ARE		0x00400000	/* Auto Reload Enable */
+#define TCR_GET_FSL_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
+			      (((tcr) & 0x1E0000) >> 15))
 
 /* Bit definitions for the TSR. */
 #define TSR_ENW		0x80000000	/* Enable Next Watchdog */
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 50e7dbc..a213aba 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -28,6 +28,7 @@
 #include <asm/kvm_44x.h>
 #include <asm/kvm_ppc.h>
 
+#include "booke.h"
 #include "44x_tlb.h"
 #include "booke.h"
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097..ad30f75 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
 }
 
+void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
+}
+
+void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
+}
+
 static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
 {
 #ifdef CONFIG_KVM_BOOKE_HV
@@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		msr_mask = MSR_CE | MSR_ME | MSR_DE;
 		int_class = INT_CLASS_NONCRIT;
 		break;
+	case BOOKE_IRQPRIO_WATCHDOG:
 	case BOOKE_IRQPRIO_CRITICAL:
 	case BOOKE_IRQPRIO_DBELL_CRIT:
 		allowed = vcpu->arch.shared->msr & MSR_CE;
@@ -404,12 +415,98 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	return allowed;
 }
 
+/*
+ * The timer system can almost deal with LONG_MAX timeouts, except that
+ * when you get very close to LONG_MAX, the slack added can cause overflow.
+ *
+ * LONG_MAX/2 is a conservative threshold, but it should be adequate for
+ * any realistic use.
+ */
+#define MAX_TIMEOUT (LONG_MAX/2)
+
+/*
+ * Return the number of jiffies until the next timeout.  If the timeout is
+ * longer than the MAX_TIMEOUT, that we return MAX_TIMEOUT instead.
+ */
+static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
+{
+	unsigned long long tb, mask, nr_jiffies = 0;
+	u32 period = TCR_GET_FSL_WP(vcpu->arch.tcr);
+
+	mask = 1ULL << (63 - period);
+	tb = get_tb();
+	if (tb & mask)
+		nr_jiffies += mask;
+
+	nr_jiffies += mask - (tb & (mask - 1));
+
+	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
+		nr_jiffies++;
+
+	return min_t(unsigned long long, nr_jiffies, MAX_TIMEOUT);
+}
+
+static void arm_next_watchdog(struct kvm_vcpu *vcpu)
+{
+	unsigned long nr_jiffies;
+
+	nr_jiffies = watchdog_next_timeout(vcpu);
+	if (nr_jiffies < MAX_TIMEOUT)
+		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
+	else
+		del_timer(&vcpu->arch.wdt_timer);
+}
+
+void kvmppc_watchdog_func(unsigned long data)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+	u32 tsr, new_tsr;
+	int final;
+
+	do {
+		new_tsr = tsr = vcpu->arch.tsr;
+		final = 0;
+
+		/* Time out event */
+		if (tsr & TSR_ENW) {
+			if (tsr & TSR_WIS) {
+				new_tsr = (tsr & ~TCR_WRC_MASK) |
+					  (vcpu->arch.tcr & TCR_WRC_MASK);
+				vcpu->arch.tcr &= ~TCR_WRC_MASK;
+				final = 1;
+			} else {
+				new_tsr = tsr | TSR_WIS;
+			}
+		} else {
+			new_tsr = tsr | TSR_ENW;
+		}
+	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
+
+	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
+		smp_wmb();
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * Avoid getting a storm of timers if the guest sets
+	 * the period very short.  We'll restart it if anything
+	 * changes.
+	 */
+	if (!final)
+		arm_next_watchdog(vcpu);
+}
 static void update_timer_ints(struct kvm_vcpu *vcpu)
 {
 	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
 		kvmppc_core_queue_dec(vcpu);
 	else
 		kvmppc_core_dequeue_dec(vcpu);
+
+	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
+		kvmppc_core_queue_watchdog(vcpu);
+	else
+		kvmppc_core_dequeue_watchdog(vcpu);
 }
 
 static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
@@ -516,6 +613,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
+	if (vcpu->arch.tsr & TCR_WRC_MASK) {
+		kvm_run->exit_reason = KVM_EXIT_WDT;
+		ret = 0;
+		goto out;
+	}
+
 	kvm_guest_enter();
 
 #ifdef CONFIG_PPC_FPU
@@ -977,6 +1080,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
 		}
 	}
+	if (vcpu->arch.tsr & TCR_WRC_MASK) {
+		run->exit_reason = KVM_EXIT_WDT;
+		r = RESUME_HOST | (r & RESUME_FLAG_NV);
+	}
 
 	return r;
 }
@@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
 	}
 
 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
+		u32 old_tsr = vcpu->arch.tsr;
+
 		vcpu->arch.tsr = sregs->u.e.tsr;
+
+		if ((old_tsr ^ vcpu->arch.tsr) &
+		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
+			arm_next_watchdog(vcpu);
+
 		update_timer_ints(vcpu);
 	}
 
@@ -1267,6 +1381,8 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
 void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
 {
 	vcpu->arch.tcr = new_tcr;
+	if (vcpu->arch.watchdog_enable)
+		arm_next_watchdog(vcpu);
 	update_timer_ints(vcpu);
 }
 
@@ -1281,6 +1397,15 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 {
 	clear_bits(tsr_bits, &vcpu->arch.tsr);
+
+	/*
+	 * We may have stopped the watchdog due to
+	 * being stuck on final expiration.
+	 */
+	if (vcpu->arch.watchdog_enable && (tsr_bits & (TSR_ENW |
+			TSR_WIS | TCR_WRC_MASK)))
+		arm_next_watchdog(vcpu);
+
 	update_timer_ints(vcpu);
 }
 
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 12834bb..cc94d42 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -145,7 +145,11 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		kvmppc_clr_tsr_bits(vcpu, spr_val);
 		break;
 	case SPRN_TCR:
-		kvmppc_set_tcr(vcpu, spr_val);
+		/* WRC can only be programmed when WRC=0 */
+		if (TCR_WRC_MASK & vcpu->arch.tcr)
+			spr_val &= ~TCR_WRC_MASK;
+		kvmppc_set_tcr(vcpu,
+		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
 		break;
 
 	case SPRN_DECAR:
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 87f4dc8..35a1ff3 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -38,9 +38,15 @@
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return !(v->arch.shared->msr & MSR_WE) ||
-	       !!(v->arch.pending_exceptions) ||
-	       v->requests;
+	bool ret = !(v->arch.shared->msr & MSR_WE) ||
+		   !!(v->arch.pending_exceptions) ||
+		   v->requests;
+
+#ifdef CONFIG_BOOKE
+	ret = ret || (v->arch.tsr & TCR_WRC_MASK);
+#endif
+
+	return ret;
 }
 
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
@@ -237,6 +243,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_GET_PVINFO:
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB:
+	case KVM_CAP_PPC_WDT:
 #endif
 		r = 1;
 		break;
@@ -393,6 +400,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	kvmppc_mmu_destroy(vcpu);
+#ifdef CONFIG_BOOKE
+	if (vcpu->arch.watchdog_enable)
+		del_timer(&vcpu->arch.wdt_timer);
+#endif
 }
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -637,6 +648,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		vcpu->arch.papr_enabled = true;
 		break;
+#ifdef CONFIG_BOOKE
+	case KVM_CAP_PPC_WDT:
+		r = 0;
+		vcpu->arch.watchdog_enable = true;
+		setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
+		            (unsigned long)vcpu);
+		break;
+#endif
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB: {
 		struct kvm_config_tlb cfg;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2ce09aa..b9fdb52 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -163,6 +163,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_OSI              18
 #define KVM_EXIT_PAPR_HCALL	  19
 #define KVM_EXIT_S390_UCONTROL	  20
+#define KVM_EXIT_WDT              21
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -618,6 +619,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
 #define KVM_CAP_PPC_ALLOC_HTAB 80
+#define KVM_CAP_PPC_WDT	81
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
1.7.0.4



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-06-28  6:29 ` Bharat Bhushan
  0 siblings, 0 replies; 29+ messages in thread
From: Bharat Bhushan @ 2012-06-28  6:29 UTC (permalink / raw)
  To: kvm-ppc, kvm, agraf; +Cc: Bharat Bhushan

This patch adds the watchdog emulation in KVM. The watchdog
emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
The kernel timer are used for watchdog emulation and emulates
h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU
if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how
it is configured.

Signed-off-by: Liu Yu <yu.liu@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
---
 arch/powerpc/include/asm/kvm_host.h  |    2 +
 arch/powerpc/include/asm/kvm_ppc.h   |    3 +
 arch/powerpc/include/asm/reg_booke.h |    2 +
 arch/powerpc/kvm/44x.c               |    1 +
 arch/powerpc/kvm/booke.c             |  125 ++++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/booke_emulate.c     |    6 ++-
 arch/powerpc/kvm/powerpc.c           |   25 ++++++-
 include/linux/kvm.h                  |    2 +
 8 files changed, 162 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 50ea12f..a9e5aed 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -467,6 +467,7 @@ struct kvm_vcpu_arch {
 	ulong fault_esr;
 	ulong queued_dear;
 	ulong queued_esr;
+	struct timer_list wdt_timer;
 	u32 tlbcfg[4];
 	u32 mmucfg;
 	u32 epr;
@@ -482,6 +483,7 @@ struct kvm_vcpu_arch {
 	u8 osi_needed;
 	u8 osi_enabled;
 	u8 papr_enabled;
+	u8 watchdog_enable;
 	u8 sane;
 	u8 cpu_type;
 	u8 hcall_needed;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0124937..e5cf4b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(unsigned long data);
+extern void kvmppc_watchdog_func(unsigned long data);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 
 /* Core-specific hooks */
@@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                        struct kvm_interrupt *irq);
 extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
                                          struct kvm_interrupt *irq);
+extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   unsigned int op, int *advance);
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2d916c4..7efb3f5 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -538,6 +538,8 @@
 #define FP_2_21		3		/* 2^21 clocks */
 #define TCR_FIE		0x00800000	/* FIT Interrupt Enable */
 #define TCR_ARE		0x00400000	/* Auto Reload Enable */
+#define TCR_GET_FSL_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
+			      (((tcr) & 0x1E0000) >> 15))
 
 /* Bit definitions for the TSR. */
 #define TSR_ENW		0x80000000	/* Enable Next Watchdog */
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 50e7dbc..a213aba 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -28,6 +28,7 @@
 #include <asm/kvm_44x.h>
 #include <asm/kvm_ppc.h>
 
+#include "booke.h"
 #include "44x_tlb.h"
 #include "booke.h"
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097..ad30f75 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
 }
 
+void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
+}
+
+void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
+}
+
 static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
 {
 #ifdef CONFIG_KVM_BOOKE_HV
@@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		msr_mask = MSR_CE | MSR_ME | MSR_DE;
 		int_class = INT_CLASS_NONCRIT;
 		break;
+	case BOOKE_IRQPRIO_WATCHDOG:
 	case BOOKE_IRQPRIO_CRITICAL:
 	case BOOKE_IRQPRIO_DBELL_CRIT:
 		allowed = vcpu->arch.shared->msr & MSR_CE;
@@ -404,12 +415,98 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	return allowed;
 }
 
+/*
+ * The timer system can almost deal with LONG_MAX timeouts, except that
+ * when you get very close to LONG_MAX, the slack added can cause overflow.
+ *
+ * LONG_MAX/2 is a conservative threshold, but it should be adequate for
+ * any realistic use.
+ */
+#define MAX_TIMEOUT (LONG_MAX/2)
+
+/*
+ * Return the number of jiffies until the next timeout.  If the timeout is
+ * longer than the MAX_TIMEOUT, that we return MAX_TIMEOUT instead.
+ */
+static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
+{
+	unsigned long long tb, mask, nr_jiffies = 0;
+	u32 period = TCR_GET_FSL_WP(vcpu->arch.tcr);
+
+	mask = 1ULL << (63 - period);
+	tb = get_tb();
+	if (tb & mask)
+		nr_jiffies += mask;
+
+	nr_jiffies += mask - (tb & (mask - 1));
+
+	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
+		nr_jiffies++;
+
+	return min_t(unsigned long long, nr_jiffies, MAX_TIMEOUT);
+}
+
+static void arm_next_watchdog(struct kvm_vcpu *vcpu)
+{
+	unsigned long nr_jiffies;
+
+	nr_jiffies = watchdog_next_timeout(vcpu);
+	if (nr_jiffies < MAX_TIMEOUT)
+		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
+	else
+		del_timer(&vcpu->arch.wdt_timer);
+}
+
+void kvmppc_watchdog_func(unsigned long data)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+	u32 tsr, new_tsr;
+	int final;
+
+	do {
+		new_tsr = tsr = vcpu->arch.tsr;
+		final = 0;
+
+		/* Time out event */
+		if (tsr & TSR_ENW) {
+			if (tsr & TSR_WIS) {
+				new_tsr = (tsr & ~TCR_WRC_MASK) |
+					  (vcpu->arch.tcr & TCR_WRC_MASK);
+				vcpu->arch.tcr &= ~TCR_WRC_MASK;
+				final = 1;
+			} else {
+				new_tsr = tsr | TSR_WIS;
+			}
+		} else {
+			new_tsr = tsr | TSR_ENW;
+		}
+	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
+
+	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
+		smp_wmb();
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * Avoid getting a storm of timers if the guest sets
+	 * the period very short.  We'll restart it if anything
+	 * changes.
+	 */
+	if (!final)
+		arm_next_watchdog(vcpu);
+}
 static void update_timer_ints(struct kvm_vcpu *vcpu)
 {
 	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
 		kvmppc_core_queue_dec(vcpu);
 	else
 		kvmppc_core_dequeue_dec(vcpu);
+
+	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
+		kvmppc_core_queue_watchdog(vcpu);
+	else
+		kvmppc_core_dequeue_watchdog(vcpu);
 }
 
 static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
@@ -516,6 +613,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
+	if (vcpu->arch.tsr & TCR_WRC_MASK) {
+		kvm_run->exit_reason = KVM_EXIT_WDT;
+		ret = 0;
+		goto out;
+	}
+
 	kvm_guest_enter();
 
 #ifdef CONFIG_PPC_FPU
@@ -977,6 +1080,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
 		}
 	}
+	if (vcpu->arch.tsr & TCR_WRC_MASK) {
+		run->exit_reason = KVM_EXIT_WDT;
+		r = RESUME_HOST | (r & RESUME_FLAG_NV);
+	}
 
 	return r;
 }
@@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
 	}
 
 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
+		u32 old_tsr = vcpu->arch.tsr;
+
 		vcpu->arch.tsr = sregs->u.e.tsr;
+
+		if ((old_tsr ^ vcpu->arch.tsr) &
+		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
+			arm_next_watchdog(vcpu);
+
 		update_timer_ints(vcpu);
 	}
 
@@ -1267,6 +1381,8 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
 void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
 {
 	vcpu->arch.tcr = new_tcr;
+	if (vcpu->arch.watchdog_enable)
+		arm_next_watchdog(vcpu);
 	update_timer_ints(vcpu);
 }
 
@@ -1281,6 +1397,15 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 {
 	clear_bits(tsr_bits, &vcpu->arch.tsr);
+
+	/*
+	 * We may have stopped the watchdog due to
+	 * being stuck on final expiration.
+	 */
+	if (vcpu->arch.watchdog_enable && (tsr_bits & (TSR_ENW |
+			TSR_WIS | TCR_WRC_MASK)))
+		arm_next_watchdog(vcpu);
+
 	update_timer_ints(vcpu);
 }
 
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 12834bb..cc94d42 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -145,7 +145,11 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		kvmppc_clr_tsr_bits(vcpu, spr_val);
 		break;
 	case SPRN_TCR:
-		kvmppc_set_tcr(vcpu, spr_val);
+		/* WRC can only be programmed when WRC=0 */
+		if (TCR_WRC_MASK & vcpu->arch.tcr)
+			spr_val &= ~TCR_WRC_MASK;
+		kvmppc_set_tcr(vcpu,
+		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
 		break;
 
 	case SPRN_DECAR:
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 87f4dc8..35a1ff3 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -38,9 +38,15 @@
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return !(v->arch.shared->msr & MSR_WE) ||
-	       !!(v->arch.pending_exceptions) ||
-	       v->requests;
+	bool ret = !(v->arch.shared->msr & MSR_WE) ||
+		   !!(v->arch.pending_exceptions) ||
+		   v->requests;
+
+#ifdef CONFIG_BOOKE
+	ret = ret || (v->arch.tsr & TCR_WRC_MASK);
+#endif
+
+	return ret;
 }
 
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
@@ -237,6 +243,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_GET_PVINFO:
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB:
+	case KVM_CAP_PPC_WDT:
 #endif
 		r = 1;
 		break;
@@ -393,6 +400,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	kvmppc_mmu_destroy(vcpu);
+#ifdef CONFIG_BOOKE
+	if (vcpu->arch.watchdog_enable)
+		del_timer(&vcpu->arch.wdt_timer);
+#endif
 }
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -637,6 +648,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		vcpu->arch.papr_enabled = true;
 		break;
+#ifdef CONFIG_BOOKE
+	case KVM_CAP_PPC_WDT:
+		r = 0;
+		vcpu->arch.watchdog_enable = true;
+		setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
+		            (unsigned long)vcpu);
+		break;
+#endif
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB: {
 		struct kvm_config_tlb cfg;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2ce09aa..b9fdb52 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -163,6 +163,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_OSI              18
 #define KVM_EXIT_PAPR_HCALL	  19
 #define KVM_EXIT_S390_UCONTROL	  20
+#define KVM_EXIT_WDT              21
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -618,6 +619,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
 #define KVM_CAP_PPC_ALLOC_HTAB 80
+#define KVM_CAP_PPC_WDT	81
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
1.7.0.4



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-06-28  6:29 ` Bharat Bhushan
@ 2012-07-06 13:17   ` Alexander Graf
  -1 siblings, 0 replies; 29+ messages in thread
From: Alexander Graf @ 2012-07-06 13:17 UTC (permalink / raw)
  To: Bharat Bhushan; +Cc: kvm-ppc, kvm, Bharat Bhushan


On 28.06.2012, at 08:17, Bharat Bhushan wrote:

> This patch adds the watchdog emulation in KVM. The watchdog
> emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
> The kernel timer are used for watchdog emulation and emulates
> h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU
> if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how
> it is configured.
> 
> Signed-off-by: Liu Yu <yu.liu@freescale.com>
> Signed-off-by: Scott Wood <scottwood@freescale.com>
> Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
> ---
> arch/powerpc/include/asm/kvm_host.h  |    2 +
> arch/powerpc/include/asm/kvm_ppc.h   |    3 +
> arch/powerpc/include/asm/reg_booke.h |    2 +
> arch/powerpc/kvm/44x.c               |    1 +
> arch/powerpc/kvm/booke.c             |  125 ++++++++++++++++++++++++++++++++++
> arch/powerpc/kvm/booke_emulate.c     |    6 ++-
> arch/powerpc/kvm/powerpc.c           |   25 ++++++-
> include/linux/kvm.h                  |    2 +
> 8 files changed, 162 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 50ea12f..a9e5aed 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -467,6 +467,7 @@ struct kvm_vcpu_arch {
> 	ulong fault_esr;
> 	ulong queued_dear;
> 	ulong queued_esr;
> +	struct timer_list wdt_timer;
> 	u32 tlbcfg[4];
> 	u32 mmucfg;
> 	u32 epr;
> @@ -482,6 +483,7 @@ struct kvm_vcpu_arch {
> 	u8 osi_needed;
> 	u8 osi_enabled;
> 	u8 papr_enabled;
> +	u8 watchdog_enable;
> 	u8 sane;
> 	u8 cpu_type;
> 	u8 hcall_needed;
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index 0124937..e5cf4b9 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
> extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
> extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
> extern void kvmppc_decrementer_func(unsigned long data);
> +extern void kvmppc_watchdog_func(unsigned long data);
> extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
> 
> /* Core-specific hooks */
> @@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
>                                        struct kvm_interrupt *irq);
> extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
>                                          struct kvm_interrupt *irq);
> +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
> +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
> 
> extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
>                                   unsigned int op, int *advance);
> diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
> index 2d916c4..7efb3f5 100644
> --- a/arch/powerpc/include/asm/reg_booke.h
> +++ b/arch/powerpc/include/asm/reg_booke.h
> @@ -538,6 +538,8 @@
> #define FP_2_21		3		/* 2^21 clocks */
> #define TCR_FIE		0x00800000	/* FIT Interrupt Enable */
> #define TCR_ARE		0x00400000	/* Auto Reload Enable */
> +#define TCR_GET_FSL_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
> +			      (((tcr) & 0x1E0000) >> 15))
> 
> /* Bit definitions for the TSR. */
> #define TSR_ENW		0x80000000	/* Enable Next Watchdog */
> diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
> index 50e7dbc..a213aba 100644
> --- a/arch/powerpc/kvm/44x.c
> +++ b/arch/powerpc/kvm/44x.c
> @@ -28,6 +28,7 @@
> #include <asm/kvm_44x.h>
> #include <asm/kvm_ppc.h>
> 
> +#include "booke.h"
> #include "44x_tlb.h"
> #include "booke.h"
> 
> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
> index d25a097..ad30f75 100644
> --- a/arch/powerpc/kvm/booke.c
> +++ b/arch/powerpc/kvm/booke.c
> @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
> 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
> }
> 
> +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
> +{
> +	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
> +}
> +
> +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
> +{
> +	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
> +}
> +
> static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
> {
> #ifdef CONFIG_KVM_BOOKE_HV
> @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
> 		msr_mask = MSR_CE | MSR_ME | MSR_DE;
> 		int_class = INT_CLASS_NONCRIT;
> 		break;
> +	case BOOKE_IRQPRIO_WATCHDOG:
> 	case BOOKE_IRQPRIO_CRITICAL:
> 	case BOOKE_IRQPRIO_DBELL_CRIT:
> 		allowed = vcpu->arch.shared->msr & MSR_CE;
> @@ -404,12 +415,98 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
> 	return allowed;
> }
> 
> +/*
> + * The timer system can almost deal with LONG_MAX timeouts, except that
> + * when you get very close to LONG_MAX, the slack added can cause overflow.
> + *
> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
> + * any realistic use.
> + */
> +#define MAX_TIMEOUT (LONG_MAX/2)

Should this really be in kvm code?

> +
> +/*
> + * Return the number of jiffies until the next timeout.  If the timeout is
> + * longer than the MAX_TIMEOUT, that we return MAX_TIMEOUT instead.
> + */
> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long long tb, mask, nr_jiffies = 0;

u64?

> +	u32 period = TCR_GET_FSL_WP(vcpu->arch.tcr);

Doesn't sound like something booke generic to me.

> +
> +	mask = 1ULL << (63 - period);
> +	tb = get_tb();
> +	if (tb & mask)
> +		nr_jiffies += mask;

To be honest, you lost me here. nr_jiffies is jiffies, right? While mask is basically in timebase granularity. I suppose you're just reusing the variable here for no good reason - the compiler will gladly optimize things for you if you write things a bit more verbose :).

Please make this function more readable. With comments if needed.

> +
> +	nr_jiffies += mask - (tb & (mask - 1));
> +
> +	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
> +		nr_jiffies++;
> +
> +	return min_t(unsigned long long, nr_jiffies, MAX_TIMEOUT);
> +}
> +
> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long nr_jiffies;
> +
> +	nr_jiffies = watchdog_next_timeout(vcpu);
> +	if (nr_jiffies < MAX_TIMEOUT)
> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
> +	else
> +		del_timer(&vcpu->arch.wdt_timer);

Can you del a timer that's not armed? Could that ever happen in this case?

Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?

> +}
> +
> +void kvmppc_watchdog_func(unsigned long data)
> +{
> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> +	u32 tsr, new_tsr;
> +	int final;
> +
> +	do {
> +		new_tsr = tsr = vcpu->arch.tsr;
> +		final = 0;
> +
> +		/* Time out event */
> +		if (tsr & TSR_ENW) {
> +			if (tsr & TSR_WIS) {
> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
> +				final = 1;
> +			} else {
> +				new_tsr = tsr | TSR_WIS;
> +			}
> +		} else {
> +			new_tsr = tsr | TSR_ENW;
> +		}
> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
> +
> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
> +		smp_wmb();
> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
> +		kvm_vcpu_kick(vcpu);
> +	}
> +
> +	/*
> +	 * Avoid getting a storm of timers if the guest sets
> +	 * the period very short.  We'll restart it if anything
> +	 * changes.
> +	 */
> +	if (!final)
> +		arm_next_watchdog(vcpu);

Mind to explain this part a bit further?

> +}
> static void update_timer_ints(struct kvm_vcpu *vcpu)
> {
> 	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
> 		kvmppc_core_queue_dec(vcpu);
> 	else
> 		kvmppc_core_dequeue_dec(vcpu);
> +
> +	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
> +		kvmppc_core_queue_watchdog(vcpu);
> +	else
> +		kvmppc_core_dequeue_watchdog(vcpu);
> }
> 
> static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
> @@ -516,6 +613,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
> 		goto out;
> 	}
> 
> +	if (vcpu->arch.tsr & TCR_WRC_MASK) {
> +		kvm_run->exit_reason = KVM_EXIT_WDT;
> +		ret = 0;
> +		goto out;
> +	}
> +
> 	kvm_guest_enter();
> 
> #ifdef CONFIG_PPC_FPU
> @@ -977,6 +1080,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
> 			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
> 		}
> 	}
> +	if (vcpu->arch.tsr & TCR_WRC_MASK) {
> +		run->exit_reason = KVM_EXIT_WDT;
> +		r = RESUME_HOST | (r & RESUME_FLAG_NV);
> +	}
> 
> 	return r;
> }
> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
> 	}
> 
> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
> +		u32 old_tsr = vcpu->arch.tsr;
> +
> 		vcpu->arch.tsr = sregs->u.e.tsr;
> +
> +		if ((old_tsr ^ vcpu->arch.tsr) &
> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
> +			arm_next_watchdog(vcpu);

Why isn't this one guarded by vcpu->arch.watchdog_enable?

> +
> 		update_timer_ints(vcpu);
> 	}
> 
> @@ -1267,6 +1381,8 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
> void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
> {
> 	vcpu->arch.tcr = new_tcr;
> +	if (vcpu->arch.watchdog_enable)
> +		arm_next_watchdog(vcpu);
> 	update_timer_ints(vcpu);
> }
> 
> @@ -1281,6 +1397,15 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
> void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
> {
> 	clear_bits(tsr_bits, &vcpu->arch.tsr);
> +
> +	/*
> +	 * We may have stopped the watchdog due to
> +	 * being stuck on final expiration.
> +	 */
> +	if (vcpu->arch.watchdog_enable && (tsr_bits & (TSR_ENW |
> +			TSR_WIS | TCR_WRC_MASK)))
> +		arm_next_watchdog(vcpu);
> +
> 	update_timer_ints(vcpu);
> }
> 
> diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
> index 12834bb..cc94d42 100644
> --- a/arch/powerpc/kvm/booke_emulate.c
> +++ b/arch/powerpc/kvm/booke_emulate.c
> @@ -145,7 +145,11 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
> 		kvmppc_clr_tsr_bits(vcpu, spr_val);
> 		break;
> 	case SPRN_TCR:
> -		kvmppc_set_tcr(vcpu, spr_val);
> +		/* WRC can only be programmed when WRC=0 */
> +		if (TCR_WRC_MASK & vcpu->arch.tcr)

Please reverse the order.

> +			spr_val &= ~TCR_WRC_MASK;
> +		kvmppc_set_tcr(vcpu,
> +		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));

In fact, what you're trying to do here is keep TCR_WRC always on when it was enabled once. So all you need is the OR here. No need for the mask above.

> 		break;
> 
> 	case SPRN_DECAR:
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 87f4dc8..35a1ff3 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -38,9 +38,15 @@
> 
> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
> {
> -	return !(v->arch.shared->msr & MSR_WE) ||
> -	       !!(v->arch.pending_exceptions) ||
> -	       v->requests;
> +	bool ret = !(v->arch.shared->msr & MSR_WE) ||
> +		   !!(v->arch.pending_exceptions) ||
> +		   v->requests;
> +
> +#ifdef CONFIG_BOOKE
> +	ret = ret || (v->arch.tsr & TCR_WRC_MASK);

Please make this a callback. In a header if you think it's performance critical, but I don't want to see #ifdef CONFIG_BOOKE too often in powerpc.c.

> +#endif
> +
> +	return ret;
> }
> 
> int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
> @@ -237,6 +243,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> 	case KVM_CAP_PPC_GET_PVINFO:
> #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
> 	case KVM_CAP_SW_TLB:
> +	case KVM_CAP_PPC_WDT:
> #endif
> 		r = 1;
> 		break;
> @@ -393,6 +400,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
> {
> 	kvmppc_mmu_destroy(vcpu);
> +#ifdef CONFIG_BOOKE
> +	if (vcpu->arch.watchdog_enable)
> +		del_timer(&vcpu->arch.wdt_timer);
> +#endif
> }
> 
> void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> @@ -637,6 +648,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
> 		r = 0;
> 		vcpu->arch.papr_enabled = true;
> 		break;
> +#ifdef CONFIG_BOOKE
> +	case KVM_CAP_PPC_WDT:
> +		r = 0;
> +		vcpu->arch.watchdog_enable = true;
> +		setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
> +		            (unsigned long)vcpu);

Shouldn't we guard against user space calling this twice?


Alex

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-06 13:17   ` Alexander Graf
  0 siblings, 0 replies; 29+ messages in thread
From: Alexander Graf @ 2012-07-06 13:17 UTC (permalink / raw)
  To: Bharat Bhushan; +Cc: kvm-ppc, kvm, Bharat Bhushan


On 28.06.2012, at 08:17, Bharat Bhushan wrote:

> This patch adds the watchdog emulation in KVM. The watchdog
> emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
> The kernel timer are used for watchdog emulation and emulates
> h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU
> if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how
> it is configured.
> 
> Signed-off-by: Liu Yu <yu.liu@freescale.com>
> Signed-off-by: Scott Wood <scottwood@freescale.com>
> Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
> ---
> arch/powerpc/include/asm/kvm_host.h  |    2 +
> arch/powerpc/include/asm/kvm_ppc.h   |    3 +
> arch/powerpc/include/asm/reg_booke.h |    2 +
> arch/powerpc/kvm/44x.c               |    1 +
> arch/powerpc/kvm/booke.c             |  125 ++++++++++++++++++++++++++++++++++
> arch/powerpc/kvm/booke_emulate.c     |    6 ++-
> arch/powerpc/kvm/powerpc.c           |   25 ++++++-
> include/linux/kvm.h                  |    2 +
> 8 files changed, 162 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 50ea12f..a9e5aed 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -467,6 +467,7 @@ struct kvm_vcpu_arch {
> 	ulong fault_esr;
> 	ulong queued_dear;
> 	ulong queued_esr;
> +	struct timer_list wdt_timer;
> 	u32 tlbcfg[4];
> 	u32 mmucfg;
> 	u32 epr;
> @@ -482,6 +483,7 @@ struct kvm_vcpu_arch {
> 	u8 osi_needed;
> 	u8 osi_enabled;
> 	u8 papr_enabled;
> +	u8 watchdog_enable;
> 	u8 sane;
> 	u8 cpu_type;
> 	u8 hcall_needed;
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index 0124937..e5cf4b9 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
> extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
> extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
> extern void kvmppc_decrementer_func(unsigned long data);
> +extern void kvmppc_watchdog_func(unsigned long data);
> extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
> 
> /* Core-specific hooks */
> @@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
>                                        struct kvm_interrupt *irq);
> extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
>                                          struct kvm_interrupt *irq);
> +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
> +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
> 
> extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
>                                   unsigned int op, int *advance);
> diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
> index 2d916c4..7efb3f5 100644
> --- a/arch/powerpc/include/asm/reg_booke.h
> +++ b/arch/powerpc/include/asm/reg_booke.h
> @@ -538,6 +538,8 @@
> #define FP_2_21		3		/* 2^21 clocks */
> #define TCR_FIE		0x00800000	/* FIT Interrupt Enable */
> #define TCR_ARE		0x00400000	/* Auto Reload Enable */
> +#define TCR_GET_FSL_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
> +			      (((tcr) & 0x1E0000) >> 15))
> 
> /* Bit definitions for the TSR. */
> #define TSR_ENW		0x80000000	/* Enable Next Watchdog */
> diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
> index 50e7dbc..a213aba 100644
> --- a/arch/powerpc/kvm/44x.c
> +++ b/arch/powerpc/kvm/44x.c
> @@ -28,6 +28,7 @@
> #include <asm/kvm_44x.h>
> #include <asm/kvm_ppc.h>
> 
> +#include "booke.h"
> #include "44x_tlb.h"
> #include "booke.h"
> 
> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
> index d25a097..ad30f75 100644
> --- a/arch/powerpc/kvm/booke.c
> +++ b/arch/powerpc/kvm/booke.c
> @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
> 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
> }
> 
> +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
> +{
> +	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
> +}
> +
> +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
> +{
> +	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
> +}
> +
> static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
> {
> #ifdef CONFIG_KVM_BOOKE_HV
> @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
> 		msr_mask = MSR_CE | MSR_ME | MSR_DE;
> 		int_class = INT_CLASS_NONCRIT;
> 		break;
> +	case BOOKE_IRQPRIO_WATCHDOG:
> 	case BOOKE_IRQPRIO_CRITICAL:
> 	case BOOKE_IRQPRIO_DBELL_CRIT:
> 		allowed = vcpu->arch.shared->msr & MSR_CE;
> @@ -404,12 +415,98 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
> 	return allowed;
> }
> 
> +/*
> + * The timer system can almost deal with LONG_MAX timeouts, except that
> + * when you get very close to LONG_MAX, the slack added can cause overflow.
> + *
> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
> + * any realistic use.
> + */
> +#define MAX_TIMEOUT (LONG_MAX/2)

Should this really be in kvm code?

> +
> +/*
> + * Return the number of jiffies until the next timeout.  If the timeout is
> + * longer than the MAX_TIMEOUT, that we return MAX_TIMEOUT instead.
> + */
> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long long tb, mask, nr_jiffies = 0;

u64?

> +	u32 period = TCR_GET_FSL_WP(vcpu->arch.tcr);

Doesn't sound like something booke generic to me.

> +
> +	mask = 1ULL << (63 - period);
> +	tb = get_tb();
> +	if (tb & mask)
> +		nr_jiffies += mask;

To be honest, you lost me here. nr_jiffies is jiffies, right? While mask is basically in timebase granularity. I suppose you're just reusing the variable here for no good reason - the compiler will gladly optimize things for you if you write things a bit more verbose :).

Please make this function more readable. With comments if needed.

> +
> +	nr_jiffies += mask - (tb & (mask - 1));
> +
> +	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
> +		nr_jiffies++;
> +
> +	return min_t(unsigned long long, nr_jiffies, MAX_TIMEOUT);
> +}
> +
> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long nr_jiffies;
> +
> +	nr_jiffies = watchdog_next_timeout(vcpu);
> +	if (nr_jiffies < MAX_TIMEOUT)
> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
> +	else
> +		del_timer(&vcpu->arch.wdt_timer);

Can you del a timer that's not armed? Could that ever happen in this case?

Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?

> +}
> +
> +void kvmppc_watchdog_func(unsigned long data)
> +{
> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> +	u32 tsr, new_tsr;
> +	int final;
> +
> +	do {
> +		new_tsr = tsr = vcpu->arch.tsr;
> +		final = 0;
> +
> +		/* Time out event */
> +		if (tsr & TSR_ENW) {
> +			if (tsr & TSR_WIS) {
> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
> +				final = 1;
> +			} else {
> +				new_tsr = tsr | TSR_WIS;
> +			}
> +		} else {
> +			new_tsr = tsr | TSR_ENW;
> +		}
> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
> +
> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
> +		smp_wmb();
> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
> +		kvm_vcpu_kick(vcpu);
> +	}
> +
> +	/*
> +	 * Avoid getting a storm of timers if the guest sets
> +	 * the period very short.  We'll restart it if anything
> +	 * changes.
> +	 */
> +	if (!final)
> +		arm_next_watchdog(vcpu);

Mind to explain this part a bit further?

> +}
> static void update_timer_ints(struct kvm_vcpu *vcpu)
> {
> 	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
> 		kvmppc_core_queue_dec(vcpu);
> 	else
> 		kvmppc_core_dequeue_dec(vcpu);
> +
> +	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
> +		kvmppc_core_queue_watchdog(vcpu);
> +	else
> +		kvmppc_core_dequeue_watchdog(vcpu);
> }
> 
> static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
> @@ -516,6 +613,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
> 		goto out;
> 	}
> 
> +	if (vcpu->arch.tsr & TCR_WRC_MASK) {
> +		kvm_run->exit_reason = KVM_EXIT_WDT;
> +		ret = 0;
> +		goto out;
> +	}
> +
> 	kvm_guest_enter();
> 
> #ifdef CONFIG_PPC_FPU
> @@ -977,6 +1080,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
> 			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
> 		}
> 	}
> +	if (vcpu->arch.tsr & TCR_WRC_MASK) {
> +		run->exit_reason = KVM_EXIT_WDT;
> +		r = RESUME_HOST | (r & RESUME_FLAG_NV);
> +	}
> 
> 	return r;
> }
> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
> 	}
> 
> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
> +		u32 old_tsr = vcpu->arch.tsr;
> +
> 		vcpu->arch.tsr = sregs->u.e.tsr;
> +
> +		if ((old_tsr ^ vcpu->arch.tsr) &
> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
> +			arm_next_watchdog(vcpu);

Why isn't this one guarded by vcpu->arch.watchdog_enable?

> +
> 		update_timer_ints(vcpu);
> 	}
> 
> @@ -1267,6 +1381,8 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
> void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
> {
> 	vcpu->arch.tcr = new_tcr;
> +	if (vcpu->arch.watchdog_enable)
> +		arm_next_watchdog(vcpu);
> 	update_timer_ints(vcpu);
> }
> 
> @@ -1281,6 +1397,15 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
> void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
> {
> 	clear_bits(tsr_bits, &vcpu->arch.tsr);
> +
> +	/*
> +	 * We may have stopped the watchdog due to
> +	 * being stuck on final expiration.
> +	 */
> +	if (vcpu->arch.watchdog_enable && (tsr_bits & (TSR_ENW |
> +			TSR_WIS | TCR_WRC_MASK)))
> +		arm_next_watchdog(vcpu);
> +
> 	update_timer_ints(vcpu);
> }
> 
> diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
> index 12834bb..cc94d42 100644
> --- a/arch/powerpc/kvm/booke_emulate.c
> +++ b/arch/powerpc/kvm/booke_emulate.c
> @@ -145,7 +145,11 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
> 		kvmppc_clr_tsr_bits(vcpu, spr_val);
> 		break;
> 	case SPRN_TCR:
> -		kvmppc_set_tcr(vcpu, spr_val);
> +		/* WRC can only be programmed when WRC=0 */
> +		if (TCR_WRC_MASK & vcpu->arch.tcr)

Please reverse the order.

> +			spr_val &= ~TCR_WRC_MASK;
> +		kvmppc_set_tcr(vcpu,
> +		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));

In fact, what you're trying to do here is keep TCR_WRC always on when it was enabled once. So all you need is the OR here. No need for the mask above.

> 		break;
> 
> 	case SPRN_DECAR:
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 87f4dc8..35a1ff3 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -38,9 +38,15 @@
> 
> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
> {
> -	return !(v->arch.shared->msr & MSR_WE) ||
> -	       !!(v->arch.pending_exceptions) ||
> -	       v->requests;
> +	bool ret = !(v->arch.shared->msr & MSR_WE) ||
> +		   !!(v->arch.pending_exceptions) ||
> +		   v->requests;
> +
> +#ifdef CONFIG_BOOKE
> +	ret = ret || (v->arch.tsr & TCR_WRC_MASK);

Please make this a callback. In a header if you think it's performance critical, but I don't want to see #ifdef CONFIG_BOOKE too often in powerpc.c.

> +#endif
> +
> +	return ret;
> }
> 
> int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
> @@ -237,6 +243,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> 	case KVM_CAP_PPC_GET_PVINFO:
> #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
> 	case KVM_CAP_SW_TLB:
> +	case KVM_CAP_PPC_WDT:
> #endif
> 		r = 1;
> 		break;
> @@ -393,6 +400,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
> {
> 	kvmppc_mmu_destroy(vcpu);
> +#ifdef CONFIG_BOOKE
> +	if (vcpu->arch.watchdog_enable)
> +		del_timer(&vcpu->arch.wdt_timer);
> +#endif
> }
> 
> void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> @@ -637,6 +648,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
> 		r = 0;
> 		vcpu->arch.papr_enabled = true;
> 		break;
> +#ifdef CONFIG_BOOKE
> +	case KVM_CAP_PPC_WDT:
> +		r = 0;
> +		vcpu->arch.watchdog_enable = true;
> +		setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
> +		            (unsigned long)vcpu);

Shouldn't we guard against user space calling this twice?


Alex


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-07-06 13:17   ` Alexander Graf
@ 2012-07-06 23:37     ` Scott Wood
  -1 siblings, 0 replies; 29+ messages in thread
From: Scott Wood @ 2012-07-06 23:37 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Bharat Bhushan, kvm-ppc, kvm, Bharat Bhushan

On 07/06/2012 08:17 AM, Alexander Graf wrote:
> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>> +/*
>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>> + *
>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>> + * any realistic use.
>> + */
>> +#define MAX_TIMEOUT (LONG_MAX/2)
> 
> Should this really be in kvm code?

It looks like we can use NEXT_TIMER_MAX_DELTA for this.

>> +	mask = 1ULL << (63 - period);
>> +	tb = get_tb();
>> +	if (tb & mask)
>> +		nr_jiffies += mask;
> 
> To be honest, you lost me here. nr_jiffies is jiffies, right? While
> mask is basically in timebase granularity. I suppose you're just
> reusing the variable here for no good reason - the compiler will
> gladly optimize things for you if you write things a bit more verbose
> :).

Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
something generic like "ticks", "interval", "remaining", etc. would be
better, with a comment on the do_div saying it's converting timebase
ticks into jiffies.

>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>> +{
>> +	unsigned long nr_jiffies;
>> +
>> +	nr_jiffies = watchdog_next_timeout(vcpu);
>> +	if (nr_jiffies < MAX_TIMEOUT)
>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>> +	else
>> +		del_timer(&vcpu->arch.wdt_timer);
> 
> Can you del a timer that's not armed? Could that ever happen in this case?

"del_timer() deactivates a timer - this works on both active and
inactive timers."

> Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?

This can be called in the context of the timer, so del_timer_sync()
would hang.

As for what would happen if a caller from a different context were to
race with a timer, I think you could end up with the timer armed based
on an old TCR.  del_timer_sync() won't help though, unless you replace
mod_timer() with del_timer_sync() plus add_timer() (with a check to see
whether it's running in timer context).  A better solution is probably
to use a spinlock in arm_next_watchdog().

>> +void kvmppc_watchdog_func(unsigned long data)
>> +{
>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>> +	u32 tsr, new_tsr;
>> +	int final;
>> +
>> +	do {
>> +		new_tsr = tsr = vcpu->arch.tsr;
>> +		final = 0;
>> +
>> +		/* Time out event */
>> +		if (tsr & TSR_ENW) {
>> +			if (tsr & TSR_WIS) {
>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
>> +				final = 1;
>> +			} else {
>> +				new_tsr = tsr | TSR_WIS;
>> +			}
>> +		} else {
>> +			new_tsr = tsr | TSR_ENW;
>> +		}
>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>> +
>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>> +		smp_wmb();
>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>> +		kvm_vcpu_kick(vcpu);
>> +	}
>> +
>> +	/*
>> +	 * Avoid getting a storm of timers if the guest sets
>> +	 * the period very short.  We'll restart it if anything
>> +	 * changes.
>> +	 */
>> +	if (!final)
>> +		arm_next_watchdog(vcpu);
> 
> Mind to explain this part a bit further?

The whole function, or some subset near the end?

The "if (!final)" check means that we stop running the timer after final
expiration, to prevent the host from being flooded with timers if the
guest sets a short period but does not have TCR set to exit to QEMU.
Timers will resume the next time TSR/TCR is updated.

>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>> 	}
>>
>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>> +		u32 old_tsr = vcpu->arch.tsr;
>> +
>> 		vcpu->arch.tsr = sregs->u.e.tsr;
>> +
>> +		if ((old_tsr ^ vcpu->arch.tsr) &
>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>> +			arm_next_watchdog(vcpu);
> 
> Why isn't this one guarded by vcpu->arch.watchdog_enable?

I'm not sure that any of them should be -- there's no reason for the
watchdog interrupt mechanism to be dependent on QEMU, only the
heavyweight exit on final expiration.

>> +			spr_val &= ~TCR_WRC_MASK;
>> +		kvmppc_set_tcr(vcpu,
>> +		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
> 
> In fact, what you're trying to do here is keep TCR_WRC always on when it was enabled once. So all you need is the OR here. No need for the mask above.

WRC is a 2-bit field that is supposed to preserve its value once written
to be non-zero.  Not that we actually do anything different based on the
specific non-zero value, but still we should implement the architected
semantics.

-Scott

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-06 23:37     ` Scott Wood
  0 siblings, 0 replies; 29+ messages in thread
From: Scott Wood @ 2012-07-06 23:37 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Bharat Bhushan, kvm-ppc, kvm, Bharat Bhushan

On 07/06/2012 08:17 AM, Alexander Graf wrote:
> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>> +/*
>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>> + *
>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>> + * any realistic use.
>> + */
>> +#define MAX_TIMEOUT (LONG_MAX/2)
> 
> Should this really be in kvm code?

It looks like we can use NEXT_TIMER_MAX_DELTA for this.

>> +	mask = 1ULL << (63 - period);
>> +	tb = get_tb();
>> +	if (tb & mask)
>> +		nr_jiffies += mask;
> 
> To be honest, you lost me here. nr_jiffies is jiffies, right? While
> mask is basically in timebase granularity. I suppose you're just
> reusing the variable here for no good reason - the compiler will
> gladly optimize things for you if you write things a bit more verbose
> :).

Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
something generic like "ticks", "interval", "remaining", etc. would be
better, with a comment on the do_div saying it's converting timebase
ticks into jiffies.

>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>> +{
>> +	unsigned long nr_jiffies;
>> +
>> +	nr_jiffies = watchdog_next_timeout(vcpu);
>> +	if (nr_jiffies < MAX_TIMEOUT)
>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>> +	else
>> +		del_timer(&vcpu->arch.wdt_timer);
> 
> Can you del a timer that's not armed? Could that ever happen in this case?

"del_timer() deactivates a timer - this works on both active and
inactive timers."

> Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?

This can be called in the context of the timer, so del_timer_sync()
would hang.

As for what would happen if a caller from a different context were to
race with a timer, I think you could end up with the timer armed based
on an old TCR.  del_timer_sync() won't help though, unless you replace
mod_timer() with del_timer_sync() plus add_timer() (with a check to see
whether it's running in timer context).  A better solution is probably
to use a spinlock in arm_next_watchdog().

>> +void kvmppc_watchdog_func(unsigned long data)
>> +{
>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>> +	u32 tsr, new_tsr;
>> +	int final;
>> +
>> +	do {
>> +		new_tsr = tsr = vcpu->arch.tsr;
>> +		final = 0;
>> +
>> +		/* Time out event */
>> +		if (tsr & TSR_ENW) {
>> +			if (tsr & TSR_WIS) {
>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
>> +				final = 1;
>> +			} else {
>> +				new_tsr = tsr | TSR_WIS;
>> +			}
>> +		} else {
>> +			new_tsr = tsr | TSR_ENW;
>> +		}
>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>> +
>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>> +		smp_wmb();
>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>> +		kvm_vcpu_kick(vcpu);
>> +	}
>> +
>> +	/*
>> +	 * Avoid getting a storm of timers if the guest sets
>> +	 * the period very short.  We'll restart it if anything
>> +	 * changes.
>> +	 */
>> +	if (!final)
>> +		arm_next_watchdog(vcpu);
> 
> Mind to explain this part a bit further?

The whole function, or some subset near the end?

The "if (!final)" check means that we stop running the timer after final
expiration, to prevent the host from being flooded with timers if the
guest sets a short period but does not have TCR set to exit to QEMU.
Timers will resume the next time TSR/TCR is updated.

>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>> 	}
>>
>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>> +		u32 old_tsr = vcpu->arch.tsr;
>> +
>> 		vcpu->arch.tsr = sregs->u.e.tsr;
>> +
>> +		if ((old_tsr ^ vcpu->arch.tsr) &
>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>> +			arm_next_watchdog(vcpu);
> 
> Why isn't this one guarded by vcpu->arch.watchdog_enable?

I'm not sure that any of them should be -- there's no reason for the
watchdog interrupt mechanism to be dependent on QEMU, only the
heavyweight exit on final expiration.

>> +			spr_val &= ~TCR_WRC_MASK;
>> +		kvmppc_set_tcr(vcpu,
>> +		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
> 
> In fact, what you're trying to do here is keep TCR_WRC always on when it was enabled once. So all you need is the OR here. No need for the mask above.

WRC is a 2-bit field that is supposed to preserve its value once written
to be non-zero.  Not that we actually do anything different based on the
specific non-zero value, but still we should implement the architected
semantics.

-Scott


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-07-06 23:37     ` Scott Wood
@ 2012-07-07  7:50       ` Alexander Graf
  -1 siblings, 0 replies; 29+ messages in thread
From: Alexander Graf @ 2012-07-07  7:50 UTC (permalink / raw)
  To: Scott Wood; +Cc: Bharat Bhushan, kvm-ppc, kvm, Bharat Bhushan


On 07.07.2012, at 01:37, Scott Wood wrote:

> On 07/06/2012 08:17 AM, Alexander Graf wrote:
>> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>>> +/*
>>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>>> + *
>>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>>> + * any realistic use.
>>> + */
>>> +#define MAX_TIMEOUT (LONG_MAX/2)
>> 
>> Should this really be in kvm code?
> 
> It looks like we can use NEXT_TIMER_MAX_DELTA for this.
> 
>>> +	mask = 1ULL << (63 - period);
>>> +	tb = get_tb();
>>> +	if (tb & mask)
>>> +		nr_jiffies += mask;
>> 
>> To be honest, you lost me here. nr_jiffies is jiffies, right? While
>> mask is basically in timebase granularity. I suppose you're just
>> reusing the variable here for no good reason - the compiler will
>> gladly optimize things for you if you write things a bit more verbose
>> :).
> 
> Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
> something generic like "ticks", "interval", "remaining", etc. would be
> better, with a comment on the do_div saying it's converting timebase
> ticks into jiffies.

Well, you could start off with a variable "delta_tb", then do

  nr_jiffies = delta_tb;
  x = do_div(...);

and things would suddenly become readable :). Of course I don't object to comments along the code either :).

> 
>>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>>> +{
>>> +	unsigned long nr_jiffies;
>>> +
>>> +	nr_jiffies = watchdog_next_timeout(vcpu);
>>> +	if (nr_jiffies < MAX_TIMEOUT)
>>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>>> +	else
>>> +		del_timer(&vcpu->arch.wdt_timer);
>> 
>> Can you del a timer that's not armed? Could that ever happen in this case?
> 
> "del_timer() deactivates a timer - this works on both active and
> inactive timers."

Ah, good :).

> 
>> Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?
> 
> This can be called in the context of the timer, so del_timer_sync()
> would hang.
> 
> As for what would happen if a caller from a different context were to
> race with a timer, I think you could end up with the timer armed based
> on an old TCR.  del_timer_sync() won't help though, unless you replace
> mod_timer() with del_timer_sync() plus add_timer() (with a check to see
> whether it's running in timer context).  A better solution is probably
> to use a spinlock in arm_next_watchdog().

Yup. Either way, we have a race that the guest might not expect.

> 
>>> +void kvmppc_watchdog_func(unsigned long data)
>>> +{
>>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>> +	u32 tsr, new_tsr;
>>> +	int final;
>>> +
>>> +	do {
>>> +		new_tsr = tsr = vcpu->arch.tsr;
>>> +		final = 0;
>>> +
>>> +		/* Time out event */
>>> +		if (tsr & TSR_ENW) {
>>> +			if (tsr & TSR_WIS) {
>>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
>>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
>>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>> +				final = 1;
>>> +			} else {
>>> +				new_tsr = tsr | TSR_WIS;
>>> +			}
>>> +		} else {
>>> +			new_tsr = tsr | TSR_ENW;
>>> +		}
>>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>>> +
>>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>>> +		smp_wmb();
>>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>>> +		kvm_vcpu_kick(vcpu);
>>> +	}
>>> +
>>> +	/*
>>> +	 * Avoid getting a storm of timers if the guest sets
>>> +	 * the period very short.  We'll restart it if anything
>>> +	 * changes.
>>> +	 */
>>> +	if (!final)
>>> +		arm_next_watchdog(vcpu);
>> 
>> Mind to explain this part a bit further?
> 
> The whole function, or some subset near the end?
> 
> The "if (!final)" check means that we stop running the timer after final
> expiration, to prevent the host from being flooded with timers if the
> guest sets a short period but does not have TCR set to exit to QEMU.
> Timers will resume the next time TSR/TCR is updated.

Ah. The semantics make sense. The comment however is slightly too short. Please explain this in a more verbose way, so someone who didn't write the code knows what's going on :).

> 
>>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>> 	}
>>> 
>>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>>> +		u32 old_tsr = vcpu->arch.tsr;
>>> +
>>> 		vcpu->arch.tsr = sregs->u.e.tsr;
>>> +
>>> +		if ((old_tsr ^ vcpu->arch.tsr) &
>>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>>> +			arm_next_watchdog(vcpu);
>> 
>> Why isn't this one guarded by vcpu->arch.watchdog_enable?
> 
> I'm not sure that any of them should be -- there's no reason for the
> watchdog interrupt mechanism to be dependent on QEMU, only the
> heavyweight exit on final expiration.

Well - I like the concept of having new features switchable. Overlapping the "watchdog is implemented" feature with "user space wants watchdog exits" makes sense. But I definitely want to have a switch for the former, because we otherwise differ quite substantially from the emulation we had before.

> 
>>> +			spr_val &= ~TCR_WRC_MASK;
>>> +		kvmppc_set_tcr(vcpu,
>>> +		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
>> 
>> In fact, what you're trying to do here is keep TCR_WRC always on when it was enabled once. So all you need is the OR here. No need for the mask above.
> 
> WRC is a 2-bit field that is supposed to preserve its value once written
> to be non-zero.  Not that we actually do anything different based on the
> specific non-zero value, but still we should implement the architected
> semantics.

Ah, being 2 bits wide, the above code suddenly makes more sense :). How about

/* WRC is a 2-bit field that is supposed to preserve its value once written to be non-zero */
spr_val &= ~TCR_WRC_MASK;
spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
kvmppc_set_tcr(vcpu, spr_val);


Alex

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-07  7:50       ` Alexander Graf
  0 siblings, 0 replies; 29+ messages in thread
From: Alexander Graf @ 2012-07-07  7:50 UTC (permalink / raw)
  To: Scott Wood; +Cc: Bharat Bhushan, kvm-ppc, kvm, Bharat Bhushan


On 07.07.2012, at 01:37, Scott Wood wrote:

> On 07/06/2012 08:17 AM, Alexander Graf wrote:
>> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>>> +/*
>>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>>> + *
>>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>>> + * any realistic use.
>>> + */
>>> +#define MAX_TIMEOUT (LONG_MAX/2)
>> 
>> Should this really be in kvm code?
> 
> It looks like we can use NEXT_TIMER_MAX_DELTA for this.
> 
>>> +	mask = 1ULL << (63 - period);
>>> +	tb = get_tb();
>>> +	if (tb & mask)
>>> +		nr_jiffies += mask;
>> 
>> To be honest, you lost me here. nr_jiffies is jiffies, right? While
>> mask is basically in timebase granularity. I suppose you're just
>> reusing the variable here for no good reason - the compiler will
>> gladly optimize things for you if you write things a bit more verbose
>> :).
> 
> Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
> something generic like "ticks", "interval", "remaining", etc. would be
> better, with a comment on the do_div saying it's converting timebase
> ticks into jiffies.

Well, you could start off with a variable "delta_tb", then do

  nr_jiffies = delta_tb;
  x = do_div(...);

and things would suddenly become readable :). Of course I don't object to comments along the code either :).

> 
>>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>>> +{
>>> +	unsigned long nr_jiffies;
>>> +
>>> +	nr_jiffies = watchdog_next_timeout(vcpu);
>>> +	if (nr_jiffies < MAX_TIMEOUT)
>>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>>> +	else
>>> +		del_timer(&vcpu->arch.wdt_timer);
>> 
>> Can you del a timer that's not armed? Could that ever happen in this case?
> 
> "del_timer() deactivates a timer - this works on both active and
> inactive timers."

Ah, good :).

> 
>> Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?
> 
> This can be called in the context of the timer, so del_timer_sync()
> would hang.
> 
> As for what would happen if a caller from a different context were to
> race with a timer, I think you could end up with the timer armed based
> on an old TCR.  del_timer_sync() won't help though, unless you replace
> mod_timer() with del_timer_sync() plus add_timer() (with a check to see
> whether it's running in timer context).  A better solution is probably
> to use a spinlock in arm_next_watchdog().

Yup. Either way, we have a race that the guest might not expect.

> 
>>> +void kvmppc_watchdog_func(unsigned long data)
>>> +{
>>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>> +	u32 tsr, new_tsr;
>>> +	int final;
>>> +
>>> +	do {
>>> +		new_tsr = tsr = vcpu->arch.tsr;
>>> +		final = 0;
>>> +
>>> +		/* Time out event */
>>> +		if (tsr & TSR_ENW) {
>>> +			if (tsr & TSR_WIS) {
>>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
>>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
>>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>> +				final = 1;
>>> +			} else {
>>> +				new_tsr = tsr | TSR_WIS;
>>> +			}
>>> +		} else {
>>> +			new_tsr = tsr | TSR_ENW;
>>> +		}
>>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>>> +
>>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>>> +		smp_wmb();
>>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>>> +		kvm_vcpu_kick(vcpu);
>>> +	}
>>> +
>>> +	/*
>>> +	 * Avoid getting a storm of timers if the guest sets
>>> +	 * the period very short.  We'll restart it if anything
>>> +	 * changes.
>>> +	 */
>>> +	if (!final)
>>> +		arm_next_watchdog(vcpu);
>> 
>> Mind to explain this part a bit further?
> 
> The whole function, or some subset near the end?
> 
> The "if (!final)" check means that we stop running the timer after final
> expiration, to prevent the host from being flooded with timers if the
> guest sets a short period but does not have TCR set to exit to QEMU.
> Timers will resume the next time TSR/TCR is updated.

Ah. The semantics make sense. The comment however is slightly too short. Please explain this in a more verbose way, so someone who didn't write the code knows what's going on :).

> 
>>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>> 	}
>>> 
>>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>>> +		u32 old_tsr = vcpu->arch.tsr;
>>> +
>>> 		vcpu->arch.tsr = sregs->u.e.tsr;
>>> +
>>> +		if ((old_tsr ^ vcpu->arch.tsr) &
>>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>>> +			arm_next_watchdog(vcpu);
>> 
>> Why isn't this one guarded by vcpu->arch.watchdog_enable?
> 
> I'm not sure that any of them should be -- there's no reason for the
> watchdog interrupt mechanism to be dependent on QEMU, only the
> heavyweight exit on final expiration.

Well - I like the concept of having new features switchable. Overlapping the "watchdog is implemented" feature with "user space wants watchdog exits" makes sense. But I definitely want to have a switch for the former, because we otherwise differ quite substantially from the emulation we had before.

> 
>>> +			spr_val &= ~TCR_WRC_MASK;
>>> +		kvmppc_set_tcr(vcpu,
>>> +		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
>> 
>> In fact, what you're trying to do here is keep TCR_WRC always on when it was enabled once. So all you need is the OR here. No need for the mask above.
> 
> WRC is a 2-bit field that is supposed to preserve its value once written
> to be non-zero.  Not that we actually do anything different based on the
> specific non-zero value, but still we should implement the architected
> semantics.

Ah, being 2 bits wide, the above code suddenly makes more sense :). How about

/* WRC is a 2-bit field that is supposed to preserve its value once written to be non-zero */
spr_val &= ~TCR_WRC_MASK;
spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
kvmppc_set_tcr(vcpu, spr_val);


Alex


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-07-07  7:50       ` Alexander Graf
@ 2012-07-09  5:13         ` Bhushan Bharat-R65777
  -1 siblings, 0 replies; 29+ messages in thread
From: Bhushan Bharat-R65777 @ 2012-07-09  5:13 UTC (permalink / raw)
  To: Alexander Graf, Wood Scott-B07421; +Cc: kvm-ppc, kvm



> -----Original Message-----
> From: Alexander Graf [mailto:agraf@suse.de]
> Sent: Saturday, July 07, 2012 1:21 PM
> To: Wood Scott-B07421
> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; Bhushan
> Bharat-R65777
> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
> 
> 
> On 07.07.2012, at 01:37, Scott Wood wrote:
> 
> > On 07/06/2012 08:17 AM, Alexander Graf wrote:
> >> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
> >>> +/*
> >>> + * The timer system can almost deal with LONG_MAX timeouts, except that
> >>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
> >>> + *
> >>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
> >>> + * any realistic use.
> >>> + */
> >>> +#define MAX_TIMEOUT (LONG_MAX/2)
> >>
> >> Should this really be in kvm code?
> >
> > It looks like we can use NEXT_TIMER_MAX_DELTA for this.
> >
> >>> +	mask = 1ULL << (63 - period);
> >>> +	tb = get_tb();
> >>> +	if (tb & mask)
> >>> +		nr_jiffies += mask;
> >>
> >> To be honest, you lost me here. nr_jiffies is jiffies, right? While
> >> mask is basically in timebase granularity. I suppose you're just
> >> reusing the variable here for no good reason - the compiler will
> >> gladly optimize things for you if you write things a bit more verbose
> >> :).
> >
> > Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
> > something generic like "ticks", "interval", "remaining", etc. would be
> > better, with a comment on the do_div saying it's converting timebase
> > ticks into jiffies.
> 
> Well, you could start off with a variable "delta_tb", then do
> 
>   nr_jiffies = delta_tb;
>   x = do_div(...);
> 
> and things would suddenly become readable :). Of course I don't object to
> comments along the code either :).

Ok

> 
> >
> >>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
> >>> +{
> >>> +	unsigned long nr_jiffies;
> >>> +
> >>> +	nr_jiffies = watchdog_next_timeout(vcpu);
> >>> +	if (nr_jiffies < MAX_TIMEOUT)
> >>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
> >>> +	else
> >>> +		del_timer(&vcpu->arch.wdt_timer);
> >>
> >> Can you del a timer that's not armed? Could that ever happen in this case?
> >
> > "del_timer() deactivates a timer - this works on both active and
> > inactive timers."
> 
> Ah, good :).
> 
> >
> >> Also, could the timer possibly be running somewhere, so do we need
> del_timer_sync? Or don't we need to care?
> >
> > This can be called in the context of the timer, so del_timer_sync()
> > would hang.
> >
> > As for what would happen if a caller from a different context were to
> > race with a timer, I think you could end up with the timer armed based
> > on an old TCR.  del_timer_sync() won't help though, unless you replace
> > mod_timer() with del_timer_sync() plus add_timer() (with a check to see
> > whether it's running in timer context).  A better solution is probably
> > to use a spinlock in arm_next_watchdog().
> 
> Yup. Either way, we have a race that the guest might not expect.

Ok, will use spinlock in arm_next_watchdog().

> 
> >
> >>> +void kvmppc_watchdog_func(unsigned long data)
> >>> +{
> >>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> >>> +	u32 tsr, new_tsr;
> >>> +	int final;
> >>> +
> >>> +	do {
> >>> +		new_tsr = tsr = vcpu->arch.tsr;
> >>> +		final = 0;
> >>> +
> >>> +		/* Time out event */
> >>> +		if (tsr & TSR_ENW) {
> >>> +			if (tsr & TSR_WIS) {
> >>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
> >>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
> >>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
> >>> +				final = 1;
> >>> +			} else {
> >>> +				new_tsr = tsr | TSR_WIS;
> >>> +			}
> >>> +		} else {
> >>> +			new_tsr = tsr | TSR_ENW;
> >>> +		}
> >>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
> >>> +
> >>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
> >>> +		smp_wmb();
> >>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
> >>> +		kvm_vcpu_kick(vcpu);
> >>> +	}
> >>> +
> >>> +	/*
> >>> +	 * Avoid getting a storm of timers if the guest sets
> >>> +	 * the period very short.  We'll restart it if anything
> >>> +	 * changes.
> >>> +	 */
> >>> +	if (!final)
> >>> +		arm_next_watchdog(vcpu);
> >>
> >> Mind to explain this part a bit further?
> >
> > The whole function, or some subset near the end?
> >
> > The "if (!final)" check means that we stop running the timer after final
> > expiration, to prevent the host from being flooded with timers if the
> > guest sets a short period but does not have TCR set to exit to QEMU.
> > Timers will resume the next time TSR/TCR is updated.
> 
> Ah. The semantics make sense. The comment however is slightly too short. Please
> explain this in a more verbose way, so someone who didn't write the code knows
> what's going on :).

Ok.

> 
> >
> >>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
> >>> 	}
> >>>
> >>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
> >>> +		u32 old_tsr = vcpu->arch.tsr;
> >>> +
> >>> 		vcpu->arch.tsr = sregs->u.e.tsr;
> >>> +
> >>> +		if ((old_tsr ^ vcpu->arch.tsr) &
> >>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
> >>> +			arm_next_watchdog(vcpu);
> >>
> >> Why isn't this one guarded by vcpu->arch.watchdog_enable?
> >
> > I'm not sure that any of them should be -- there's no reason for the
> > watchdog interrupt mechanism to be dependent on QEMU, only the
> > heavyweight exit on final expiration.
> 
> Well - I like the concept of having new features switchable. Overlapping the
> "watchdog is implemented" feature with "user space wants watchdog exits" makes
> sense. But I definitely want to have a switch for the former, because we
> otherwise differ quite substantially from the emulation we had before.

Ok, will guard with watchdog_enable.

> 
> >
> >>> +			spr_val &= ~TCR_WRC_MASK;
> >>> +		kvmppc_set_tcr(vcpu,
> >>> +		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
> >>
> >> In fact, what you're trying to do here is keep TCR_WRC always on when it was
> enabled once. So all you need is the OR here. No need for the mask above.
> >
> > WRC is a 2-bit field that is supposed to preserve its value once written
> > to be non-zero.  Not that we actually do anything different based on the
> > specific non-zero value, but still we should implement the architected
> > semantics.
> 
> Ah, being 2 bits wide, the above code suddenly makes more sense :). How about
> 
> /* WRC is a 2-bit field that is supposed to preserve its value once written to
> be non-zero */
> spr_val &= ~TCR_WRC_MASK;
> spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> kvmppc_set_tcr(vcpu, spr_val);

I think you mean:

if (TCR_WRC_MASK & vcpu->arch.tcr) {
    spr_val &= ~TCR_WRC_MASK;
    spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
}
kvmppc_set_tcr(vcpu, spr_val);

Thanks
-Bharat

> 
> 
> Alex
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-09  5:13         ` Bhushan Bharat-R65777
  0 siblings, 0 replies; 29+ messages in thread
From: Bhushan Bharat-R65777 @ 2012-07-09  5:13 UTC (permalink / raw)
  To: Alexander Graf, Wood Scott-B07421; +Cc: kvm-ppc, kvm



> -----Original Message-----
> From: Alexander Graf [mailto:agraf@suse.de]
> Sent: Saturday, July 07, 2012 1:21 PM
> To: Wood Scott-B07421
> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; Bhushan
> Bharat-R65777
> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
> 
> 
> On 07.07.2012, at 01:37, Scott Wood wrote:
> 
> > On 07/06/2012 08:17 AM, Alexander Graf wrote:
> >> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
> >>> +/*
> >>> + * The timer system can almost deal with LONG_MAX timeouts, except that
> >>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
> >>> + *
> >>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
> >>> + * any realistic use.
> >>> + */
> >>> +#define MAX_TIMEOUT (LONG_MAX/2)
> >>
> >> Should this really be in kvm code?
> >
> > It looks like we can use NEXT_TIMER_MAX_DELTA for this.
> >
> >>> +	mask = 1ULL << (63 - period);
> >>> +	tb = get_tb();
> >>> +	if (tb & mask)
> >>> +		nr_jiffies += mask;
> >>
> >> To be honest, you lost me here. nr_jiffies is jiffies, right? While
> >> mask is basically in timebase granularity. I suppose you're just
> >> reusing the variable here for no good reason - the compiler will
> >> gladly optimize things for you if you write things a bit more verbose
> >> :).
> >
> > Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
> > something generic like "ticks", "interval", "remaining", etc. would be
> > better, with a comment on the do_div saying it's converting timebase
> > ticks into jiffies.
> 
> Well, you could start off with a variable "delta_tb", then do
> 
>   nr_jiffies = delta_tb;
>   x = do_div(...);
> 
> and things would suddenly become readable :). Of course I don't object to
> comments along the code either :).

Ok

> 
> >
> >>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
> >>> +{
> >>> +	unsigned long nr_jiffies;
> >>> +
> >>> +	nr_jiffies = watchdog_next_timeout(vcpu);
> >>> +	if (nr_jiffies < MAX_TIMEOUT)
> >>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
> >>> +	else
> >>> +		del_timer(&vcpu->arch.wdt_timer);
> >>
> >> Can you del a timer that's not armed? Could that ever happen in this case?
> >
> > "del_timer() deactivates a timer - this works on both active and
> > inactive timers."
> 
> Ah, good :).
> 
> >
> >> Also, could the timer possibly be running somewhere, so do we need
> del_timer_sync? Or don't we need to care?
> >
> > This can be called in the context of the timer, so del_timer_sync()
> > would hang.
> >
> > As for what would happen if a caller from a different context were to
> > race with a timer, I think you could end up with the timer armed based
> > on an old TCR.  del_timer_sync() won't help though, unless you replace
> > mod_timer() with del_timer_sync() plus add_timer() (with a check to see
> > whether it's running in timer context).  A better solution is probably
> > to use a spinlock in arm_next_watchdog().
> 
> Yup. Either way, we have a race that the guest might not expect.

Ok, will use spinlock in arm_next_watchdog().

> 
> >
> >>> +void kvmppc_watchdog_func(unsigned long data)
> >>> +{
> >>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> >>> +	u32 tsr, new_tsr;
> >>> +	int final;
> >>> +
> >>> +	do {
> >>> +		new_tsr = tsr = vcpu->arch.tsr;
> >>> +		final = 0;
> >>> +
> >>> +		/* Time out event */
> >>> +		if (tsr & TSR_ENW) {
> >>> +			if (tsr & TSR_WIS) {
> >>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
> >>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
> >>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
> >>> +				final = 1;
> >>> +			} else {
> >>> +				new_tsr = tsr | TSR_WIS;
> >>> +			}
> >>> +		} else {
> >>> +			new_tsr = tsr | TSR_ENW;
> >>> +		}
> >>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
> >>> +
> >>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
> >>> +		smp_wmb();
> >>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
> >>> +		kvm_vcpu_kick(vcpu);
> >>> +	}
> >>> +
> >>> +	/*
> >>> +	 * Avoid getting a storm of timers if the guest sets
> >>> +	 * the period very short.  We'll restart it if anything
> >>> +	 * changes.
> >>> +	 */
> >>> +	if (!final)
> >>> +		arm_next_watchdog(vcpu);
> >>
> >> Mind to explain this part a bit further?
> >
> > The whole function, or some subset near the end?
> >
> > The "if (!final)" check means that we stop running the timer after final
> > expiration, to prevent the host from being flooded with timers if the
> > guest sets a short period but does not have TCR set to exit to QEMU.
> > Timers will resume the next time TSR/TCR is updated.
> 
> Ah. The semantics make sense. The comment however is slightly too short. Please
> explain this in a more verbose way, so someone who didn't write the code knows
> what's going on :).

Ok.

> 
> >
> >>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
> >>> 	}
> >>>
> >>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
> >>> +		u32 old_tsr = vcpu->arch.tsr;
> >>> +
> >>> 		vcpu->arch.tsr = sregs->u.e.tsr;
> >>> +
> >>> +		if ((old_tsr ^ vcpu->arch.tsr) &
> >>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
> >>> +			arm_next_watchdog(vcpu);
> >>
> >> Why isn't this one guarded by vcpu->arch.watchdog_enable?
> >
> > I'm not sure that any of them should be -- there's no reason for the
> > watchdog interrupt mechanism to be dependent on QEMU, only the
> > heavyweight exit on final expiration.
> 
> Well - I like the concept of having new features switchable. Overlapping the
> "watchdog is implemented" feature with "user space wants watchdog exits" makes
> sense. But I definitely want to have a switch for the former, because we
> otherwise differ quite substantially from the emulation we had before.

Ok, will guard with watchdog_enable.

> 
> >
> >>> +			spr_val &= ~TCR_WRC_MASK;
> >>> +		kvmppc_set_tcr(vcpu,
> >>> +		               spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
> >>
> >> In fact, what you're trying to do here is keep TCR_WRC always on when it was
> enabled once. So all you need is the OR here. No need for the mask above.
> >
> > WRC is a 2-bit field that is supposed to preserve its value once written
> > to be non-zero.  Not that we actually do anything different based on the
> > specific non-zero value, but still we should implement the architected
> > semantics.
> 
> Ah, being 2 bits wide, the above code suddenly makes more sense :). How about
> 
> /* WRC is a 2-bit field that is supposed to preserve its value once written to
> be non-zero */
> spr_val &= ~TCR_WRC_MASK;
> spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> kvmppc_set_tcr(vcpu, spr_val);

I think you mean:

if (TCR_WRC_MASK & vcpu->arch.tcr) {
    spr_val &= ~TCR_WRC_MASK;
    spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
}
kvmppc_set_tcr(vcpu, spr_val);

Thanks
-Bharat

> 
> 
> Alex
> 



^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-07-06 13:17   ` Alexander Graf
@ 2012-07-09  6:43     ` Bhushan Bharat-R65777
  -1 siblings, 0 replies; 29+ messages in thread
From: Bhushan Bharat-R65777 @ 2012-07-09  6:43 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm-ppc, kvm

> > +
> > +/*
> > + * Return the number of jiffies until the next timeout.  If the
> > +timeout is
> > + * longer than the MAX_TIMEOUT, that we return MAX_TIMEOUT instead.
> > + */
> > +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) {
> > +	unsigned long long tb, mask, nr_jiffies = 0;
> 
> u64?
> 
> > +	u32 period = TCR_GET_FSL_WP(vcpu->arch.tcr);
> 
> Doesn't sound like something booke generic to me.

the name '*FSL*' does not look good, right?

> 
> > +#ifdef CONFIG_BOOKE
> > +	ret = ret || (v->arch.tsr & TCR_WRC_MASK);
> 
> Please make this a callback. In a header if you think it's performance critical,
> but I don't want to see #ifdef CONFIG_BOOKE too often in powerpc.c.

Not sure: do you mean something like this:

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index f0e0c6a..7bbc6cd 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -446,6 +446,11 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 }
 #endif
 
+static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3                        0x113724FA
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index b7cd335..e5b86c1 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -100,4 +100,9 @@ static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.shared->msr;
 }
+
+static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.tsr & TCR_WRC_MASK;
+}


Thanks
-Bharat

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* RE: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-09  6:43     ` Bhushan Bharat-R65777
  0 siblings, 0 replies; 29+ messages in thread
From: Bhushan Bharat-R65777 @ 2012-07-09  6:43 UTC (permalink / raw)
  To: Alexander Graf; +Cc: kvm-ppc, kvm

> > +
> > +/*
> > + * Return the number of jiffies until the next timeout.  If the
> > +timeout is
> > + * longer than the MAX_TIMEOUT, that we return MAX_TIMEOUT instead.
> > + */
> > +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) {
> > +	unsigned long long tb, mask, nr_jiffies = 0;
> 
> u64?
> 
> > +	u32 period = TCR_GET_FSL_WP(vcpu->arch.tcr);
> 
> Doesn't sound like something booke generic to me.

the name '*FSL*' does not look good, right?

> 
> > +#ifdef CONFIG_BOOKE
> > +	ret = ret || (v->arch.tsr & TCR_WRC_MASK);
> 
> Please make this a callback. In a header if you think it's performance critical,
> but I don't want to see #ifdef CONFIG_BOOKE too often in powerpc.c.

Not sure: do you mean something like this:

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index f0e0c6a..7bbc6cd 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -446,6 +446,11 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 }
 #endif
 
+static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3                        0x113724FA
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index b7cd335..e5b86c1 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -100,4 +100,9 @@ static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.shared->msr;
 }
+
+static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.tsr & TCR_WRC_MASK;
+}


Thanks
-Bharat


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-07-09  5:13         ` Bhushan Bharat-R65777
@ 2012-07-09  8:49           ` Alexander Graf
  -1 siblings, 0 replies; 29+ messages in thread
From: Alexander Graf @ 2012-07-09  8:49 UTC (permalink / raw)
  To: Bhushan Bharat-R65777; +Cc: Wood Scott-B07421, kvm-ppc, kvm



On 09.07.2012, at 07:13, Bhushan Bharat-R65777 <R65777@freescale.com> wrote:

> 
> 
>> -----Original Message-----
>> From: Alexander Graf [mailto:agraf@suse.de]
>> Sent: Saturday, July 07, 2012 1:21 PM
>> To: Wood Scott-B07421
>> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; Bhushan
>> Bharat-R65777
>> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
>> 
>> 
>> On 07.07.2012, at 01:37, Scott Wood wrote:
>> 
>>> On 07/06/2012 08:17 AM, Alexander Graf wrote:
>>>> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>>>>> +/*
>>>>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>>>>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>>>>> + *
>>>>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>>>>> + * any realistic use.
>>>>> + */
>>>>> +#define MAX_TIMEOUT (LONG_MAX/2)
>>>> 
>>>> Should this really be in kvm code?
>>> 
>>> It looks like we can use NEXT_TIMER_MAX_DELTA for this.
>>> 
>>>>> +    mask = 1ULL << (63 - period);
>>>>> +    tb = get_tb();
>>>>> +    if (tb & mask)
>>>>> +        nr_jiffies += mask;
>>>> 
>>>> To be honest, you lost me here. nr_jiffies is jiffies, right? While
>>>> mask is basically in timebase granularity. I suppose you're just
>>>> reusing the variable here for no good reason - the compiler will
>>>> gladly optimize things for you if you write things a bit more verbose
>>>> :).
>>> 
>>> Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
>>> something generic like "ticks", "interval", "remaining", etc. would be
>>> better, with a comment on the do_div saying it's converting timebase
>>> ticks into jiffies.
>> 
>> Well, you could start off with a variable "delta_tb", then do
>> 
>>  nr_jiffies = delta_tb;
>>  x = do_div(...);
>> 
>> and things would suddenly become readable :). Of course I don't object to
>> comments along the code either :).
> 
> Ok
> 
>> 
>>> 
>>>>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +    unsigned long nr_jiffies;
>>>>> +
>>>>> +    nr_jiffies = watchdog_next_timeout(vcpu);
>>>>> +    if (nr_jiffies < MAX_TIMEOUT)
>>>>> +        mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>>>>> +    else
>>>>> +        del_timer(&vcpu->arch.wdt_timer);
>>>> 
>>>> Can you del a timer that's not armed? Could that ever happen in this case?
>>> 
>>> "del_timer() deactivates a timer - this works on both active and
>>> inactive timers."
>> 
>> Ah, good :).
>> 
>>> 
>>>> Also, could the timer possibly be running somewhere, so do we need
>> del_timer_sync? Or don't we need to care?
>>> 
>>> This can be called in the context of the timer, so del_timer_sync()
>>> would hang.
>>> 
>>> As for what would happen if a caller from a different context were to
>>> race with a timer, I think you could end up with the timer armed based
>>> on an old TCR.  del_timer_sync() won't help though, unless you replace
>>> mod_timer() with del_timer_sync() plus add_timer() (with a check to see
>>> whether it's running in timer context).  A better solution is probably
>>> to use a spinlock in arm_next_watchdog().
>> 
>> Yup. Either way, we have a race that the guest might not expect.
> 
> Ok, will use spinlock in arm_next_watchdog().
> 
>> 
>>> 
>>>>> +void kvmppc_watchdog_func(unsigned long data)
>>>>> +{
>>>>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>>>> +    u32 tsr, new_tsr;
>>>>> +    int final;
>>>>> +
>>>>> +    do {
>>>>> +        new_tsr = tsr = vcpu->arch.tsr;
>>>>> +        final = 0;
>>>>> +
>>>>> +        /* Time out event */
>>>>> +        if (tsr & TSR_ENW) {
>>>>> +            if (tsr & TSR_WIS) {
>>>>> +                new_tsr = (tsr & ~TCR_WRC_MASK) |
>>>>> +                      (vcpu->arch.tcr & TCR_WRC_MASK);
>>>>> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>>>> +                final = 1;
>>>>> +            } else {
>>>>> +                new_tsr = tsr | TSR_WIS;
>>>>> +            }
>>>>> +        } else {
>>>>> +            new_tsr = tsr | TSR_ENW;
>>>>> +        }
>>>>> +    } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>>>>> +
>>>>> +    if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>>>>> +        smp_wmb();
>>>>> +        kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>>>>> +        kvm_vcpu_kick(vcpu);
>>>>> +    }
>>>>> +
>>>>> +    /*
>>>>> +     * Avoid getting a storm of timers if the guest sets
>>>>> +     * the period very short.  We'll restart it if anything
>>>>> +     * changes.
>>>>> +     */
>>>>> +    if (!final)
>>>>> +        arm_next_watchdog(vcpu);
>>>> 
>>>> Mind to explain this part a bit further?
>>> 
>>> The whole function, or some subset near the end?
>>> 
>>> The "if (!final)" check means that we stop running the timer after final
>>> expiration, to prevent the host from being flooded with timers if the
>>> guest sets a short period but does not have TCR set to exit to QEMU.
>>> Timers will resume the next time TSR/TCR is updated.
>> 
>> Ah. The semantics make sense. The comment however is slightly too short. Please
>> explain this in a more verbose way, so someone who didn't write the code knows
>> what's going on :).
> 
> Ok.
> 
>> 
>>> 
>>>>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>>>>    }
>>>>> 
>>>>>    if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>>>>> +        u32 old_tsr = vcpu->arch.tsr;
>>>>> +
>>>>>        vcpu->arch.tsr = sregs->u.e.tsr;
>>>>> +
>>>>> +        if ((old_tsr ^ vcpu->arch.tsr) &
>>>>> +            (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>>>>> +            arm_next_watchdog(vcpu);
>>>> 
>>>> Why isn't this one guarded by vcpu->arch.watchdog_enable?
>>> 
>>> I'm not sure that any of them should be -- there's no reason for the
>>> watchdog interrupt mechanism to be dependent on QEMU, only the
>>> heavyweight exit on final expiration.
>> 
>> Well - I like the concept of having new features switchable. Overlapping the
>> "watchdog is implemented" feature with "user space wants watchdog exits" makes
>> sense. But I definitely want to have a switch for the former, because we
>> otherwise differ quite substantially from the emulation we had before.
> 
> Ok, will guard with watchdog_enable.
> 
>> 
>>> 
>>>>> +            spr_val &= ~TCR_WRC_MASK;
>>>>> +        kvmppc_set_tcr(vcpu,
>>>>> +                       spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
>>>> 
>>>> In fact, what you're trying to do here is keep TCR_WRC always on when it was
>> enabled once. So all you need is the OR here. No need for the mask above.
>>> 
>>> WRC is a 2-bit field that is supposed to preserve its value once written
>>> to be non-zero.  Not that we actually do anything different based on the
>>> specific non-zero value, but still we should implement the architected
>>> semantics.
>> 
>> Ah, being 2 bits wide, the above code suddenly makes more sense :). How about
>> 
>> /* WRC is a 2-bit field that is supposed to preserve its value once written to
>> be non-zero */
>> spr_val &= ~TCR_WRC_MASK;
>> spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
>> kvmppc_set_tcr(vcpu, spr_val);
> 
> I think you mean:
> 
> if (TCR_WRC_MASK & vcpu->arch.tcr) {
>    spr_val &= ~TCR_WRC_MASK;
>    spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> }
> kvmppc_set_tcr(vcpu, spr_val);

Eh, yes, of course :). Plus the comment.

Alex

> 
> Thanks
> -Bharat
> 
>> 
>> 
>> Alex
>> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-09  8:49           ` Alexander Graf
  0 siblings, 0 replies; 29+ messages in thread
From: Alexander Graf @ 2012-07-09  8:49 UTC (permalink / raw)
  To: Bhushan Bharat-R65777; +Cc: Wood Scott-B07421, kvm-ppc, kvm



On 09.07.2012, at 07:13, Bhushan Bharat-R65777 <R65777@freescale.com> wrote:

> 
> 
>> -----Original Message-----
>> From: Alexander Graf [mailto:agraf@suse.de]
>> Sent: Saturday, July 07, 2012 1:21 PM
>> To: Wood Scott-B07421
>> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; Bhushan
>> Bharat-R65777
>> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
>> 
>> 
>> On 07.07.2012, at 01:37, Scott Wood wrote:
>> 
>>> On 07/06/2012 08:17 AM, Alexander Graf wrote:
>>>> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>>>>> +/*
>>>>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>>>>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>>>>> + *
>>>>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>>>>> + * any realistic use.
>>>>> + */
>>>>> +#define MAX_TIMEOUT (LONG_MAX/2)
>>>> 
>>>> Should this really be in kvm code?
>>> 
>>> It looks like we can use NEXT_TIMER_MAX_DELTA for this.
>>> 
>>>>> +    mask = 1ULL << (63 - period);
>>>>> +    tb = get_tb();
>>>>> +    if (tb & mask)
>>>>> +        nr_jiffies += mask;
>>>> 
>>>> To be honest, you lost me here. nr_jiffies is jiffies, right? While
>>>> mask is basically in timebase granularity. I suppose you're just
>>>> reusing the variable here for no good reason - the compiler will
>>>> gladly optimize things for you if you write things a bit more verbose
>>>> :).
>>> 
>>> Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
>>> something generic like "ticks", "interval", "remaining", etc. would be
>>> better, with a comment on the do_div saying it's converting timebase
>>> ticks into jiffies.
>> 
>> Well, you could start off with a variable "delta_tb", then do
>> 
>>  nr_jiffies = delta_tb;
>>  x = do_div(...);
>> 
>> and things would suddenly become readable :). Of course I don't object to
>> comments along the code either :).
> 
> Ok
> 
>> 
>>> 
>>>>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +    unsigned long nr_jiffies;
>>>>> +
>>>>> +    nr_jiffies = watchdog_next_timeout(vcpu);
>>>>> +    if (nr_jiffies < MAX_TIMEOUT)
>>>>> +        mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>>>>> +    else
>>>>> +        del_timer(&vcpu->arch.wdt_timer);
>>>> 
>>>> Can you del a timer that's not armed? Could that ever happen in this case?
>>> 
>>> "del_timer() deactivates a timer - this works on both active and
>>> inactive timers."
>> 
>> Ah, good :).
>> 
>>> 
>>>> Also, could the timer possibly be running somewhere, so do we need
>> del_timer_sync? Or don't we need to care?
>>> 
>>> This can be called in the context of the timer, so del_timer_sync()
>>> would hang.
>>> 
>>> As for what would happen if a caller from a different context were to
>>> race with a timer, I think you could end up with the timer armed based
>>> on an old TCR.  del_timer_sync() won't help though, unless you replace
>>> mod_timer() with del_timer_sync() plus add_timer() (with a check to see
>>> whether it's running in timer context).  A better solution is probably
>>> to use a spinlock in arm_next_watchdog().
>> 
>> Yup. Either way, we have a race that the guest might not expect.
> 
> Ok, will use spinlock in arm_next_watchdog().
> 
>> 
>>> 
>>>>> +void kvmppc_watchdog_func(unsigned long data)
>>>>> +{
>>>>> +    struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>>>> +    u32 tsr, new_tsr;
>>>>> +    int final;
>>>>> +
>>>>> +    do {
>>>>> +        new_tsr = tsr = vcpu->arch.tsr;
>>>>> +        final = 0;
>>>>> +
>>>>> +        /* Time out event */
>>>>> +        if (tsr & TSR_ENW) {
>>>>> +            if (tsr & TSR_WIS) {
>>>>> +                new_tsr = (tsr & ~TCR_WRC_MASK) |
>>>>> +                      (vcpu->arch.tcr & TCR_WRC_MASK);
>>>>> +                vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>>>> +                final = 1;
>>>>> +            } else {
>>>>> +                new_tsr = tsr | TSR_WIS;
>>>>> +            }
>>>>> +        } else {
>>>>> +            new_tsr = tsr | TSR_ENW;
>>>>> +        }
>>>>> +    } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>>>>> +
>>>>> +    if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>>>>> +        smp_wmb();
>>>>> +        kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>>>>> +        kvm_vcpu_kick(vcpu);
>>>>> +    }
>>>>> +
>>>>> +    /*
>>>>> +     * Avoid getting a storm of timers if the guest sets
>>>>> +     * the period very short.  We'll restart it if anything
>>>>> +     * changes.
>>>>> +     */
>>>>> +    if (!final)
>>>>> +        arm_next_watchdog(vcpu);
>>>> 
>>>> Mind to explain this part a bit further?
>>> 
>>> The whole function, or some subset near the end?
>>> 
>>> The "if (!final)" check means that we stop running the timer after final
>>> expiration, to prevent the host from being flooded with timers if the
>>> guest sets a short period but does not have TCR set to exit to QEMU.
>>> Timers will resume the next time TSR/TCR is updated.
>> 
>> Ah. The semantics make sense. The comment however is slightly too short. Please
>> explain this in a more verbose way, so someone who didn't write the code knows
>> what's going on :).
> 
> Ok.
> 
>> 
>>> 
>>>>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>>>>    }
>>>>> 
>>>>>    if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>>>>> +        u32 old_tsr = vcpu->arch.tsr;
>>>>> +
>>>>>        vcpu->arch.tsr = sregs->u.e.tsr;
>>>>> +
>>>>> +        if ((old_tsr ^ vcpu->arch.tsr) &
>>>>> +            (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>>>>> +            arm_next_watchdog(vcpu);
>>>> 
>>>> Why isn't this one guarded by vcpu->arch.watchdog_enable?
>>> 
>>> I'm not sure that any of them should be -- there's no reason for the
>>> watchdog interrupt mechanism to be dependent on QEMU, only the
>>> heavyweight exit on final expiration.
>> 
>> Well - I like the concept of having new features switchable. Overlapping the
>> "watchdog is implemented" feature with "user space wants watchdog exits" makes
>> sense. But I definitely want to have a switch for the former, because we
>> otherwise differ quite substantially from the emulation we had before.
> 
> Ok, will guard with watchdog_enable.
> 
>> 
>>> 
>>>>> +            spr_val &= ~TCR_WRC_MASK;
>>>>> +        kvmppc_set_tcr(vcpu,
>>>>> +                       spr_val | (TCR_WRC_MASK & vcpu->arch.tcr));
>>>> 
>>>> In fact, what you're trying to do here is keep TCR_WRC always on when it was
>> enabled once. So all you need is the OR here. No need for the mask above.
>>> 
>>> WRC is a 2-bit field that is supposed to preserve its value once written
>>> to be non-zero.  Not that we actually do anything different based on the
>>> specific non-zero value, but still we should implement the architected
>>> semantics.
>> 
>> Ah, being 2 bits wide, the above code suddenly makes more sense :). How about
>> 
>> /* WRC is a 2-bit field that is supposed to preserve its value once written to
>> be non-zero */
>> spr_val &= ~TCR_WRC_MASK;
>> spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
>> kvmppc_set_tcr(vcpu, spr_val);
> 
> I think you mean:
> 
> if (TCR_WRC_MASK & vcpu->arch.tcr) {
>    spr_val &= ~TCR_WRC_MASK;
>    spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> }
> kvmppc_set_tcr(vcpu, spr_val);

Eh, yes, of course :). Plus the comment.

Alex

> 
> Thanks
> -Bharat
> 
>> 
>> 
>> Alex
>> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-07-09  6:43     ` Bhushan Bharat-R65777
@ 2012-07-09  9:11       ` Alexander Graf
  -1 siblings, 0 replies; 29+ messages in thread
From: Alexander Graf @ 2012-07-09  9:11 UTC (permalink / raw)
  To: Bhushan Bharat-R65777; +Cc: kvm-ppc, kvm


On 09.07.2012, at 08:43, Bhushan Bharat-R65777 wrote:

>>> +
>>> +/*
>>> + * Return the number of jiffies until the next timeout.  If the
>>> +timeout is
>>> + * longer than the MAX_TIMEOUT, that we return MAX_TIMEOUT instead.
>>> + */
>>> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) {
>>> +	unsigned long long tb, mask, nr_jiffies = 0;
>> 
>> u64?
>> 
>>> +	u32 period = TCR_GET_FSL_WP(vcpu->arch.tcr);
>> 
>> Doesn't sound like something booke generic to me.
> 
> the name '*FSL*' does not look good, right?

It usually indicates that it's not booke generic :).

> 
>> 
>>> +#ifdef CONFIG_BOOKE
>>> +	ret = ret || (v->arch.tsr & TCR_WRC_MASK);
>> 
>> Please make this a callback. In a header if you think it's performance critical,
>> but I don't want to see #ifdef CONFIG_BOOKE too often in powerpc.c.
> 
> Not sure: do you mean something like this:
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index f0e0c6a..7bbc6cd 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -446,6 +446,11 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
> }
> #endif
> 
> +static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
> +{
> +       return 0;
> +}
> +
> /* Magic register values loaded into r3 and r4 before the 'sc' assembly
>  * instruction for the OSI hypercalls */
> #define OSI_SC_MAGIC_R3                        0x113724FA
> diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
> index b7cd335..e5b86c1 100644
> --- a/arch/powerpc/include/asm/kvm_booke.h
> +++ b/arch/powerpc/include/asm/kvm_booke.h
> @@ -100,4 +100,9 @@ static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
> {
>        return vcpu->arch.shared->msr;
> }
> +
> +static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
> +{
> +       return vcpu->arch.tsr & TCR_WRC_MASK;
> +}

No, I was more thinking along the lines of e500/440 here. Also it should simply be a generic callback, like

ret = kvmppc_core_check_runnable(vcpu, ret);

So that if 440 wants to do something special about modifying the runnable semantics, it can easily do so. Or we just move the whole function to core specific code.


Alex

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-09  9:11       ` Alexander Graf
  0 siblings, 0 replies; 29+ messages in thread
From: Alexander Graf @ 2012-07-09  9:11 UTC (permalink / raw)
  To: Bhushan Bharat-R65777; +Cc: kvm-ppc, kvm


On 09.07.2012, at 08:43, Bhushan Bharat-R65777 wrote:

>>> +
>>> +/*
>>> + * Return the number of jiffies until the next timeout.  If the
>>> +timeout is
>>> + * longer than the MAX_TIMEOUT, that we return MAX_TIMEOUT instead.
>>> + */
>>> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) {
>>> +	unsigned long long tb, mask, nr_jiffies = 0;
>> 
>> u64?
>> 
>>> +	u32 period = TCR_GET_FSL_WP(vcpu->arch.tcr);
>> 
>> Doesn't sound like something booke generic to me.
> 
> the name '*FSL*' does not look good, right?

It usually indicates that it's not booke generic :).

> 
>> 
>>> +#ifdef CONFIG_BOOKE
>>> +	ret = ret || (v->arch.tsr & TCR_WRC_MASK);
>> 
>> Please make this a callback. In a header if you think it's performance critical,
>> but I don't want to see #ifdef CONFIG_BOOKE too often in powerpc.c.
> 
> Not sure: do you mean something like this:
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index f0e0c6a..7bbc6cd 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -446,6 +446,11 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
> }
> #endif
> 
> +static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
> +{
> +       return 0;
> +}
> +
> /* Magic register values loaded into r3 and r4 before the 'sc' assembly
>  * instruction for the OSI hypercalls */
> #define OSI_SC_MAGIC_R3                        0x113724FA
> diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
> index b7cd335..e5b86c1 100644
> --- a/arch/powerpc/include/asm/kvm_booke.h
> +++ b/arch/powerpc/include/asm/kvm_booke.h
> @@ -100,4 +100,9 @@ static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
> {
>        return vcpu->arch.shared->msr;
> }
> +
> +static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
> +{
> +       return vcpu->arch.tsr & TCR_WRC_MASK;
> +}

No, I was more thinking along the lines of e500/440 here. Also it should simply be a generic callback, like

ret = kvmppc_core_check_runnable(vcpu, ret);

So that if 440 wants to do something special about modifying the runnable semantics, it can easily do so. Or we just move the whole function to core specific code.


Alex


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-07-09  5:13         ` Bhushan Bharat-R65777
@ 2012-07-09 14:28           ` Scott Wood
  -1 siblings, 0 replies; 29+ messages in thread
From: Scott Wood @ 2012-07-09 14:28 UTC (permalink / raw)
  To: Bhushan Bharat-R65777; +Cc: Alexander Graf, Wood Scott-B07421, kvm-ppc, kvm

On 07/09/2012 12:13 AM, Bhushan Bharat-R65777 wrote:
> 
> 
>> -----Original Message-----
>> From: Alexander Graf [mailto:agraf@suse.de]
>> Sent: Saturday, July 07, 2012 1:21 PM
>> To: Wood Scott-B07421
>> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; Bhushan
>> Bharat-R65777
>> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
>>
>>
>> Ah, being 2 bits wide, the above code suddenly makes more sense :). How about
>>
>> /* WRC is a 2-bit field that is supposed to preserve its value once written to
>> be non-zero */
>> spr_val &= ~TCR_WRC_MASK;
>> spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
>> kvmppc_set_tcr(vcpu, spr_val);
> 
> I think you mean:
> 
> if (TCR_WRC_MASK & vcpu->arch.tcr) {
>     spr_val &= ~TCR_WRC_MASK;
>     spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> }
> kvmppc_set_tcr(vcpu, spr_val);

Actually I think he means:

if (vcpu->arch.tcr & TCR_WRC_MASK) {
	spr_val &= ~TCR_WRC_MASK;
	spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
}

kvmppc_set_tcr(vcpu, spr_val);

:-)

-Scott

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-09 14:28           ` Scott Wood
  0 siblings, 0 replies; 29+ messages in thread
From: Scott Wood @ 2012-07-09 14:28 UTC (permalink / raw)
  To: Bhushan Bharat-R65777; +Cc: Alexander Graf, Wood Scott-B07421, kvm-ppc, kvm

On 07/09/2012 12:13 AM, Bhushan Bharat-R65777 wrote:
> 
> 
>> -----Original Message-----
>> From: Alexander Graf [mailto:agraf@suse.de]
>> Sent: Saturday, July 07, 2012 1:21 PM
>> To: Wood Scott-B07421
>> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; Bhushan
>> Bharat-R65777
>> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
>>
>>
>> Ah, being 2 bits wide, the above code suddenly makes more sense :). How about
>>
>> /* WRC is a 2-bit field that is supposed to preserve its value once written to
>> be non-zero */
>> spr_val &= ~TCR_WRC_MASK;
>> spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
>> kvmppc_set_tcr(vcpu, spr_val);
> 
> I think you mean:
> 
> if (TCR_WRC_MASK & vcpu->arch.tcr) {
>     spr_val &= ~TCR_WRC_MASK;
>     spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> }
> kvmppc_set_tcr(vcpu, spr_val);

Actually I think he means:

if (vcpu->arch.tcr & TCR_WRC_MASK) {
	spr_val &= ~TCR_WRC_MASK;
	spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
}

kvmppc_set_tcr(vcpu, spr_val);

:-)

-Scott


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-07-09 14:28           ` Scott Wood
@ 2012-07-09 14:36             ` Alexander Graf
  -1 siblings, 0 replies; 29+ messages in thread
From: Alexander Graf @ 2012-07-09 14:36 UTC (permalink / raw)
  To: Scott Wood; +Cc: Bhushan Bharat-R65777, Wood Scott-B07421, kvm-ppc, kvm


On 09.07.2012, at 16:28, Scott Wood wrote:

> On 07/09/2012 12:13 AM, Bhushan Bharat-R65777 wrote:
>> 
>> 
>>> -----Original Message-----
>>> From: Alexander Graf [mailto:agraf@suse.de]
>>> Sent: Saturday, July 07, 2012 1:21 PM
>>> To: Wood Scott-B07421
>>> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; Bhushan
>>> Bharat-R65777
>>> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
>>> 
>>> 
>>> Ah, being 2 bits wide, the above code suddenly makes more sense :). How about
>>> 
>>> /* WRC is a 2-bit field that is supposed to preserve its value once written to
>>> be non-zero */
>>> spr_val &= ~TCR_WRC_MASK;
>>> spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
>>> kvmppc_set_tcr(vcpu, spr_val);
>> 
>> I think you mean:
>> 
>> if (TCR_WRC_MASK & vcpu->arch.tcr) {
>>    spr_val &= ~TCR_WRC_MASK;
>>    spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
>> }
>> kvmppc_set_tcr(vcpu, spr_val);
> 
> Actually I think he means:
> 
> if (vcpu->arch.tcr & TCR_WRC_MASK) {
> 	spr_val &= ~TCR_WRC_MASK;
> 	spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> }
> 
> kvmppc_set_tcr(vcpu, spr_val);

Plus the comment, yes :). I really don't like (mask & val) constructs. About as much as I dislike if (0 == x) ones.


Alex

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-09 14:36             ` Alexander Graf
  0 siblings, 0 replies; 29+ messages in thread
From: Alexander Graf @ 2012-07-09 14:36 UTC (permalink / raw)
  To: Scott Wood; +Cc: Bhushan Bharat-R65777, Wood Scott-B07421, kvm-ppc, kvm


On 09.07.2012, at 16:28, Scott Wood wrote:

> On 07/09/2012 12:13 AM, Bhushan Bharat-R65777 wrote:
>> 
>> 
>>> -----Original Message-----
>>> From: Alexander Graf [mailto:agraf@suse.de]
>>> Sent: Saturday, July 07, 2012 1:21 PM
>>> To: Wood Scott-B07421
>>> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; Bhushan
>>> Bharat-R65777
>>> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
>>> 
>>> 
>>> Ah, being 2 bits wide, the above code suddenly makes more sense :). How about
>>> 
>>> /* WRC is a 2-bit field that is supposed to preserve its value once written to
>>> be non-zero */
>>> spr_val &= ~TCR_WRC_MASK;
>>> spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
>>> kvmppc_set_tcr(vcpu, spr_val);
>> 
>> I think you mean:
>> 
>> if (TCR_WRC_MASK & vcpu->arch.tcr) {
>>    spr_val &= ~TCR_WRC_MASK;
>>    spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
>> }
>> kvmppc_set_tcr(vcpu, spr_val);
> 
> Actually I think he means:
> 
> if (vcpu->arch.tcr & TCR_WRC_MASK) {
> 	spr_val &= ~TCR_WRC_MASK;
> 	spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> }
> 
> kvmppc_set_tcr(vcpu, spr_val);

Plus the comment, yes :). I really don't like (mask & val) constructs. About as much as I dislike if (0 = x) ones.


Alex


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-07-09 14:36             ` Alexander Graf
@ 2012-07-09 14:44               ` Bhushan Bharat-R65777
  -1 siblings, 0 replies; 29+ messages in thread
From: Bhushan Bharat-R65777 @ 2012-07-09 14:44 UTC (permalink / raw)
  To: Alexander Graf, Wood Scott-B07421; +Cc: Wood Scott-B07421, kvm-ppc, kvm



> -----Original Message-----
> From: kvm-ppc-owner@vger.kernel.org [mailto:kvm-ppc-owner@vger.kernel.org] On
> Behalf Of Alexander Graf
> Sent: Monday, July 09, 2012 8:07 PM
> To: Wood Scott-B07421
> Cc: Bhushan Bharat-R65777; Wood Scott-B07421; kvm-ppc@vger.kernel.org;
> kvm@vger.kernel.org
> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
> 
> 
> On 09.07.2012, at 16:28, Scott Wood wrote:
> 
> > On 07/09/2012 12:13 AM, Bhushan Bharat-R65777 wrote:
> >>
> >>
> >>> -----Original Message-----
> >>> From: Alexander Graf [mailto:agraf@suse.de]
> >>> Sent: Saturday, July 07, 2012 1:21 PM
> >>> To: Wood Scott-B07421
> >>> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org;
> >>> kvm@vger.kernel.org; Bhushan
> >>> Bharat-R65777
> >>> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
> >>>
> >>>
> >>> Ah, being 2 bits wide, the above code suddenly makes more sense :).
> >>> How about
> >>>
> >>> /* WRC is a 2-bit field that is supposed to preserve its value once
> >>> written to be non-zero */ spr_val &= ~TCR_WRC_MASK; spr_val |=
> >>> vcpu->arch.tcr & TCR_WRC_MASK; kvmppc_set_tcr(vcpu, spr_val);
> >>
> >> I think you mean:
> >>
> >> if (TCR_WRC_MASK & vcpu->arch.tcr) {
> >>    spr_val &= ~TCR_WRC_MASK;
> >>    spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; } kvmppc_set_tcr(vcpu,
> >> spr_val);
> >
> > Actually I think he means:
> >
> > if (vcpu->arch.tcr & TCR_WRC_MASK) {
> > 	spr_val &= ~TCR_WRC_MASK;
> > 	spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; }
> >
> > kvmppc_set_tcr(vcpu, spr_val);
> 
> Plus the comment, yes :). I really don't like (mask & val) constructs. About as
> much as I dislike if (0 == x) ones.

I think I did in v2 :)

> 
> 
> Alex
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body
> of a message to majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-09 14:44               ` Bhushan Bharat-R65777
  0 siblings, 0 replies; 29+ messages in thread
From: Bhushan Bharat-R65777 @ 2012-07-09 14:44 UTC (permalink / raw)
  To: Alexander Graf, Wood Scott-B07421; +Cc: Wood Scott-B07421, kvm-ppc, kvm



> -----Original Message-----
> From: kvm-ppc-owner@vger.kernel.org [mailto:kvm-ppc-owner@vger.kernel.org] On
> Behalf Of Alexander Graf
> Sent: Monday, July 09, 2012 8:07 PM
> To: Wood Scott-B07421
> Cc: Bhushan Bharat-R65777; Wood Scott-B07421; kvm-ppc@vger.kernel.org;
> kvm@vger.kernel.org
> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
> 
> 
> On 09.07.2012, at 16:28, Scott Wood wrote:
> 
> > On 07/09/2012 12:13 AM, Bhushan Bharat-R65777 wrote:
> >>
> >>
> >>> -----Original Message-----
> >>> From: Alexander Graf [mailto:agraf@suse.de]
> >>> Sent: Saturday, July 07, 2012 1:21 PM
> >>> To: Wood Scott-B07421
> >>> Cc: Bhushan Bharat-R65777; kvm-ppc@vger.kernel.org;
> >>> kvm@vger.kernel.org; Bhushan
> >>> Bharat-R65777
> >>> Subject: Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
> >>>
> >>>
> >>> Ah, being 2 bits wide, the above code suddenly makes more sense :).
> >>> How about
> >>>
> >>> /* WRC is a 2-bit field that is supposed to preserve its value once
> >>> written to be non-zero */ spr_val &= ~TCR_WRC_MASK; spr_val |> >>> vcpu->arch.tcr & TCR_WRC_MASK; kvmppc_set_tcr(vcpu, spr_val);
> >>
> >> I think you mean:
> >>
> >> if (TCR_WRC_MASK & vcpu->arch.tcr) {
> >>    spr_val &= ~TCR_WRC_MASK;
> >>    spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; } kvmppc_set_tcr(vcpu,
> >> spr_val);
> >
> > Actually I think he means:
> >
> > if (vcpu->arch.tcr & TCR_WRC_MASK) {
> > 	spr_val &= ~TCR_WRC_MASK;
> > 	spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; }
> >
> > kvmppc_set_tcr(vcpu, spr_val);
> 
> Plus the comment, yes :). I really don't like (mask & val) constructs. About as
> much as I dislike if (0 = x) ones.

I think I did in v2 :)

> 
> 
> Alex
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body
> of a message to majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-07-07  7:50       ` Alexander Graf
@ 2012-07-09 17:09         ` Scott Wood
  -1 siblings, 0 replies; 29+ messages in thread
From: Scott Wood @ 2012-07-09 17:09 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Bharat Bhushan, kvm-ppc, kvm, Bharat Bhushan

On 07/07/2012 02:50 AM, Alexander Graf wrote:
> 
> On 07.07.2012, at 01:37, Scott Wood wrote:
> 
>> On 07/06/2012 08:17 AM, Alexander Graf wrote:
>>> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>>>> +/*
>>>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>>>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>>>> + *
>>>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>>>> + * any realistic use.
>>>> + */
>>>> +#define MAX_TIMEOUT (LONG_MAX/2)
>>>
>>> Should this really be in kvm code?
>>
>> It looks like we can use NEXT_TIMER_MAX_DELTA for this.
>>
>>>> +	mask = 1ULL << (63 - period);
>>>> +	tb = get_tb();
>>>> +	if (tb & mask)
>>>> +		nr_jiffies += mask;
>>>
>>> To be honest, you lost me here. nr_jiffies is jiffies, right? While
>>> mask is basically in timebase granularity. I suppose you're just
>>> reusing the variable here for no good reason - the compiler will
>>> gladly optimize things for you if you write things a bit more verbose
>>> :).
>>
>> Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
>> something generic like "ticks", "interval", "remaining", etc. would be
>> better, with a comment on the do_div saying it's converting timebase
>> ticks into jiffies.
> 
> Well, you could start off with a variable "delta_tb", then do
> 
>   nr_jiffies = delta_tb;
>   x = do_div(...);
> 
> and things would suddenly become readable :). Of course I don't object to comments along the code either :).
> 
>>
>>>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	unsigned long nr_jiffies;
>>>> +
>>>> +	nr_jiffies = watchdog_next_timeout(vcpu);
>>>> +	if (nr_jiffies < MAX_TIMEOUT)
>>>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>>>> +	else
>>>> +		del_timer(&vcpu->arch.wdt_timer);
>>>
>>> Can you del a timer that's not armed? Could that ever happen in this case?
>>
>> "del_timer() deactivates a timer - this works on both active and
>> inactive timers."
> 
> Ah, good :).
> 
>>
>>> Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?
>>
>> This can be called in the context of the timer, so del_timer_sync()
>> would hang.
>>
>> As for what would happen if a caller from a different context were to
>> race with a timer, I think you could end up with the timer armed based
>> on an old TCR.  del_timer_sync() won't help though, unless you replace
>> mod_timer() with del_timer_sync() plus add_timer() (with a check to see
>> whether it's running in timer context).  A better solution is probably
>> to use a spinlock in arm_next_watchdog().
> 
> Yup. Either way, we have a race that the guest might not expect.
> 
>>
>>>> +void kvmppc_watchdog_func(unsigned long data)
>>>> +{
>>>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>>> +	u32 tsr, new_tsr;
>>>> +	int final;
>>>> +
>>>> +	do {
>>>> +		new_tsr = tsr = vcpu->arch.tsr;
>>>> +		final = 0;
>>>> +
>>>> +		/* Time out event */
>>>> +		if (tsr & TSR_ENW) {
>>>> +			if (tsr & TSR_WIS) {
>>>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
>>>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
>>>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>>> +				final = 1;
>>>> +			} else {
>>>> +				new_tsr = tsr | TSR_WIS;
>>>> +			}
>>>> +		} else {
>>>> +			new_tsr = tsr | TSR_ENW;
>>>> +		}
>>>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>>>> +
>>>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>>>> +		smp_wmb();
>>>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>>>> +		kvm_vcpu_kick(vcpu);
>>>> +	}
>>>> +
>>>> +	/*
>>>> +	 * Avoid getting a storm of timers if the guest sets
>>>> +	 * the period very short.  We'll restart it if anything
>>>> +	 * changes.
>>>> +	 */
>>>> +	if (!final)
>>>> +		arm_next_watchdog(vcpu);
>>>
>>> Mind to explain this part a bit further?
>>
>> The whole function, or some subset near the end?
>>
>> The "if (!final)" check means that we stop running the timer after final
>> expiration, to prevent the host from being flooded with timers if the
>> guest sets a short period but does not have TCR set to exit to QEMU.
>> Timers will resume the next time TSR/TCR is updated.
> 
> Ah. The semantics make sense. The comment however is slightly too short. Please explain this in a more verbose way, so someone who didn't write the code knows what's going on :).
> 
>>
>>>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>>> 	}
>>>>
>>>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>>>> +		u32 old_tsr = vcpu->arch.tsr;
>>>> +
>>>> 		vcpu->arch.tsr = sregs->u.e.tsr;
>>>> +
>>>> +		if ((old_tsr ^ vcpu->arch.tsr) &
>>>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>>>> +			arm_next_watchdog(vcpu);
>>>
>>> Why isn't this one guarded by vcpu->arch.watchdog_enable?
>>
>> I'm not sure that any of them should be -- there's no reason for the
>> watchdog interrupt mechanism to be dependent on QEMU, only the
>> heavyweight exit on final expiration.
> 
> Well - I like the concept of having new features switchable.

The code quickly becomes an unmantainable mess if every new feature is
switchable (and where is the line between new feature and bugfix when it
comes to filling in missing emulation?).  The reason for switchability
in this case is limited to QEMU interface compatibility.

The internal-to-KVM part of the watchdog isn't much different from the
FIT -- we're not going to conditionalize that, are we?

> Overlapping the "watchdog is implemented" feature with "user space
> wants watchdog exits" makes sense. But I definitely want to have a
> switch for the former, because we otherwise differ quite
> substantially from the emulation we had before.

It differs in that it is now a bit closer to being correct.

-Scott


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-09 17:09         ` Scott Wood
  0 siblings, 0 replies; 29+ messages in thread
From: Scott Wood @ 2012-07-09 17:09 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Bharat Bhushan, kvm-ppc, kvm, Bharat Bhushan

On 07/07/2012 02:50 AM, Alexander Graf wrote:
> 
> On 07.07.2012, at 01:37, Scott Wood wrote:
> 
>> On 07/06/2012 08:17 AM, Alexander Graf wrote:
>>> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>>>> +/*
>>>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>>>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>>>> + *
>>>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>>>> + * any realistic use.
>>>> + */
>>>> +#define MAX_TIMEOUT (LONG_MAX/2)
>>>
>>> Should this really be in kvm code?
>>
>> It looks like we can use NEXT_TIMER_MAX_DELTA for this.
>>
>>>> +	mask = 1ULL << (63 - period);
>>>> +	tb = get_tb();
>>>> +	if (tb & mask)
>>>> +		nr_jiffies += mask;
>>>
>>> To be honest, you lost me here. nr_jiffies is jiffies, right? While
>>> mask is basically in timebase granularity. I suppose you're just
>>> reusing the variable here for no good reason - the compiler will
>>> gladly optimize things for you if you write things a bit more verbose
>>> :).
>>
>> Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
>> something generic like "ticks", "interval", "remaining", etc. would be
>> better, with a comment on the do_div saying it's converting timebase
>> ticks into jiffies.
> 
> Well, you could start off with a variable "delta_tb", then do
> 
>   nr_jiffies = delta_tb;
>   x = do_div(...);
> 
> and things would suddenly become readable :). Of course I don't object to comments along the code either :).
> 
>>
>>>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	unsigned long nr_jiffies;
>>>> +
>>>> +	nr_jiffies = watchdog_next_timeout(vcpu);
>>>> +	if (nr_jiffies < MAX_TIMEOUT)
>>>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>>>> +	else
>>>> +		del_timer(&vcpu->arch.wdt_timer);
>>>
>>> Can you del a timer that's not armed? Could that ever happen in this case?
>>
>> "del_timer() deactivates a timer - this works on both active and
>> inactive timers."
> 
> Ah, good :).
> 
>>
>>> Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?
>>
>> This can be called in the context of the timer, so del_timer_sync()
>> would hang.
>>
>> As for what would happen if a caller from a different context were to
>> race with a timer, I think you could end up with the timer armed based
>> on an old TCR.  del_timer_sync() won't help though, unless you replace
>> mod_timer() with del_timer_sync() plus add_timer() (with a check to see
>> whether it's running in timer context).  A better solution is probably
>> to use a spinlock in arm_next_watchdog().
> 
> Yup. Either way, we have a race that the guest might not expect.
> 
>>
>>>> +void kvmppc_watchdog_func(unsigned long data)
>>>> +{
>>>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>>> +	u32 tsr, new_tsr;
>>>> +	int final;
>>>> +
>>>> +	do {
>>>> +		new_tsr = tsr = vcpu->arch.tsr;
>>>> +		final = 0;
>>>> +
>>>> +		/* Time out event */
>>>> +		if (tsr & TSR_ENW) {
>>>> +			if (tsr & TSR_WIS) {
>>>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
>>>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
>>>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>>> +				final = 1;
>>>> +			} else {
>>>> +				new_tsr = tsr | TSR_WIS;
>>>> +			}
>>>> +		} else {
>>>> +			new_tsr = tsr | TSR_ENW;
>>>> +		}
>>>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>>>> +
>>>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>>>> +		smp_wmb();
>>>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>>>> +		kvm_vcpu_kick(vcpu);
>>>> +	}
>>>> +
>>>> +	/*
>>>> +	 * Avoid getting a storm of timers if the guest sets
>>>> +	 * the period very short.  We'll restart it if anything
>>>> +	 * changes.
>>>> +	 */
>>>> +	if (!final)
>>>> +		arm_next_watchdog(vcpu);
>>>
>>> Mind to explain this part a bit further?
>>
>> The whole function, or some subset near the end?
>>
>> The "if (!final)" check means that we stop running the timer after final
>> expiration, to prevent the host from being flooded with timers if the
>> guest sets a short period but does not have TCR set to exit to QEMU.
>> Timers will resume the next time TSR/TCR is updated.
> 
> Ah. The semantics make sense. The comment however is slightly too short. Please explain this in a more verbose way, so someone who didn't write the code knows what's going on :).
> 
>>
>>>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>>> 	}
>>>>
>>>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>>>> +		u32 old_tsr = vcpu->arch.tsr;
>>>> +
>>>> 		vcpu->arch.tsr = sregs->u.e.tsr;
>>>> +
>>>> +		if ((old_tsr ^ vcpu->arch.tsr) &
>>>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>>>> +			arm_next_watchdog(vcpu);
>>>
>>> Why isn't this one guarded by vcpu->arch.watchdog_enable?
>>
>> I'm not sure that any of them should be -- there's no reason for the
>> watchdog interrupt mechanism to be dependent on QEMU, only the
>> heavyweight exit on final expiration.
> 
> Well - I like the concept of having new features switchable.

The code quickly becomes an unmantainable mess if every new feature is
switchable (and where is the line between new feature and bugfix when it
comes to filling in missing emulation?).  The reason for switchability
in this case is limited to QEMU interface compatibility.

The internal-to-KVM part of the watchdog isn't much different from the
FIT -- we're not going to conditionalize that, are we?

> Overlapping the "watchdog is implemented" feature with "user space
> wants watchdog exits" makes sense. But I definitely want to have a
> switch for the former, because we otherwise differ quite
> substantially from the emulation we had before.

It differs in that it is now a bit closer to being correct.

-Scott


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-07-09 17:09         ` Scott Wood
@ 2012-07-09 17:29           ` Alexander Graf
  -1 siblings, 0 replies; 29+ messages in thread
From: Alexander Graf @ 2012-07-09 17:29 UTC (permalink / raw)
  To: Scott Wood; +Cc: Bharat Bhushan, kvm-ppc, kvm, Bharat Bhushan


On 09.07.2012, at 19:09, Scott Wood wrote:

> On 07/07/2012 02:50 AM, Alexander Graf wrote:
>> 
>> On 07.07.2012, at 01:37, Scott Wood wrote:
>> 
>>> On 07/06/2012 08:17 AM, Alexander Graf wrote:
>>>> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>>>>> +/*
>>>>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>>>>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>>>>> + *
>>>>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>>>>> + * any realistic use.
>>>>> + */
>>>>> +#define MAX_TIMEOUT (LONG_MAX/2)
>>>> 
>>>> Should this really be in kvm code?
>>> 
>>> It looks like we can use NEXT_TIMER_MAX_DELTA for this.
>>> 
>>>>> +	mask = 1ULL << (63 - period);
>>>>> +	tb = get_tb();
>>>>> +	if (tb & mask)
>>>>> +		nr_jiffies += mask;
>>>> 
>>>> To be honest, you lost me here. nr_jiffies is jiffies, right? While
>>>> mask is basically in timebase granularity. I suppose you're just
>>>> reusing the variable here for no good reason - the compiler will
>>>> gladly optimize things for you if you write things a bit more verbose
>>>> :).
>>> 
>>> Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
>>> something generic like "ticks", "interval", "remaining", etc. would be
>>> better, with a comment on the do_div saying it's converting timebase
>>> ticks into jiffies.
>> 
>> Well, you could start off with a variable "delta_tb", then do
>> 
>>  nr_jiffies = delta_tb;
>>  x = do_div(...);
>> 
>> and things would suddenly become readable :). Of course I don't object to comments along the code either :).
>> 
>>> 
>>>>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +	unsigned long nr_jiffies;
>>>>> +
>>>>> +	nr_jiffies = watchdog_next_timeout(vcpu);
>>>>> +	if (nr_jiffies < MAX_TIMEOUT)
>>>>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>>>>> +	else
>>>>> +		del_timer(&vcpu->arch.wdt_timer);
>>>> 
>>>> Can you del a timer that's not armed? Could that ever happen in this case?
>>> 
>>> "del_timer() deactivates a timer - this works on both active and
>>> inactive timers."
>> 
>> Ah, good :).
>> 
>>> 
>>>> Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?
>>> 
>>> This can be called in the context of the timer, so del_timer_sync()
>>> would hang.
>>> 
>>> As for what would happen if a caller from a different context were to
>>> race with a timer, I think you could end up with the timer armed based
>>> on an old TCR.  del_timer_sync() won't help though, unless you replace
>>> mod_timer() with del_timer_sync() plus add_timer() (with a check to see
>>> whether it's running in timer context).  A better solution is probably
>>> to use a spinlock in arm_next_watchdog().
>> 
>> Yup. Either way, we have a race that the guest might not expect.
>> 
>>> 
>>>>> +void kvmppc_watchdog_func(unsigned long data)
>>>>> +{
>>>>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>>>> +	u32 tsr, new_tsr;
>>>>> +	int final;
>>>>> +
>>>>> +	do {
>>>>> +		new_tsr = tsr = vcpu->arch.tsr;
>>>>> +		final = 0;
>>>>> +
>>>>> +		/* Time out event */
>>>>> +		if (tsr & TSR_ENW) {
>>>>> +			if (tsr & TSR_WIS) {
>>>>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
>>>>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
>>>>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>>>> +				final = 1;
>>>>> +			} else {
>>>>> +				new_tsr = tsr | TSR_WIS;
>>>>> +			}
>>>>> +		} else {
>>>>> +			new_tsr = tsr | TSR_ENW;
>>>>> +		}
>>>>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>>>>> +
>>>>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>>>>> +		smp_wmb();
>>>>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>>>>> +		kvm_vcpu_kick(vcpu);
>>>>> +	}
>>>>> +
>>>>> +	/*
>>>>> +	 * Avoid getting a storm of timers if the guest sets
>>>>> +	 * the period very short.  We'll restart it if anything
>>>>> +	 * changes.
>>>>> +	 */
>>>>> +	if (!final)
>>>>> +		arm_next_watchdog(vcpu);
>>>> 
>>>> Mind to explain this part a bit further?
>>> 
>>> The whole function, or some subset near the end?
>>> 
>>> The "if (!final)" check means that we stop running the timer after final
>>> expiration, to prevent the host from being flooded with timers if the
>>> guest sets a short period but does not have TCR set to exit to QEMU.
>>> Timers will resume the next time TSR/TCR is updated.
>> 
>> Ah. The semantics make sense. The comment however is slightly too short. Please explain this in a more verbose way, so someone who didn't write the code knows what's going on :).
>> 
>>> 
>>>>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>>>> 	}
>>>>> 
>>>>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>>>>> +		u32 old_tsr = vcpu->arch.tsr;
>>>>> +
>>>>> 		vcpu->arch.tsr = sregs->u.e.tsr;
>>>>> +
>>>>> +		if ((old_tsr ^ vcpu->arch.tsr) &
>>>>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>>>>> +			arm_next_watchdog(vcpu);
>>>> 
>>>> Why isn't this one guarded by vcpu->arch.watchdog_enable?
>>> 
>>> I'm not sure that any of them should be -- there's no reason for the
>>> watchdog interrupt mechanism to be dependent on QEMU, only the
>>> heavyweight exit on final expiration.
>> 
>> Well - I like the concept of having new features switchable.
> 
> The code quickly becomes an unmantainable mess if every new feature is
> switchable (and where is the line between new feature and bugfix when it
> comes to filling in missing emulation?).  The reason for switchability
> in this case is limited to QEMU interface compatibility.
> 
> The internal-to-KVM part of the watchdog isn't much different from the
> FIT -- we're not going to conditionalize that, are we?
> 
>> Overlapping the "watchdog is implemented" feature with "user space
>> wants watchdog exits" makes sense. But I definitely want to have a
>> switch for the former, because we otherwise differ quite
>> substantially from the emulation we had before.
> 
> It differs in that it is now a bit closer to being correct.

If we can maintain user space backwards compatibility, I'm fine with not switching it, since we're effectively fixing a bug wrt the implementation of the announced features, yes.


Alex


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-09 17:29           ` Alexander Graf
  0 siblings, 0 replies; 29+ messages in thread
From: Alexander Graf @ 2012-07-09 17:29 UTC (permalink / raw)
  To: Scott Wood; +Cc: Bharat Bhushan, kvm-ppc, kvm, Bharat Bhushan


On 09.07.2012, at 19:09, Scott Wood wrote:

> On 07/07/2012 02:50 AM, Alexander Graf wrote:
>> 
>> On 07.07.2012, at 01:37, Scott Wood wrote:
>> 
>>> On 07/06/2012 08:17 AM, Alexander Graf wrote:
>>>> On 28.06.2012, at 08:17, Bharat Bhushan wrote:
>>>>> +/*
>>>>> + * The timer system can almost deal with LONG_MAX timeouts, except that
>>>>> + * when you get very close to LONG_MAX, the slack added can cause overflow.
>>>>> + *
>>>>> + * LONG_MAX/2 is a conservative threshold, but it should be adequate for
>>>>> + * any realistic use.
>>>>> + */
>>>>> +#define MAX_TIMEOUT (LONG_MAX/2)
>>>> 
>>>> Should this really be in kvm code?
>>> 
>>> It looks like we can use NEXT_TIMER_MAX_DELTA for this.
>>> 
>>>>> +	mask = 1ULL << (63 - period);
>>>>> +	tb = get_tb();
>>>>> +	if (tb & mask)
>>>>> +		nr_jiffies += mask;
>>>> 
>>>> To be honest, you lost me here. nr_jiffies is jiffies, right? While
>>>> mask is basically in timebase granularity. I suppose you're just
>>>> reusing the variable here for no good reason - the compiler will
>>>> gladly optimize things for you if you write things a bit more verbose
>>>> :).
>>> 
>>> Probably due to the way do_div() works, but yeah, it's confusing.  Maybe
>>> something generic like "ticks", "interval", "remaining", etc. would be
>>> better, with a comment on the do_div saying it's converting timebase
>>> ticks into jiffies.
>> 
>> Well, you could start off with a variable "delta_tb", then do
>> 
>>  nr_jiffies = delta_tb;
>>  x = do_div(...);
>> 
>> and things would suddenly become readable :). Of course I don't object to comments along the code either :).
>> 
>>> 
>>>>> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +	unsigned long nr_jiffies;
>>>>> +
>>>>> +	nr_jiffies = watchdog_next_timeout(vcpu);
>>>>> +	if (nr_jiffies < MAX_TIMEOUT)
>>>>> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
>>>>> +	else
>>>>> +		del_timer(&vcpu->arch.wdt_timer);
>>>> 
>>>> Can you del a timer that's not armed? Could that ever happen in this case?
>>> 
>>> "del_timer() deactivates a timer - this works on both active and
>>> inactive timers."
>> 
>> Ah, good :).
>> 
>>> 
>>>> Also, could the timer possibly be running somewhere, so do we need del_timer_sync? Or don't we need to care?
>>> 
>>> This can be called in the context of the timer, so del_timer_sync()
>>> would hang.
>>> 
>>> As for what would happen if a caller from a different context were to
>>> race with a timer, I think you could end up with the timer armed based
>>> on an old TCR.  del_timer_sync() won't help though, unless you replace
>>> mod_timer() with del_timer_sync() plus add_timer() (with a check to see
>>> whether it's running in timer context).  A better solution is probably
>>> to use a spinlock in arm_next_watchdog().
>> 
>> Yup. Either way, we have a race that the guest might not expect.
>> 
>>> 
>>>>> +void kvmppc_watchdog_func(unsigned long data)
>>>>> +{
>>>>> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
>>>>> +	u32 tsr, new_tsr;
>>>>> +	int final;
>>>>> +
>>>>> +	do {
>>>>> +		new_tsr = tsr = vcpu->arch.tsr;
>>>>> +		final = 0;
>>>>> +
>>>>> +		/* Time out event */
>>>>> +		if (tsr & TSR_ENW) {
>>>>> +			if (tsr & TSR_WIS) {
>>>>> +				new_tsr = (tsr & ~TCR_WRC_MASK) |
>>>>> +					  (vcpu->arch.tcr & TCR_WRC_MASK);
>>>>> +				vcpu->arch.tcr &= ~TCR_WRC_MASK;
>>>>> +				final = 1;
>>>>> +			} else {
>>>>> +				new_tsr = tsr | TSR_WIS;
>>>>> +			}
>>>>> +		} else {
>>>>> +			new_tsr = tsr | TSR_ENW;
>>>>> +		}
>>>>> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
>>>>> +
>>>>> +	if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
>>>>> +		smp_wmb();
>>>>> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
>>>>> +		kvm_vcpu_kick(vcpu);
>>>>> +	}
>>>>> +
>>>>> +	/*
>>>>> +	 * Avoid getting a storm of timers if the guest sets
>>>>> +	 * the period very short.  We'll restart it if anything
>>>>> +	 * changes.
>>>>> +	 */
>>>>> +	if (!final)
>>>>> +		arm_next_watchdog(vcpu);
>>>> 
>>>> Mind to explain this part a bit further?
>>> 
>>> The whole function, or some subset near the end?
>>> 
>>> The "if (!final)" check means that we stop running the timer after final
>>> expiration, to prevent the host from being flooded with timers if the
>>> guest sets a short period but does not have TCR set to exit to QEMU.
>>> Timers will resume the next time TSR/TCR is updated.
>> 
>> Ah. The semantics make sense. The comment however is slightly too short. Please explain this in a more verbose way, so someone who didn't write the code knows what's going on :).
>> 
>>> 
>>>>> @@ -1106,7 +1213,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>>>>> 	}
>>>>> 
>>>>> 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
>>>>> +		u32 old_tsr = vcpu->arch.tsr;
>>>>> +
>>>>> 		vcpu->arch.tsr = sregs->u.e.tsr;
>>>>> +
>>>>> +		if ((old_tsr ^ vcpu->arch.tsr) &
>>>>> +		    (TSR_ENW | TSR_WIS | TCR_WRC_MASK))
>>>>> +			arm_next_watchdog(vcpu);
>>>> 
>>>> Why isn't this one guarded by vcpu->arch.watchdog_enable?
>>> 
>>> I'm not sure that any of them should be -- there's no reason for the
>>> watchdog interrupt mechanism to be dependent on QEMU, only the
>>> heavyweight exit on final expiration.
>> 
>> Well - I like the concept of having new features switchable.
> 
> The code quickly becomes an unmantainable mess if every new feature is
> switchable (and where is the line between new feature and bugfix when it
> comes to filling in missing emulation?).  The reason for switchability
> in this case is limited to QEMU interface compatibility.
> 
> The internal-to-KVM part of the watchdog isn't much different from the
> FIT -- we're not going to conditionalize that, are we?
> 
>> Overlapping the "watchdog is implemented" feature with "user space
>> wants watchdog exits" makes sense. But I definitely want to have a
>> switch for the former, because we otherwise differ quite
>> substantially from the emulation we had before.
> 
> It differs in that it is now a bit closer to being correct.

If we can maintain user space backwards compatibility, I'm fine with not switching it, since we're effectively fixing a bug wrt the implementation of the announced features, yes.


Alex


^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-07-20  5:11 ` Bharat Bhushan
  (?)
@ 2012-07-20  4:59 ` Bhushan Bharat-R65777
  -1 siblings, 0 replies; 29+ messages in thread
From: Bhushan Bharat-R65777 @ 2012-07-20  4:59 UTC (permalink / raw)
  To: Bhushan Bharat-R65777, kvm-ppc, kvm, agraf

Please ignore this as I forget to get [v5] is subject.. 

Thanks
-Bharat

> -----Original Message-----
> From: Bhushan Bharat-R65777
> Sent: Friday, July 20, 2012 10:29 AM
> To: kvm-ppc@vger.kernel.org; kvm@vger.kernel.org; agraf@suse.de
> Cc: Bhushan Bharat-R65777
> Subject: [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
> 
> This patch adds the watchdog emulation in KVM. The watchdog emulation is enabled
> by KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
> The kernel timer are used for watchdog emulation and emulates h/w watchdog state
> machine. On watchdog timer expiry, it exit to QEMU if TCR.WRC is non ZERO. QEMU
> can reset/shutdown etc depending upon how it is configured.
> 
> Signed-off-by: Liu Yu <yu.liu@freescale.com>
> Signed-off-by: Scott Wood <scottwood@freescale.com>
> Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
> [bharat.bhushan@freescale.com: reworked patch]
> 
> ---
> v5:
>  - Checking that TSR_ENW/TSR_WIS are still set on KVM_EXIT_WATCHDOG.
>  - Moved watchdog_next_timeout() in lock.
> 
> v4:
>  - in v3 i forgot to add Scott Wood and Liu Yu signoff
> 
> v3:
>  - Using KVM_REQ_WATCHDOG for userspace exit.
>  - TSR changes left for vcpu thread.
>  - Other review comments on v2
> 
>  arch/powerpc/include/asm/kvm_host.h  |    3 +
>  arch/powerpc/include/asm/kvm_ppc.h   |    3 +
>  arch/powerpc/include/asm/reg_booke.h |    7 ++
>  arch/powerpc/kvm/booke.c             |  140 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/kvm/booke_emulate.c     |    8 ++
>  arch/powerpc/kvm/powerpc.c           |   19 +++++-
>  include/linux/kvm.h                  |    2 +
>  include/linux/kvm_host.h             |    1 +
>  8 files changed, 182 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h
> b/arch/powerpc/include/asm/kvm_host.h
> index 50ea12f..43cac48 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -467,6 +467,8 @@ struct kvm_vcpu_arch {
>  	ulong fault_esr;
>  	ulong queued_dear;
>  	ulong queued_esr;
> +	spinlock_t wdt_lock;
> +	struct timer_list wdt_timer;
>  	u32 tlbcfg[4];
>  	u32 mmucfg;
>  	u32 epr;
> @@ -482,6 +484,7 @@ struct kvm_vcpu_arch {
>  	u8 osi_needed;
>  	u8 osi_enabled;
>  	u8 papr_enabled;
> +	u8 watchdog_enabled;
>  	u8 sane;
>  	u8 cpu_type;
>  	u8 hcall_needed;
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h
> b/arch/powerpc/include/asm/kvm_ppc.h
> index 0124937..e5cf4b9 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct
> kvm_vcpu *vcpu);  extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);  extern
> u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);  extern void
> kvmppc_decrementer_func(unsigned long data);
> +extern void kvmppc_watchdog_func(unsigned long data);
>  extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
> 
>  /* Core-specific hooks */
> @@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu
> *vcpu,
>                                         struct kvm_interrupt *irq);  extern void
> kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
>                                           struct kvm_interrupt *irq);
> +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu); extern
> +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
> 
>  extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
>                                    unsigned int op, int *advance); diff --git
> a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
> index 2d916c4..e07e6af 100644
> --- a/arch/powerpc/include/asm/reg_booke.h
> +++ b/arch/powerpc/include/asm/reg_booke.h
> @@ -539,6 +539,13 @@
>  #define TCR_FIE		0x00800000	/* FIT Interrupt Enable */
>  #define TCR_ARE		0x00400000	/* Auto Reload Enable */
> 
> +#ifdef CONFIG_E500
> +#define TCR_GET_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
> +			      (((tcr) & 0x1E0000) >> 15))
> +#else
> +#define TCR_GET_WP(tcr)  (((tcr) & 0xC0000000) >> 30) #endif
> +
>  /* Bit definitions for the TSR. */
>  #define TSR_ENW		0x80000000	/* Enable Next Watchdog */
>  #define TSR_WIS		0x40000000	/* WDT Interrupt Status */
> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index
> d25a097..71b4ce9 100644
> --- a/arch/powerpc/kvm/booke.c
> +++ b/arch/powerpc/kvm/booke.c
> @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
>  	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
> }
> 
> +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) {
> +	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); }
> +
> +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) {
> +	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); }
> +
>  static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
> {  #ifdef CONFIG_KVM_BOOKE_HV @@ -325,6 +335,7 @@ static int
> kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
>  		msr_mask = MSR_CE | MSR_ME | MSR_DE;
>  		int_class = INT_CLASS_NONCRIT;
>  		break;
> +	case BOOKE_IRQPRIO_WATCHDOG:
>  	case BOOKE_IRQPRIO_CRITICAL:
>  	case BOOKE_IRQPRIO_DBELL_CRIT:
>  		allowed = vcpu->arch.shared->msr & MSR_CE; @@ -404,12 +415,113 @@
> static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
>  	return allowed;
>  }
> 
> +/*
> + * Return the number of jiffies until the next timeout.  If the timeout
> +is
> + * longer than the NEXT_TIMER_MAX_DELTA, then return
> +NEXT_TIMER_MAX_DELTA
> + * because the larger value can break the timer APIs.
> + */
> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) {
> +	u64 tb, wdt_tb, wdt_ticks = 0;
> +	u64 nr_jiffies = 0;
> +	u32 period = TCR_GET_WP(vcpu->arch.tcr);
> +
> +	wdt_tb = 1ULL << (63 - period);
> +	tb = get_tb();
> +	/*
> +	 * The watchdog timeout will hapeen when TB bit corresponding
> +	 * to watchdog will toggle from 0 to 1.
> +	 */
> +	if (tb & wdt_tb)
> +		wdt_ticks = wdt_tb;
> +
> +	wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
> +
> +	/* Convert timebase ticks to jiffies */
> +	nr_jiffies = wdt_ticks;
> +
> +	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
> +		nr_jiffies++;
> +
> +	return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); }
> +
> +static void arm_next_watchdog(struct kvm_vcpu *vcpu) {
> +	unsigned long nr_jiffies;
> +
> +	spin_lock(&vcpu->arch.wdt_lock);
> +	nr_jiffies = watchdog_next_timeout(vcpu);
> +	/*
> +	 * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA
> +	 * then do not run the watchdog timer as this can break timer APIs.
> +	 */
> +	if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
> +		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
> +	else
> +		del_timer(&vcpu->arch.wdt_timer);
> +	spin_unlock(&vcpu->arch.wdt_lock);
> +}
> +
> +void kvmppc_watchdog_func(unsigned long data) {
> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> +	u32 tsr, new_tsr;
> +	int final;
> +
> +	do {
> +		new_tsr = tsr = vcpu->arch.tsr;
> +		final = 0;
> +
> +		/* Time out event */
> +		if (tsr & TSR_ENW) {
> +			if (tsr & TSR_WIS)
> +				final = 1;
> +			else
> +				new_tsr = tsr | TSR_WIS;
> +		} else {
> +			new_tsr = tsr | TSR_ENW;
> +		}
> +	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
> +
> +	if (new_tsr & TSR_WIS) {
> +		smp_wmb();
> +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
> +		kvm_vcpu_kick(vcpu);
> +	}
> +
> +	/*
> +	 * If this is final watchdog expiry and some action is required
> +	 * then exit to userspace.
> +	 */
> +	if (final && (vcpu->arch.tcr & TCR_WRC_MASK) &&
> +	    vcpu->arch.watchdog_enabled) {
> +		smp_wmb();
> +		kvm_make_request(KVM_REQ_WATCHDOG, vcpu);
> +		kvm_vcpu_kick(vcpu);
> +	}
> +
> +	/*
> +	 * Stop running the watchdog timer after final expiration to
> +	 * prevent the host from being flooded with timers if the
> +	 * guest sets a short period.
> +	 * Timers will resume when TSR/TCR is updated next time.
> +	 */
> +	if (!final)
> +		arm_next_watchdog(vcpu);
> +}
> +
>  static void update_timer_ints(struct kvm_vcpu *vcpu)  {
>  	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
>  		kvmppc_core_queue_dec(vcpu);
>  	else
>  		kvmppc_core_dequeue_dec(vcpu);
> +
> +	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
> +		kvmppc_core_queue_watchdog(vcpu);
> +	else
> +		kvmppc_core_dequeue_watchdog(vcpu);
>  }
> 
>  static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) @@ -516,6
> +628,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
>  		goto out;
>  	}
> 
> +	if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) &&
> +	    (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) {
> +		kvm_run->exit_reason = KVM_EXIT_WATCHDOG;
> +		ret = 0;
> +		goto out;
> +	}
> +
>  	kvm_guest_enter();
> 
>  #ifdef CONFIG_PPC_FPU
> @@ -978,6 +1097,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct
> kvm_vcpu *vcpu,
>  		}
>  	}
> 
> +	if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) &&
> +	    (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) {
> +		run->exit_reason = KVM_EXIT_WATCHDOG;
> +		r = RESUME_HOST | (r & RESUME_FLAG_NV);
> +	}
> +
>  	return r;
>  }
> 
> @@ -1106,7 +1231,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
>  	}
> 
>  	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
> +		u32 old_tsr = vcpu->arch.tsr;
> +
>  		vcpu->arch.tsr = sregs->u.e.tsr;
> +
> +		if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
> +			arm_next_watchdog(vcpu);
> +
>  		update_timer_ints(vcpu);
>  	}
> 
> @@ -1267,6 +1398,7 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
> void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)  {
>  	vcpu->arch.tcr = new_tcr;
> +	arm_next_watchdog(vcpu);
>  	update_timer_ints(vcpu);
>  }
> 
> @@ -1281,6 +1413,14 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32
> tsr_bits)  void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)  {
>  	clear_bits(tsr_bits, &vcpu->arch.tsr);
> +
> +	/*
> +	 * We may have stopped the watchdog due to
> +	 * being stuck on final expiration.
> +	 */
> +	if (tsr_bits & (TSR_ENW | TSR_WIS))
> +		arm_next_watchdog(vcpu);
> +
>  	update_timer_ints(vcpu);
>  }
> 
> diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
> index 12834bb..5a66ade 100644
> --- a/arch/powerpc/kvm/booke_emulate.c
> +++ b/arch/powerpc/kvm/booke_emulate.c
> @@ -145,6 +145,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int
> sprn, ulong spr_val)
>  		kvmppc_clr_tsr_bits(vcpu, spr_val);
>  		break;
>  	case SPRN_TCR:
> +		/*
> +		 * WRC is a 2-bit field that is supposed to preserve its
> +		 * value once written to non-zero.
> +		 */
> +		if (vcpu->arch.tcr & TCR_WRC_MASK) {
> +			spr_val &= ~TCR_WRC_MASK;
> +			spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> +		}
>  		kvmppc_set_tcr(vcpu, spr_val);
>  		break;
> 
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index
> 87f4dc8..63457cb 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -220,6 +220,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	switch (ext) {
>  #ifdef CONFIG_BOOKE
>  	case KVM_CAP_PPC_BOOKE_SREGS:
> +	case KVM_CAP_PPC_BOOKE_WATCHDOG:
>  #else
>  	case KVM_CAP_PPC_SEGSTATE:
>  	case KVM_CAP_PPC_HIOR:
> @@ -386,13 +387,23 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)  #ifdef
> CONFIG_KVM_EXIT_TIMING
>  	mutex_init(&vcpu->arch.exit_timing_lock);
>  #endif
> -
> +#ifdef CONFIG_BOOKE
> +	spin_lock_init(&vcpu->arch.wdt_lock);
> +	/* setup watchdog timer once */
> +	setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
> +		    (unsigned long)vcpu);
> +#endif
>  	return 0;
>  }
> 
>  void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)  {
>  	kvmppc_mmu_destroy(vcpu);
> +#ifdef CONFIG_BOOKE
> +	spin_lock(&vcpu->arch.wdt_lock);
> +	del_timer(&vcpu->arch.wdt_timer);
> +	spin_unlock(&vcpu->arch.wdt_lock);
> +#endif
>  }
> 
>  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -637,6 +648,12 @@
> static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>  		r = 0;
>  		vcpu->arch.papr_enabled = true;
>  		break;
> +#ifdef CONFIG_BOOKE
> +	case KVM_CAP_PPC_BOOKE_WATCHDOG:
> +		r = 0;
> +		vcpu->arch.watchdog_enabled = true;
> +		break;
> +#endif
>  #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
>  	case KVM_CAP_SW_TLB: {
>  		struct kvm_config_tlb cfg;
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2ce09aa..9ff5aa5
> 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -163,6 +163,7 @@ struct kvm_pit_config {
>  #define KVM_EXIT_OSI              18
>  #define KVM_EXIT_PAPR_HCALL	  19
>  #define KVM_EXIT_S390_UCONTROL	  20
> +#define KVM_EXIT_WATCHDOG         21
> 
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  #define KVM_INTERNAL_ERROR_EMULATION 1
> @@ -618,6 +619,7 @@ struct kvm_ppc_smmu_info {  #define
> KVM_CAP_PPC_GET_SMMU_INFO 78  #define KVM_CAP_S390_COW 79  #define
> KVM_CAP_PPC_ALLOC_HTAB 80
> +#define KVM_CAP_PPC_BOOKE_WATCHDOG 81
> 
>  #ifdef KVM_CAP_IRQ_ROUTING
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index
> 27ac8a4..41d32be 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -69,6 +69,7 @@
>  #define KVM_REQ_IMMEDIATE_EXIT    15
>  #define KVM_REQ_PMU               16
>  #define KVM_REQ_PMI               17
> +#define KVM_REQ_WATCHDOG          18
> 
>  #define KVM_USERSPACE_IRQ_SOURCE_ID	0
> 
> --
> 1.7.0.4

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
  2012-06-28  6:29 ` Bharat Bhushan
@ 2012-07-20  5:11 ` Bharat Bhushan
  -1 siblings, 0 replies; 29+ messages in thread
From: Bharat Bhushan @ 2012-07-20  4:59 UTC (permalink / raw)
  To: kvm-ppc, kvm, agraf; +Cc: Bharat Bhushan

This patch adds the watchdog emulation in KVM. The watchdog
emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
The kernel timer are used for watchdog emulation and emulates
h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU
if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how
it is configured.

Signed-off-by: Liu Yu <yu.liu@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
[bharat.bhushan@freescale.com: reworked patch]

---
v5:
 - Checking that TSR_ENW/TSR_WIS are still set on KVM_EXIT_WATCHDOG.
 - Moved watchdog_next_timeout() in lock.

v4:
 - in v3 i forgot to add Scott Wood and Liu Yu signoff

v3:
 - Using KVM_REQ_WATCHDOG for userspace exit.
 - TSR changes left for vcpu thread.
 - Other review comments on v2

 arch/powerpc/include/asm/kvm_host.h  |    3 +
 arch/powerpc/include/asm/kvm_ppc.h   |    3 +
 arch/powerpc/include/asm/reg_booke.h |    7 ++
 arch/powerpc/kvm/booke.c             |  140 ++++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/booke_emulate.c     |    8 ++
 arch/powerpc/kvm/powerpc.c           |   19 +++++-
 include/linux/kvm.h                  |    2 +
 include/linux/kvm_host.h             |    1 +
 8 files changed, 182 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 50ea12f..43cac48 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -467,6 +467,8 @@ struct kvm_vcpu_arch {
 	ulong fault_esr;
 	ulong queued_dear;
 	ulong queued_esr;
+	spinlock_t wdt_lock;
+	struct timer_list wdt_timer;
 	u32 tlbcfg[4];
 	u32 mmucfg;
 	u32 epr;
@@ -482,6 +484,7 @@ struct kvm_vcpu_arch {
 	u8 osi_needed;
 	u8 osi_enabled;
 	u8 papr_enabled;
+	u8 watchdog_enabled;
 	u8 sane;
 	u8 cpu_type;
 	u8 hcall_needed;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0124937..e5cf4b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(unsigned long data);
+extern void kvmppc_watchdog_func(unsigned long data);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 
 /* Core-specific hooks */
@@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                        struct kvm_interrupt *irq);
 extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
                                          struct kvm_interrupt *irq);
+extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   unsigned int op, int *advance);
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2d916c4..e07e6af 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -539,6 +539,13 @@
 #define TCR_FIE		0x00800000	/* FIT Interrupt Enable */
 #define TCR_ARE		0x00400000	/* Auto Reload Enable */
 
+#ifdef CONFIG_E500
+#define TCR_GET_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
+			      (((tcr) & 0x1E0000) >> 15))
+#else
+#define TCR_GET_WP(tcr)  (((tcr) & 0xC0000000) >> 30)
+#endif
+
 /* Bit definitions for the TSR. */
 #define TSR_ENW		0x80000000	/* Enable Next Watchdog */
 #define TSR_WIS		0x40000000	/* WDT Interrupt Status */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097..71b4ce9 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
 }
 
+void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
+}
+
+void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
+}
+
 static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
 {
 #ifdef CONFIG_KVM_BOOKE_HV
@@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		msr_mask = MSR_CE | MSR_ME | MSR_DE;
 		int_class = INT_CLASS_NONCRIT;
 		break;
+	case BOOKE_IRQPRIO_WATCHDOG:
 	case BOOKE_IRQPRIO_CRITICAL:
 	case BOOKE_IRQPRIO_DBELL_CRIT:
 		allowed = vcpu->arch.shared->msr & MSR_CE;
@@ -404,12 +415,113 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	return allowed;
 }
 
+/*
+ * Return the number of jiffies until the next timeout.  If the timeout is
+ * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA
+ * because the larger value can break the timer APIs.
+ */
+static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
+{
+	u64 tb, wdt_tb, wdt_ticks = 0;
+	u64 nr_jiffies = 0;
+	u32 period = TCR_GET_WP(vcpu->arch.tcr);
+
+	wdt_tb = 1ULL << (63 - period);
+	tb = get_tb();
+	/*
+	 * The watchdog timeout will hapeen when TB bit corresponding
+	 * to watchdog will toggle from 0 to 1.
+	 */
+	if (tb & wdt_tb)
+		wdt_ticks = wdt_tb;
+
+	wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
+
+	/* Convert timebase ticks to jiffies */
+	nr_jiffies = wdt_ticks;
+
+	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
+		nr_jiffies++;
+
+	return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA);
+}
+
+static void arm_next_watchdog(struct kvm_vcpu *vcpu)
+{
+	unsigned long nr_jiffies;
+
+	spin_lock(&vcpu->arch.wdt_lock);
+	nr_jiffies = watchdog_next_timeout(vcpu);
+	/*
+	 * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA
+	 * then do not run the watchdog timer as this can break timer APIs.
+	 */
+	if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
+		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
+	else
+		del_timer(&vcpu->arch.wdt_timer);
+	spin_unlock(&vcpu->arch.wdt_lock);
+}
+
+void kvmppc_watchdog_func(unsigned long data)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+	u32 tsr, new_tsr;
+	int final;
+
+	do {
+		new_tsr = tsr = vcpu->arch.tsr;
+		final = 0;
+
+		/* Time out event */
+		if (tsr & TSR_ENW) {
+			if (tsr & TSR_WIS)
+				final = 1;
+			else
+				new_tsr = tsr | TSR_WIS;
+		} else {
+			new_tsr = tsr | TSR_ENW;
+		}
+	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
+
+	if (new_tsr & TSR_WIS) {
+		smp_wmb();
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * If this is final watchdog expiry and some action is required
+	 * then exit to userspace.
+	 */
+	if (final && (vcpu->arch.tcr & TCR_WRC_MASK) &&
+	    vcpu->arch.watchdog_enabled) { 
+		smp_wmb();
+		kvm_make_request(KVM_REQ_WATCHDOG, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * Stop running the watchdog timer after final expiration to
+	 * prevent the host from being flooded with timers if the
+	 * guest sets a short period.
+	 * Timers will resume when TSR/TCR is updated next time.
+	 */
+	if (!final)
+		arm_next_watchdog(vcpu);
+}
+
 static void update_timer_ints(struct kvm_vcpu *vcpu)
 {
 	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
 		kvmppc_core_queue_dec(vcpu);
 	else
 		kvmppc_core_dequeue_dec(vcpu);
+
+	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
+		kvmppc_core_queue_watchdog(vcpu);
+	else
+		kvmppc_core_dequeue_watchdog(vcpu);
 }
 
 static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
@@ -516,6 +628,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
+	if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) &&
+	    (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) {
+		kvm_run->exit_reason = KVM_EXIT_WATCHDOG;
+		ret = 0;
+		goto out;
+	}
+
 	kvm_guest_enter();
 
 #ifdef CONFIG_PPC_FPU
@@ -978,6 +1097,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		}
 	}
 
+	if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) &&
+	    (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) {
+		run->exit_reason = KVM_EXIT_WATCHDOG;
+		r = RESUME_HOST | (r & RESUME_FLAG_NV);
+	}
+
 	return r;
 }
 
@@ -1106,7 +1231,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
 	}
 
 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
+		u32 old_tsr = vcpu->arch.tsr;
+
 		vcpu->arch.tsr = sregs->u.e.tsr;
+
+		if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
+			arm_next_watchdog(vcpu);
+
 		update_timer_ints(vcpu);
 	}
 
@@ -1267,6 +1398,7 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
 void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
 {
 	vcpu->arch.tcr = new_tcr;
+	arm_next_watchdog(vcpu);
 	update_timer_ints(vcpu);
 }
 
@@ -1281,6 +1413,14 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 {
 	clear_bits(tsr_bits, &vcpu->arch.tsr);
+
+	/*
+	 * We may have stopped the watchdog due to
+	 * being stuck on final expiration.
+	 */
+	if (tsr_bits & (TSR_ENW | TSR_WIS))
+		arm_next_watchdog(vcpu);
+
 	update_timer_ints(vcpu);
 }
 
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 12834bb..5a66ade 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -145,6 +145,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		kvmppc_clr_tsr_bits(vcpu, spr_val);
 		break;
 	case SPRN_TCR:
+		/*
+		 * WRC is a 2-bit field that is supposed to preserve its
+		 * value once written to non-zero.
+		 */
+		if (vcpu->arch.tcr & TCR_WRC_MASK) {
+			spr_val &= ~TCR_WRC_MASK;
+			spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
+		}
 		kvmppc_set_tcr(vcpu, spr_val);
 		break;
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 87f4dc8..63457cb 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -220,6 +220,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	switch (ext) {
 #ifdef CONFIG_BOOKE
 	case KVM_CAP_PPC_BOOKE_SREGS:
+	case KVM_CAP_PPC_BOOKE_WATCHDOG:
 #else
 	case KVM_CAP_PPC_SEGSTATE:
 	case KVM_CAP_PPC_HIOR:
@@ -386,13 +387,23 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_KVM_EXIT_TIMING
 	mutex_init(&vcpu->arch.exit_timing_lock);
 #endif
-
+#ifdef CONFIG_BOOKE
+	spin_lock_init(&vcpu->arch.wdt_lock);
+	/* setup watchdog timer once */
+	setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
+		    (unsigned long)vcpu);
+#endif
 	return 0;
 }
 
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	kvmppc_mmu_destroy(vcpu);
+#ifdef CONFIG_BOOKE
+	spin_lock(&vcpu->arch.wdt_lock);
+	del_timer(&vcpu->arch.wdt_timer);
+	spin_unlock(&vcpu->arch.wdt_lock);
+#endif
 }
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -637,6 +648,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		vcpu->arch.papr_enabled = true;
 		break;
+#ifdef CONFIG_BOOKE
+	case KVM_CAP_PPC_BOOKE_WATCHDOG:
+		r = 0;
+		vcpu->arch.watchdog_enabled = true;
+		break;
+#endif
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB: {
 		struct kvm_config_tlb cfg;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2ce09aa..9ff5aa5 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -163,6 +163,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_OSI              18
 #define KVM_EXIT_PAPR_HCALL	  19
 #define KVM_EXIT_S390_UCONTROL	  20
+#define KVM_EXIT_WATCHDOG         21
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -618,6 +619,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
 #define KVM_CAP_PPC_ALLOC_HTAB 80
+#define KVM_CAP_PPC_BOOKE_WATCHDOG 81
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 27ac8a4..41d32be 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -69,6 +69,7 @@
 #define KVM_REQ_IMMEDIATE_EXIT    15
 #define KVM_REQ_PMU               16
 #define KVM_REQ_PMI               17
+#define KVM_REQ_WATCHDOG          18
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation
@ 2012-07-20  5:11 ` Bharat Bhushan
  0 siblings, 0 replies; 29+ messages in thread
From: Bharat Bhushan @ 2012-07-20  5:11 UTC (permalink / raw)
  To: kvm-ppc, kvm, agraf; +Cc: Bharat Bhushan

This patch adds the watchdog emulation in KVM. The watchdog
emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
The kernel timer are used for watchdog emulation and emulates
h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU
if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how
it is configured.

Signed-off-by: Liu Yu <yu.liu@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
[bharat.bhushan@freescale.com: reworked patch]

---
v5:
 - Checking that TSR_ENW/TSR_WIS are still set on KVM_EXIT_WATCHDOG.
 - Moved watchdog_next_timeout() in lock.

v4:
 - in v3 i forgot to add Scott Wood and Liu Yu signoff

v3:
 - Using KVM_REQ_WATCHDOG for userspace exit.
 - TSR changes left for vcpu thread.
 - Other review comments on v2

 arch/powerpc/include/asm/kvm_host.h  |    3 +
 arch/powerpc/include/asm/kvm_ppc.h   |    3 +
 arch/powerpc/include/asm/reg_booke.h |    7 ++
 arch/powerpc/kvm/booke.c             |  140 ++++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/booke_emulate.c     |    8 ++
 arch/powerpc/kvm/powerpc.c           |   19 +++++-
 include/linux/kvm.h                  |    2 +
 include/linux/kvm_host.h             |    1 +
 8 files changed, 182 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 50ea12f..43cac48 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -467,6 +467,8 @@ struct kvm_vcpu_arch {
 	ulong fault_esr;
 	ulong queued_dear;
 	ulong queued_esr;
+	spinlock_t wdt_lock;
+	struct timer_list wdt_timer;
 	u32 tlbcfg[4];
 	u32 mmucfg;
 	u32 epr;
@@ -482,6 +484,7 @@ struct kvm_vcpu_arch {
 	u8 osi_needed;
 	u8 osi_enabled;
 	u8 papr_enabled;
+	u8 watchdog_enabled;
 	u8 sane;
 	u8 cpu_type;
 	u8 hcall_needed;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0124937..e5cf4b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(unsigned long data);
+extern void kvmppc_watchdog_func(unsigned long data);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 
 /* Core-specific hooks */
@@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                        struct kvm_interrupt *irq);
 extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
                                          struct kvm_interrupt *irq);
+extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   unsigned int op, int *advance);
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2d916c4..e07e6af 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -539,6 +539,13 @@
 #define TCR_FIE		0x00800000	/* FIT Interrupt Enable */
 #define TCR_ARE		0x00400000	/* Auto Reload Enable */
 
+#ifdef CONFIG_E500
+#define TCR_GET_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
+			      (((tcr) & 0x1E0000) >> 15))
+#else
+#define TCR_GET_WP(tcr)  (((tcr) & 0xC0000000) >> 30)
+#endif
+
 /* Bit definitions for the TSR. */
 #define TSR_ENW		0x80000000	/* Enable Next Watchdog */
 #define TSR_WIS		0x40000000	/* WDT Interrupt Status */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097..71b4ce9 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
 }
 
+void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
+}
+
+void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
+}
+
 static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
 {
 #ifdef CONFIG_KVM_BOOKE_HV
@@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		msr_mask = MSR_CE | MSR_ME | MSR_DE;
 		int_class = INT_CLASS_NONCRIT;
 		break;
+	case BOOKE_IRQPRIO_WATCHDOG:
 	case BOOKE_IRQPRIO_CRITICAL:
 	case BOOKE_IRQPRIO_DBELL_CRIT:
 		allowed = vcpu->arch.shared->msr & MSR_CE;
@@ -404,12 +415,113 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	return allowed;
 }
 
+/*
+ * Return the number of jiffies until the next timeout.  If the timeout is
+ * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA
+ * because the larger value can break the timer APIs.
+ */
+static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
+{
+	u64 tb, wdt_tb, wdt_ticks = 0;
+	u64 nr_jiffies = 0;
+	u32 period = TCR_GET_WP(vcpu->arch.tcr);
+
+	wdt_tb = 1ULL << (63 - period);
+	tb = get_tb();
+	/*
+	 * The watchdog timeout will hapeen when TB bit corresponding
+	 * to watchdog will toggle from 0 to 1.
+	 */
+	if (tb & wdt_tb)
+		wdt_ticks = wdt_tb;
+
+	wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
+
+	/* Convert timebase ticks to jiffies */
+	nr_jiffies = wdt_ticks;
+
+	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
+		nr_jiffies++;
+
+	return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA);
+}
+
+static void arm_next_watchdog(struct kvm_vcpu *vcpu)
+{
+	unsigned long nr_jiffies;
+
+	spin_lock(&vcpu->arch.wdt_lock);
+	nr_jiffies = watchdog_next_timeout(vcpu);
+	/*
+	 * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA
+	 * then do not run the watchdog timer as this can break timer APIs.
+	 */
+	if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
+		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
+	else
+		del_timer(&vcpu->arch.wdt_timer);
+	spin_unlock(&vcpu->arch.wdt_lock);
+}
+
+void kvmppc_watchdog_func(unsigned long data)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+	u32 tsr, new_tsr;
+	int final;
+
+	do {
+		new_tsr = tsr = vcpu->arch.tsr;
+		final = 0;
+
+		/* Time out event */
+		if (tsr & TSR_ENW) {
+			if (tsr & TSR_WIS)
+				final = 1;
+			else
+				new_tsr = tsr | TSR_WIS;
+		} else {
+			new_tsr = tsr | TSR_ENW;
+		}
+	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
+
+	if (new_tsr & TSR_WIS) {
+		smp_wmb();
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * If this is final watchdog expiry and some action is required
+	 * then exit to userspace.
+	 */
+	if (final && (vcpu->arch.tcr & TCR_WRC_MASK) &&
+	    vcpu->arch.watchdog_enabled) { 
+		smp_wmb();
+		kvm_make_request(KVM_REQ_WATCHDOG, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * Stop running the watchdog timer after final expiration to
+	 * prevent the host from being flooded with timers if the
+	 * guest sets a short period.
+	 * Timers will resume when TSR/TCR is updated next time.
+	 */
+	if (!final)
+		arm_next_watchdog(vcpu);
+}
+
 static void update_timer_ints(struct kvm_vcpu *vcpu)
 {
 	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
 		kvmppc_core_queue_dec(vcpu);
 	else
 		kvmppc_core_dequeue_dec(vcpu);
+
+	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
+		kvmppc_core_queue_watchdog(vcpu);
+	else
+		kvmppc_core_dequeue_watchdog(vcpu);
 }
 
 static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
@@ -516,6 +628,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
+	if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) &&
+	    (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) {
+		kvm_run->exit_reason = KVM_EXIT_WATCHDOG;
+		ret = 0;
+		goto out;
+	}
+
 	kvm_guest_enter();
 
 #ifdef CONFIG_PPC_FPU
@@ -978,6 +1097,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		}
 	}
 
+	if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) &&
+	    (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) {
+		run->exit_reason = KVM_EXIT_WATCHDOG;
+		r = RESUME_HOST | (r & RESUME_FLAG_NV);
+	}
+
 	return r;
 }
 
@@ -1106,7 +1231,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
 	}
 
 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
+		u32 old_tsr = vcpu->arch.tsr;
+
 		vcpu->arch.tsr = sregs->u.e.tsr;
+
+		if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
+			arm_next_watchdog(vcpu);
+
 		update_timer_ints(vcpu);
 	}
 
@@ -1267,6 +1398,7 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
 void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
 {
 	vcpu->arch.tcr = new_tcr;
+	arm_next_watchdog(vcpu);
 	update_timer_ints(vcpu);
 }
 
@@ -1281,6 +1413,14 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 {
 	clear_bits(tsr_bits, &vcpu->arch.tsr);
+
+	/*
+	 * We may have stopped the watchdog due to
+	 * being stuck on final expiration.
+	 */
+	if (tsr_bits & (TSR_ENW | TSR_WIS))
+		arm_next_watchdog(vcpu);
+
 	update_timer_ints(vcpu);
 }
 
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 12834bb..5a66ade 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -145,6 +145,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		kvmppc_clr_tsr_bits(vcpu, spr_val);
 		break;
 	case SPRN_TCR:
+		/*
+		 * WRC is a 2-bit field that is supposed to preserve its
+		 * value once written to non-zero.
+		 */
+		if (vcpu->arch.tcr & TCR_WRC_MASK) {
+			spr_val &= ~TCR_WRC_MASK;
+			spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
+		}
 		kvmppc_set_tcr(vcpu, spr_val);
 		break;
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 87f4dc8..63457cb 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -220,6 +220,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	switch (ext) {
 #ifdef CONFIG_BOOKE
 	case KVM_CAP_PPC_BOOKE_SREGS:
+	case KVM_CAP_PPC_BOOKE_WATCHDOG:
 #else
 	case KVM_CAP_PPC_SEGSTATE:
 	case KVM_CAP_PPC_HIOR:
@@ -386,13 +387,23 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_KVM_EXIT_TIMING
 	mutex_init(&vcpu->arch.exit_timing_lock);
 #endif
-
+#ifdef CONFIG_BOOKE
+	spin_lock_init(&vcpu->arch.wdt_lock);
+	/* setup watchdog timer once */
+	setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
+		    (unsigned long)vcpu);
+#endif
 	return 0;
 }
 
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	kvmppc_mmu_destroy(vcpu);
+#ifdef CONFIG_BOOKE
+	spin_lock(&vcpu->arch.wdt_lock);
+	del_timer(&vcpu->arch.wdt_timer);
+	spin_unlock(&vcpu->arch.wdt_lock);
+#endif
 }
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -637,6 +648,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		vcpu->arch.papr_enabled = true;
 		break;
+#ifdef CONFIG_BOOKE
+	case KVM_CAP_PPC_BOOKE_WATCHDOG:
+		r = 0;
+		vcpu->arch.watchdog_enabled = true;
+		break;
+#endif
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB: {
 		struct kvm_config_tlb cfg;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2ce09aa..9ff5aa5 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -163,6 +163,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_OSI              18
 #define KVM_EXIT_PAPR_HCALL	  19
 #define KVM_EXIT_S390_UCONTROL	  20
+#define KVM_EXIT_WATCHDOG         21
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -618,6 +619,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
 #define KVM_CAP_PPC_ALLOC_HTAB 80
+#define KVM_CAP_PPC_BOOKE_WATCHDOG 81
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 27ac8a4..41d32be 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -69,6 +69,7 @@
 #define KVM_REQ_IMMEDIATE_EXIT    15
 #define KVM_REQ_PMU               16
 #define KVM_REQ_PMI               17
+#define KVM_REQ_WATCHDOG          18
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
-- 
1.7.0.4



^ permalink raw reply related	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2012-07-20  5:11 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-06-28  6:17 [PATCH 2/2] KVM: PPC: booke: Add watchdog emulation Bharat Bhushan
2012-06-28  6:29 ` Bharat Bhushan
2012-07-06 13:17 ` Alexander Graf
2012-07-06 13:17   ` Alexander Graf
2012-07-06 23:37   ` Scott Wood
2012-07-06 23:37     ` Scott Wood
2012-07-07  7:50     ` Alexander Graf
2012-07-07  7:50       ` Alexander Graf
2012-07-09  5:13       ` Bhushan Bharat-R65777
2012-07-09  5:13         ` Bhushan Bharat-R65777
2012-07-09  8:49         ` Alexander Graf
2012-07-09  8:49           ` Alexander Graf
2012-07-09 14:28         ` Scott Wood
2012-07-09 14:28           ` Scott Wood
2012-07-09 14:36           ` Alexander Graf
2012-07-09 14:36             ` Alexander Graf
2012-07-09 14:44             ` Bhushan Bharat-R65777
2012-07-09 14:44               ` Bhushan Bharat-R65777
2012-07-09 17:09       ` Scott Wood
2012-07-09 17:09         ` Scott Wood
2012-07-09 17:29         ` Alexander Graf
2012-07-09 17:29           ` Alexander Graf
2012-07-09  6:43   ` Bhushan Bharat-R65777
2012-07-09  6:43     ` Bhushan Bharat-R65777
2012-07-09  9:11     ` Alexander Graf
2012-07-09  9:11       ` Alexander Graf
2012-07-20  4:59 Bharat Bhushan
2012-07-20  5:11 ` Bharat Bhushan
2012-07-20  4:59 ` Bhushan Bharat-R65777

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.