* [RFC v2 1/7] change headers preparing for steal time
2010-08-30 17:06 [RFC v2 0/7] kvm steal time implementation proposal Glauber Costa
@ 2010-08-30 17:06 ` Glauber Costa
2010-08-30 17:06 ` [RFC v2 2/7] always call kvm_write_guest Glauber Costa
` (5 subsequent siblings)
6 siblings, 0 replies; 9+ messages in thread
From: Glauber Costa @ 2010-08-30 17:06 UTC (permalink / raw)
To: kvm; +Cc: avi, zamsden, mtosatti, riel, peterz, mingo, jeremy
This guest/host common patch prepares infrastructure for
the steal time implementation. Some constants are added,
and a name change happens in pvclock vcpu structure.
Signed-off-by: Glauber Costa <glommer@redhat.com>
---
arch/x86/include/asm/kvm_para.h | 1 +
arch/x86/include/asm/pvclock-abi.h | 4 +++-
2 files changed, 4 insertions(+), 1 deletions(-)
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 05eba5e..1759c81 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -25,6 +25,7 @@
* in pvclock structure. If no bits are set, all flags are ignored.
*/
#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
+#define KVM_FEATURE_CLOCKSOURCE_STEAL_BIT 25
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 35f2d19..417061b 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -24,7 +24,7 @@
struct pvclock_vcpu_time_info {
u32 version;
- u32 pad0;
+ u32 steal_time;
u64 tsc_timestamp;
u64 system_time;
u32 tsc_to_system_mul;
@@ -40,5 +40,7 @@ struct pvclock_wall_clock {
} __attribute__((__packed__));
#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
+#define PVCLOCK_STEAL_BIT (2 << 0)
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PVCLOCK_ABI_H */
--
1.6.2.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC v2 2/7] always call kvm_write_guest
2010-08-30 17:06 [RFC v2 0/7] kvm steal time implementation proposal Glauber Costa
2010-08-30 17:06 ` [RFC v2 1/7] change headers preparing for steal time Glauber Costa
@ 2010-08-30 17:06 ` Glauber Costa
2010-08-30 17:06 ` [RFC v2 3/7] measure time out of guest Glauber Costa
` (4 subsequent siblings)
6 siblings, 0 replies; 9+ messages in thread
From: Glauber Costa @ 2010-08-30 17:06 UTC (permalink / raw)
To: kvm; +Cc: avi, zamsden, mtosatti, riel, peterz, mingo, jeremy
This patch makes sure that kvm_write_guest_time() is called
on every guest entry. We do not, however, want to update
the whole structure every time, so we add a flag
that tells it whether or not to do the whole operation.
Signed-off-by: Glauber Costa <glommer@redhat.com>
---
arch/x86/kvm/x86.c | 10 ++++++++--
1 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e7e3b50..f4b77ea 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -895,7 +895,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
-static void kvm_write_guest_time(struct kvm_vcpu *v)
+static void kvm_write_guest_time(struct kvm_vcpu *v, bool update_time)
{
struct timespec ts;
unsigned long flags;
@@ -906,6 +906,9 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
if ((!vcpu->time_page))
return;
+ if (!update_time)
+ return;
+
this_tsc_khz = get_cpu_var(cpu_tsc_khz);
if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
@@ -4671,6 +4674,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
+ bool update_kvm_time = 0;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
vcpu->run->request_interrupt_window;
@@ -4680,7 +4684,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
- kvm_write_guest_time(vcpu);
+ update_kvm_time = 1;
if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
@@ -4701,6 +4705,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
}
+ kvm_write_guest_time(vcpu, update_kvm_time);
+
r = kvm_mmu_reload(vcpu);
if (unlikely(r))
goto out;
--
1.6.2.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC v2 3/7] measure time out of guest
2010-08-30 17:06 [RFC v2 0/7] kvm steal time implementation proposal Glauber Costa
2010-08-30 17:06 ` [RFC v2 1/7] change headers preparing for steal time Glauber Costa
2010-08-30 17:06 ` [RFC v2 2/7] always call kvm_write_guest Glauber Costa
@ 2010-08-30 17:06 ` Glauber Costa
2010-08-30 17:06 ` [RFC v2 4/7] change kernel accounting to include steal time Glauber Costa
` (3 subsequent siblings)
6 siblings, 0 replies; 9+ messages in thread
From: Glauber Costa @ 2010-08-30 17:06 UTC (permalink / raw)
To: kvm; +Cc: avi, zamsden, mtosatti, riel, peterz, mingo, jeremy
By measuring the time between a vcpu_put and a vcpu_load, we can
estimate how much time the guest spent off the cpu.
This is exported to the guest at every clock update.
Signed-off-by: Glauber Costa <glommer@redhat.com>
---
arch/x86/include/asm/kvm_host.h | 2 ++
arch/x86/kvm/x86.c | 13 +++++++++++--
2 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 502e53f..bc28aff 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -364,6 +364,8 @@ struct kvm_vcpu_arch {
u64 hv_vapic;
cpumask_var_t wbinvd_dirty_mask;
+ u64 time_out;
+ u64 last_time_out;
};
struct kvm_arch {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f4b77ea..9d08032 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -906,6 +906,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v, bool update_time)
if ((!vcpu->time_page))
return;
+ vcpu->hv_clock.steal_time = vcpu->time_out / 1000000;
if (!update_time)
return;
@@ -928,8 +929,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v, bool update_time)
vcpu->hv_clock.system_time = ts.tv_nsec +
(NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
- vcpu->hv_clock.flags = 0;
-
+ vcpu->hv_clock.flags = PVCLOCK_STEAL_BIT;
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
@@ -1801,6 +1801,8 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ s64 now;
+
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
if (kvm_x86_ops->has_wbinvd_exit())
@@ -1818,12 +1820,19 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
per_cpu(cpu_tsc_khz, cpu) = khz;
}
kvm_request_guest_time_update(vcpu);
+
+ now = getnsboottime();
+
+ if (vcpu->arch.last_time_out != 0)
+ vcpu->arch.time_out += now - vcpu->arch.last_time_out;
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
+
+ vcpu->arch.last_time_out = getnsboottime();
}
static int is_efer_nx(void)
--
1.6.2.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC v2 4/7] change kernel accounting to include steal time
2010-08-30 17:06 [RFC v2 0/7] kvm steal time implementation proposal Glauber Costa
` (2 preceding siblings ...)
2010-08-30 17:06 ` [RFC v2 3/7] measure time out of guest Glauber Costa
@ 2010-08-30 17:06 ` Glauber Costa
2010-08-30 17:06 ` [RFC v2 5/7] kvm steal time implementation Glauber Costa
` (2 subsequent siblings)
6 siblings, 0 replies; 9+ messages in thread
From: Glauber Costa @ 2010-08-30 17:06 UTC (permalink / raw)
To: kvm; +Cc: avi, zamsden, mtosatti, riel, peterz, mingo, jeremy
This patch proposes a common steal time implementation. When no
steal time is accounted, we just add a branch to the current
accounting code, that shouldn't add much overhead.
When we do want to register steal time, we proceed as follows:
- if we would account user or system time in this tick, and there is
out-of-cpu time registered, we skip it altogether, and account steal
time only.
- if we would account user or system time in this tick, and we got the
cpu for the whole slice, we proceed normally.
- if we are idle in this tick, we flush out-of-cpu time to give it the
chance to update whatever last-measure internal variable it may have.
This approach is simple, but proved to work well for my test scenarios.
In a UP guest on a UP host, a cpu-hog in both guest and host shows
~ 50 % steal time. Steal time is also accounted proportionally if
nice values are given to the host cpu-hog.
A cpu-hog in the host with no load in the guest produces 0 % steal time,
with 100 % idle, as one would expect.
Signed-off-by: Glauber Costa <glommer@redhat.com>
---
include/linux/sched.h | 1 +
kernel/sched.c | 29 +++++++++++++++++++++++++++++
2 files changed, 30 insertions(+), 0 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0478888..e571ddd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -312,6 +312,7 @@ long io_schedule_timeout(long timeout);
extern void cpu_init (void);
extern void trap_init(void);
extern void update_process_times(int user);
+extern cputime_t (*hypervisor_steal_time)(void);
extern void scheduler_tick(void);
extern void sched_show_task(struct task_struct *p);
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a880..9695c92 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3157,6 +3157,16 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
return ns;
}
+cputime_t (*hypervisor_steal_time)(void) = NULL;
+
+static inline cputime_t get_steal_time_from_hypervisor(void)
+{
+ if (!hypervisor_steal_time)
+ return 0;
+ return hypervisor_steal_time();
+}
+
+
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
@@ -3169,6 +3179,12 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
cputime64_t tmp;
+ tmp = get_steal_time_from_hypervisor();
+ if (tmp) {
+ account_steal_time(tmp);
+ return;
+ }
+
/* Add user time to process. */
p->utime = cputime_add(p->utime, cputime);
p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
@@ -3234,6 +3250,12 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
return;
}
+ tmp = get_steal_time_from_hypervisor();
+ if (tmp) {
+ account_steal_time(tmp);
+ return;
+ }
+
/* Add system time to process. */
p->stime = cputime_add(p->stime, cputime);
p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
@@ -3276,6 +3298,13 @@ void account_idle_time(cputime_t cputime)
cputime64_t cputime64 = cputime_to_cputime64(cputime);
struct rq *rq = this_rq();
+ /*
+ * if we're idle, we don't account it as steal time, since we did
+ * not want to run anyway. We do call the steal function, however, to
+ * give the guest the chance to flush its internal buffers
+ */
+ get_steal_time_from_hypervisor();
+
if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
else
--
1.6.2.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC v2 5/7] kvm steal time implementation
2010-08-30 17:06 [RFC v2 0/7] kvm steal time implementation proposal Glauber Costa
` (3 preceding siblings ...)
2010-08-30 17:06 ` [RFC v2 4/7] change kernel accounting to include steal time Glauber Costa
@ 2010-08-30 17:06 ` Glauber Costa
2010-08-30 17:06 ` [RFC v2 6/7] touch softlockup watchdog Glauber Costa
2010-08-30 17:06 ` [RFC v2 7/7] tell guest about steal time feature Glauber Costa
6 siblings, 0 replies; 9+ messages in thread
From: Glauber Costa @ 2010-08-30 17:06 UTC (permalink / raw)
To: kvm; +Cc: avi, zamsden, mtosatti, riel, peterz, mingo, jeremy
This is the proposed kvm-side steal time implementation.
It is migration safe, as it checks flags at every read.
Signed-off-by: Glauber Costa <glommer@redhat.com>
---
arch/x86/kernel/kvmclock.c | 35 +++++++++++++++++++++++++++++++++++
1 files changed, 35 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index eb9b76c..a1f4852 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -18,6 +18,8 @@
#include <linux/clocksource.h>
#include <linux/kvm_para.h>
+#include <linux/kernel_stat.h>
+#include <linux/sched.h>
#include <asm/pvclock.h>
#include <asm/msr.h>
#include <asm/apic.h>
@@ -41,6 +43,7 @@ early_param("no-kvmclock", parse_no_kvmclock);
/* The hypervisor will put information about time periodically here */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+static DEFINE_PER_CPU(u64, steal_info);
static struct pvclock_wall_clock wall_clock;
/*
@@ -82,6 +85,32 @@ static cycle_t kvm_clock_read(void)
return ret;
}
+static DEFINE_PER_CPU(u64, steal_info);
+
+cputime_t kvm_get_steal_time(void)
+{
+ u64 delta = 0;
+ u64 *last_steal_info, this_steal_info;
+ struct pvclock_vcpu_time_info *src;
+
+ src = &get_cpu_var(hv_clock);
+ if (!(src->flags & PVCLOCK_STEAL_BIT))
+ goto out;
+
+ this_steal_info = src->steal_time;
+ put_cpu_var(hv_clock);
+
+ last_steal_info = &get_cpu_var(steal_info);
+
+ delta = this_steal_info - *last_steal_info;
+
+ *last_steal_info = this_steal_info;
+ put_cpu_var(steal_info);
+
+out:
+ return msecs_to_cputime(delta);
+}
+
static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
{
return kvm_clock_read();
@@ -134,6 +163,8 @@ static int kvm_register_clock(char *txt)
printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
cpu, high, low, txt);
+ per_cpu(steal_info, cpu) = 0;
+
return native_write_msr_safe(msr_kvm_system_time, low, high);
}
@@ -218,4 +249,8 @@ void __init kvmclock_init(void)
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STEAL_BIT))
+ hypervisor_steal_time = kvm_get_steal_time;
}
--
1.6.2.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC v2 6/7] touch softlockup watchdog
2010-08-30 17:06 [RFC v2 0/7] kvm steal time implementation proposal Glauber Costa
` (4 preceding siblings ...)
2010-08-30 17:06 ` [RFC v2 5/7] kvm steal time implementation Glauber Costa
@ 2010-08-30 17:06 ` Glauber Costa
2010-08-30 17:06 ` [RFC v2 7/7] tell guest about steal time feature Glauber Costa
6 siblings, 0 replies; 9+ messages in thread
From: Glauber Costa @ 2010-08-30 17:06 UTC (permalink / raw)
To: kvm; +Cc: avi, zamsden, mtosatti, riel, peterz, mingo, jeremy
With a reliable steal time mechanism, we can tell when we have
been off the cpu for a long time, and distinguish that from the
case where we really hit a softlockup.
When we were off the cpu, the watchdog is fed, making
bogus softlockups disappear.
Signed-off-by: Glauber Costa <glommer@redhat.com>
---
arch/x86/kernel/kvmclock.c | 5 +++++
1 files changed, 5 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index a1f4852..d217475 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -91,6 +91,7 @@ cputime_t kvm_get_steal_time(void)
{
u64 delta = 0;
u64 *last_steal_info, this_steal_info;
+ int touch_wd;
struct pvclock_vcpu_time_info *src;
src = &get_cpu_var(hv_clock);
@@ -104,6 +105,10 @@ cputime_t kvm_get_steal_time(void)
delta = this_steal_info - *last_steal_info;
+ touch_wd = softlockup_thresh * 1000UL;
+ if ((touch_wd > 0) && (delta > touch_wd))
+ touch_softlockup_watchdog();
+
*last_steal_info = this_steal_info;
put_cpu_var(steal_info);
--
1.6.2.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC v2 7/7] tell guest about steal time feature
2010-08-30 17:06 [RFC v2 0/7] kvm steal time implementation proposal Glauber Costa
` (5 preceding siblings ...)
2010-08-30 17:06 ` [RFC v2 6/7] touch softlockup watchdog Glauber Costa
@ 2010-08-30 17:06 ` Glauber Costa
6 siblings, 0 replies; 9+ messages in thread
From: Glauber Costa @ 2010-08-30 17:06 UTC (permalink / raw)
To: kvm; +Cc: avi, zamsden, mtosatti, riel, peterz, mingo, jeremy
The guest kernel will only activate steal time if the host exports it.
Advertise the feature to the guest through the KVM CPUID feature bits.
---
arch/x86/kvm/x86.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9d08032..f6a2d74 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2103,7 +2103,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
(1 << KVM_FEATURE_NOP_IO_DELAY) |
(1 << KVM_FEATURE_CLOCKSOURCE2) |
- (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+ (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+ (1 << KVM_FEATURE_CLOCKSOURCE_STEAL_BIT);
entry->ebx = 0;
entry->ecx = 0;
entry->edx = 0;
--
1.6.2.2
^ permalink raw reply related [flat|nested] 9+ messages in thread