linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 2/9] Sched clock paravirt op fix.patch
@ 2007-03-02  2:54 Zachary Amsden
  2007-03-13 14:01 ` Andi Kleen
  0 siblings, 1 reply; 9+ messages in thread
From: Zachary Amsden @ 2007-03-02  2:54 UTC (permalink / raw)
  To: Andi Kleen, Linus Torvalds, Rusty Russell, Jeremy Fitzhardinge,
	Chris Wright, Dan Hecht, Dan Arai, Andrew Morton,
	Virtualization Mailing List, Linux Kernel Mailing List,
	Zachary Amsden
  Cc: Daniel Walker, Daniel Hecht

The custom_sched_clock hook is broken.  The result from sched_clock needs to be
in nanoseconds, not in CPU cycles.  The TSC is insufficient for this purpose,
because TSC is poorly defined in a virtual environment, and mostly represents
real world time instead of scheduled process time (which can be interrupted
without notice when a virtual machine is descheduled).

To make the scheduler consistent, we must expose a different nature of time,
that is scheduled time.  So deprecate this custom_sched_clock hack and turn it
into a paravirt-op, as it should have been all along.  This allows the tsc.c
code which converts cycles to nanoseconds to be shared by all paravirt-ops
backends.

It is unfortunate to add a new paravirt-op, but this is a very distinct
abstraction which is clearly different for all virtual machine implementations,
and it gets rid of an ugly indirect function which I ashamedly admit I hacked
in to try to get this to work earlier, and then even got in the wrong units.

Please apply.

Signed-off-by: Zachary Amsden <zach@vmware.com>

diff -r d58e6ddfdfa9 arch/i386/kernel/paravirt.c
--- a/arch/i386/kernel/paravirt.c	Thu Feb 15 23:52:41 2007 -0800
+++ b/arch/i386/kernel/paravirt.c	Fri Feb 16 00:04:39 2007 -0800
@@ -32,6 +32,7 @@
 #include <asm/fixmap.h>
 #include <asm/apic.h>
 #include <asm/tlbflush.h>
+#include <asm/timer.h>
 
 /* nop stub */
 static void native_nop(void)
@@ -520,6 +521,7 @@ struct paravirt_ops paravirt_ops = {
 	.write_msr = native_write_msr,
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
+	.get_scheduled_cycles = native_read_tsc,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
 	.load_gdt = native_load_gdt,
diff -r d58e6ddfdfa9 arch/i386/kernel/tsc.c
--- a/arch/i386/kernel/tsc.c	Thu Feb 15 23:52:41 2007 -0800
+++ b/arch/i386/kernel/tsc.c	Fri Feb 16 00:06:34 2007 -0800
@@ -14,6 +14,7 @@
 #include <asm/delay.h>
 #include <asm/tsc.h>
 #include <asm/io.h>
+#include <asm/timer.h>
 
 #include "mach_timer.h"
 
@@ -108,9 +109,6 @@ unsigned long long sched_clock(void)
 {
 	unsigned long long this_offset;
 
-	if (unlikely(custom_sched_clock))
-		return (*custom_sched_clock)();
-
 	/*
 	 * Fall back to jiffies if there's no TSC available:
 	 */
@@ -119,7 +117,7 @@ unsigned long long sched_clock(void)
 		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 
 	/* read the Time Stamp Counter: */
-	rdtscll(this_offset);
+	get_scheduled_cycles(this_offset);
 
 	/* return the value in ns */
 	return cycles_2_ns(this_offset);
diff -r d58e6ddfdfa9 arch/i386/kernel/vmi.c
--- a/arch/i386/kernel/vmi.c	Thu Feb 15 23:52:41 2007 -0800
+++ b/arch/i386/kernel/vmi.c	Fri Feb 16 00:02:48 2007 -0800
@@ -873,7 +873,7 @@ static inline int __init activate_vmi(vo
 		paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm;
 		paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm;
 #endif
-		custom_sched_clock = vmi_sched_clock;
+		paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
 	}
 	if (!disable_noidle)
 		para_fill(safe_halt, Halt);
diff -r d58e6ddfdfa9 arch/i386/kernel/vmitime.c
--- a/arch/i386/kernel/vmitime.c	Thu Feb 15 23:52:41 2007 -0800
+++ b/arch/i386/kernel/vmitime.c	Fri Feb 16 00:02:48 2007 -0800
@@ -172,7 +172,7 @@ int vmi_set_wallclock(unsigned long now)
 	return -1;
 }
 
-unsigned long long vmi_sched_clock(void)
+unsigned long long vmi_get_sched_cycles(void)
 {
 	return read_available_cycles();
 }
diff -r d58e6ddfdfa9 include/asm-i386/paravirt.h
--- a/include/asm-i386/paravirt.h	Thu Feb 15 23:52:41 2007 -0800
+++ b/include/asm-i386/paravirt.h	Fri Feb 16 00:07:22 2007 -0800
@@ -94,6 +94,7 @@ struct paravirt_ops
 
 	u64 (*read_tsc)(void);
 	u64 (*read_pmc)(void);
+ 	u64 (*get_scheduled_cycles)(void);
 
 	void (*load_tr_desc)(void);
 	void (*load_gdt)(const struct Xgt_desc_struct *);
@@ -273,6 +274,8 @@ static inline void halt(void)
 
 #define rdtscll(val) (val = paravirt_ops.read_tsc())
 
+#define get_scheduled_cycles(val) (val = paravirt_ops.get_scheduled_cycles())
+
 #define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
 
 #define rdpmc(counter,low,high) do {				\
diff -r d58e6ddfdfa9 include/asm-i386/time.h
--- a/include/asm-i386/time.h	Thu Feb 15 23:52:41 2007 -0800
+++ b/include/asm-i386/time.h	Fri Feb 16 00:02:48 2007 -0800
@@ -30,7 +30,6 @@ static inline int native_set_wallclock(u
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
-extern unsigned long long native_sched_clock(void);
 #else /* !CONFIG_PARAVIRT */
 
 #define get_wallclock() native_get_wallclock()
diff -r d58e6ddfdfa9 include/asm-i386/timer.h
--- a/include/asm-i386/timer.h	Thu Feb 15 23:52:41 2007 -0800
+++ b/include/asm-i386/timer.h	Fri Feb 16 00:05:13 2007 -0800
@@ -4,13 +4,19 @@
 #include <linux/pm.h>
 
 #define TICK_SIZE (tick_nsec / 1000)
+
 void setup_pit_timer(void);
+unsigned long long native_sched_clock(void);
+
 /* Modifiers for buggy PIT handling */
 extern int pit_latch_buggy;
 extern int timer_ack;
 extern int no_timer_check;
-extern unsigned long long (*custom_sched_clock)(void);
 extern int no_sync_cmos_clock;
 extern int recalibrate_cpu_khz(void);
 
+#ifndef CONFIG_PARAVIRT
+#define get_scheduled_cycles(val) rdtscll(val)
 #endif
+
+#endif
diff -r d58e6ddfdfa9 include/asm-i386/vmi_time.h
--- a/include/asm-i386/vmi_time.h	Thu Feb 15 23:52:41 2007 -0800
+++ b/include/asm-i386/vmi_time.h	Fri Feb 16 00:02:48 2007 -0800
@@ -49,7 +49,7 @@ extern void __init vmi_time_init(void);
 extern void __init vmi_time_init(void);
 extern unsigned long vmi_get_wallclock(void);
 extern int vmi_set_wallclock(unsigned long now);
-extern unsigned long long vmi_sched_clock(void);
+extern unsigned long long vmi_get_sched_cycles(void);
 
 #ifdef CONFIG_X86_LOCAL_APIC
 extern void __init vmi_timer_setup_boot_alarm(void);

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/9] Sched clock paravirt op fix.patch
  2007-03-02  2:54 [PATCH 2/9] Sched clock paravirt op fix.patch Zachary Amsden
@ 2007-03-13 14:01 ` Andi Kleen
  2007-03-13 15:26   ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 9+ messages in thread
From: Andi Kleen @ 2007-03-13 14:01 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: Linus Torvalds, Rusty Russell, Jeremy Fitzhardinge, Chris Wright,
	Dan Hecht, Dan Arai, Andrew Morton, Virtualization Mailing List,
	Linux Kernel Mailing List, Daniel Walker

On Thu, Mar 01, 2007 at 06:54:24PM -0800, Zachary Amsden wrote:
> The custom_sched_clock hook is broken.  The result from sched_clock needs to be
> in nanoseconds, not in CPU cycles.  The TSC is insufficient for this purpose,
> because TSC is poorly defined in a virtual environment, and mostly represents
> real world time instead of scheduled process time (which can be interrupted
> without notice when a virtual machine is descheduled).
> 
> To make the scheduler consistent, we must expose a different nature of time,
> that is scheduled time.  So deprecate this custom_sched_clock hack and turn it
> into a paravirt-op, as it should have been all along.  This allows the tsc.c
> code which converts cycles to nanoseconds to be shared by all paravirt-ops
> backends.
> 
> It is unfortunate to add a new paravirt-op, but this is a very distinct
> abstraction which is clearly different for all virtual machine implementations,
> and it gets rid of an ugly indirect function which I ashamedly admit I hacked
> in to try to get this to work earlier, and then even got in the wrong units.
> 
> Please apply.

I think it's better to remove this completely and not allow paravirt
to hook into sched_clock.  After all a hypervisor stealing time is no
different from interrupts stealing time and we don't try to handle
that either. 

I will remove the custom hook.

-Andi

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/9] Sched clock paravirt op fix.patch
  2007-03-13 14:01 ` Andi Kleen
@ 2007-03-13 15:26   ` Jeremy Fitzhardinge
  2007-03-13 16:07     ` Chris Wright
  0 siblings, 1 reply; 9+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-13 15:26 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Zachary Amsden, Linus Torvalds, Rusty Russell, Chris Wright,
	Dan Hecht, Dan Arai, Andrew Morton, Virtualization Mailing List,
	Linux Kernel Mailing List, Daniel Walker

Andi Kleen wrote:
> On Thu, Mar 01, 2007 at 06:54:24PM -0800, Zachary Amsden wrote:
>   
>> The custom_sched_clock hook is broken.  The result from sched_clock needs to be
>> in nanoseconds, not in CPU cycles.  The TSC is insufficient for this purpose,
>> because TSC is poorly defined in a virtual environment, and mostly represents
>> real world time instead of scheduled process time (which can be interrupted
>> without notice when a virtual machine is descheduled).
>>
>> To make the scheduler consistent, we must expose a different nature of time,
>> that is scheduled time.  So deprecate this custom_sched_clock hack and turn it
>> into a paravirt-op, as it should have been all along.  This allows the tsc.c
>> code which converts cycles to nanoseconds to be shared by all paravirt-ops
>> backends.
>>
>> It is unfortunate to add a new paravirt-op, but this is a very distinct
>> abstraction which is clearly different for all virtual machine implementations,
>> and it gets rid of an ugly indirect function which I ashamedly admit I hacked
>> in to try to get this to work earlier, and then even got in the wrong units.
>>
>> Please apply.
>>     
>
> I think it's better to remove this completely and not allow paravirt
> to hook into sched_clock.  After all a hypervisor stealing time is no
> different from interrupts stealing time and we don't try to handle
> that either. 
>   

Well, its a matter of degree.  If your busy vcpu is sharing a real cpu
with 9 other busy vcpus, then you'll only see about 10% of the real cpu
cycles.  If we had legitimate loads in which 90% of the cpu were taken
up by interrupt processing, then I think we definitely would try to
account for that time properly.

SMM interrupts, in which the CPU is stolen from the kernel entirely, is
a better analogy.  But again, they're expected to be rare and
insignificant.  In a virtual machine, on the other hand, you could
expect the majority of cpu time to be stolen by other things running on
the same physical machine.

In other words, regardless of whether this particular pv_op lives or
dies, we're going to need to have to deal with stolen time properly.  I
think this hook is reasonable and useful step towards doing that.

    J

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/9] Sched clock paravirt op fix.patch
  2007-03-13 15:26   ` Jeremy Fitzhardinge
@ 2007-03-13 16:07     ` Chris Wright
  2007-03-13 16:16       ` Andi Kleen
  0 siblings, 1 reply; 9+ messages in thread
From: Chris Wright @ 2007-03-13 16:07 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Andi Kleen, Zachary Amsden, Linus Torvalds, Rusty Russell,
	Chris Wright, Dan Hecht, Dan Arai, Andrew Morton,
	Virtualization Mailing List, Linux Kernel Mailing List,
	Daniel Walker

* Jeremy Fitzhardinge (jeremy@goop.org) wrote:
> In other words, regardless of whether this particular pv_op lives or
> dies, we're going to need to have to deal with stolen time properly.  I
> think this hook is reasonable and useful step towards doing that.

Exactly.  Normal interrupts we can handle.  Having CPU completely
disappear for unkown time periods we can't, and will need to.

thanks,
-chris

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/9] Sched clock paravirt op fix.patch
  2007-03-13 16:07     ` Chris Wright
@ 2007-03-13 16:16       ` Andi Kleen
  2007-03-13 16:37         ` Rik van Riel
  0 siblings, 1 reply; 9+ messages in thread
From: Andi Kleen @ 2007-03-13 16:16 UTC (permalink / raw)
  To: Chris Wright
  Cc: Jeremy Fitzhardinge, Zachary Amsden, Linus Torvalds,
	Rusty Russell, Dan Hecht, Dan Arai, Andrew Morton,
	Virtualization Mailing List, Linux Kernel Mailing List,
	Daniel Walker

On Tue, Mar 13, 2007 at 09:07:09AM -0700, Chris Wright wrote:
> * Jeremy Fitzhardinge (jeremy@goop.org) wrote:
> > In other words, regardless of whether this particular pv_op lives or
> > dies, we're going to need to have to deal with stolen time properly.  I
> > think this hook is reasonable and useful step towards doing that.
> 
> Exactly.  Normal interrupts we can handle.  Having CPU completely
> disappear for unkown time periods we can't, and will need to.

But that is just what a interrupt is.

-Andi

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/9] Sched clock paravirt op fix.patch
  2007-03-13 16:16       ` Andi Kleen
@ 2007-03-13 16:37         ` Rik van Riel
  2007-03-13 20:56           ` Andi Kleen
  0 siblings, 1 reply; 9+ messages in thread
From: Rik van Riel @ 2007-03-13 16:37 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Chris Wright, Andrew Morton, Daniel Walker,
	Virtualization Mailing List, Linux Kernel Mailing List,
	Linus Torvalds

Andi Kleen wrote:
> On Tue, Mar 13, 2007 at 09:07:09AM -0700, Chris Wright wrote:
>> * Jeremy Fitzhardinge (jeremy@goop.org) wrote:
>>> In other words, regardless of whether this particular pv_op lives or
>>> dies, we're going to need to have to deal with stolen time properly.  I
>>> think this hook is reasonable and useful step towards doing that.
>> Exactly.  Normal interrupts we can handle.  Having CPU completely
>> disappear for unkown time periods we can't, and will need to.
> 
> But that is just what a interrupt is.

Interrupts tend to be reasonably short though.

Steal time can be several hypervisor/host time slices long.

As an aside, normal interrupts *are* accounted for separately
in /proc/stat, so why not steal time too?

-- 
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/9] Sched clock paravirt op fix.patch
  2007-03-13 16:37         ` Rik van Riel
@ 2007-03-13 20:56           ` Andi Kleen
  2007-03-13 21:05             ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 9+ messages in thread
From: Andi Kleen @ 2007-03-13 20:56 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Chris Wright, Andrew Morton, Daniel Walker,
	Virtualization Mailing List, Linux Kernel Mailing List,
	Linus Torvalds

On Tue, Mar 13, 2007 at 12:37:53PM -0400, Rik van Riel wrote:
> Andi Kleen wrote:
> >On Tue, Mar 13, 2007 at 09:07:09AM -0700, Chris Wright wrote:
> >>* Jeremy Fitzhardinge (jeremy@goop.org) wrote:
> >>>In other words, regardless of whether this particular pv_op lives or
> >>>dies, we're going to need to have to deal with stolen time properly.  I
> >>>think this hook is reasonable and useful step towards doing that.
> >>Exactly.  Normal interrupts we can handle.  Having CPU completely
> >>disappear for unkown time periods we can't, and will need to.
> >
> >But that is just what a interrupt is.
> 
> Interrupts tend to be reasonably short though.

It depends -- under heavy network load you can spend a long time
just processing interrupts.

-Andi


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/9] Sched clock paravirt op fix.patch
  2007-03-13 20:56           ` Andi Kleen
@ 2007-03-13 21:05             ` Jeremy Fitzhardinge
  2007-03-16 21:29               ` Matt Mackall
  0 siblings, 1 reply; 9+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-13 21:05 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Rik van Riel, Chris Wright, Andrew Morton, Linus Torvalds,
	Daniel Walker, Virtualization Mailing List,
	Linux Kernel Mailing List

Andi Kleen wrote:
> It depends -- under heavy network load you can spend a long time
> just processing interrupts.

Well, in that case you probably don't want to charge them to the process
which happens to be running at the time.

    J


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/9] Sched clock paravirt op fix.patch
  2007-03-13 21:05             ` Jeremy Fitzhardinge
@ 2007-03-16 21:29               ` Matt Mackall
  0 siblings, 0 replies; 9+ messages in thread
From: Matt Mackall @ 2007-03-16 21:29 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Andi Kleen, Rik van Riel, Chris Wright, Andrew Morton,
	Linus Torvalds, Daniel Walker, Virtualization Mailing List,
	Linux Kernel Mailing List

On Tue, Mar 13, 2007 at 02:05:11PM -0700, Jeremy Fitzhardinge wrote:
> Andi Kleen wrote:
> > It depends -- under heavy network load you can spend a long time
> > just processing interrupts.
> 
> Well, in that case you probably don't want to charge them to the process
> which happens to be running at the time.

It's actually a good first-order approximation of the right thing to
do, as it will generally correlate with the userspace process
servicing that network load.

If not (for instance, with routing loads), then you'd basically expect
the charge to get spread around evenly in proportion to an
application's CPU usage.

The -rt kernel pushes most of the interrupt work off to threads, which
of course follow the same scheduling and accounting rules as
everything other thread.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2007-03-16 21:43 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-02  2:54 [PATCH 2/9] Sched clock paravirt op fix.patch Zachary Amsden
2007-03-13 14:01 ` Andi Kleen
2007-03-13 15:26   ` Jeremy Fitzhardinge
2007-03-13 16:07     ` Chris Wright
2007-03-13 16:16       ` Andi Kleen
2007-03-13 16:37         ` Rik van Riel
2007-03-13 20:56           ` Andi Kleen
2007-03-13 21:05             ` Jeremy Fitzhardinge
2007-03-16 21:29               ` Matt Mackall

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).