* Linux spin lock enhancement on xen
@ 2010-08-17  1:33 Mukesh Rathor
  2010-08-17  7:33 ` Keir Fraser
                   ` (3 more replies)
  0 siblings, 4 replies; 23+ messages in thread
From: Mukesh Rathor @ 2010-08-17  1:33 UTC (permalink / raw)
  To: Xen-devel

[-- Attachment #1: Type: text/plain, Size: 859 bytes --]

Hi guys,

Check out the attached patches. I changed the spin lock semantics so the
lock contains the vcpu id of the vcpu holding it. This then tells xen
to make that vcpu runnable if not already running:

Linux:
   spin_lock()
       if (try_lock() == failed)
           loop X times
           if (try_lock() == failed)
               sched_op_yield_to(vcpu_num of holder)
               start again;
           endif
       endif

Xen:
     sched_op_yield_to:
          if (vcpu_running(vcpu_num arg))
              do nothing
          else
              vcpu_kick(vcpu_num arg)
              do_yield()
          endif
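
In plain C, the lock-word convention and the guest slow path come down to
roughly the following. This is an illustrative sketch only; the attached
patch does it with inline assembly, and my_vcpu_id() and COUNTMAX here
just stand in for read_pda(cpunumber) and the spin limit:

    /* Lock word: -1 == unlocked, otherwise the vcpu id of the holder. */

    /* Returns -1 if we took the lock, otherwise the vcpu id of the
     * holder observed after COUNTMAX unsuccessful spins. */
    static int attempt_lock(volatile int *slock, int my_vcpu)
    {
        int spins = 0;

        for (;;) {
            int old = *slock;

            if (old < 0) {
                /* Looks free: try to claim it with our vcpu id. */
                if (cmpxchg(slock, old, my_vcpu) == old)
                    return -1;                  /* acquired */
                continue;                       /* lost the race, retry */
            }
            if (++spins >= COUNTMAX)
                return old;                     /* give up, report the holder */
            cpu_relax();                        /* pause */
        }
    }

    static void spin_lock(volatile int *slock)
    {
        int holder;

        /* Spin for a while; if we give up, tell xen whom to run instead. */
        while ((holder = attempt_lock(slock, my_vcpu_id())) != -1)
            HYPERVISOR_yield_to(holder);        /* the new SCHEDOP_yield_to */
    }

    static void spin_unlock(volatile int *slock)
    {
        *slock = -1;                            /* back to "unlocked" */
    }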


In my worst case test scenario, I get about 20-36% improvement when the
system is two to three times over provisioned. 

Please provide any feedback. I would like to submit official patch for
SCHEDOP_yield_to in xen.

thanks,
Mukesh


[-- Attachment #2: spin-lin.diff --]
[-- Type: text/x-patch, Size: 5303 bytes --]

diff --git a/Makefile b/Makefile
index 8b0b42f..d8d1dbb 100644
--- a/Makefile
+++ b/Makefile
@@ -303,7 +303,8 @@ LINUXINCLUDE    := -Iinclude \
                    $(if $(KBUILD_SRC),-Iinclude2 -I$(srctree)/include) \
 		   -include include/linux/autoconf.h
 
-CPPFLAGS        := -D__KERNEL__ $(LINUXINCLUDE)
+# CPPFLAGS        := -D__KERNEL__ $(LINUXINCLUDE) 
+CPPFLAGS        := -D__KERNEL__ $(LINUXINCLUDE) -D _XEN_SPIN_LOCK
 
 CFLAGS          := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
                    -fno-strict-aliasing -fno-common -Wstrict-prototypes -Wundef -Werror-implicit-function-declaration -fno-delete-null-pointer-checks
diff --git a/include/asm-i386/mach-xen/asm/hypervisor.h b/include/asm-i386/mach-xen/asm/hypervisor.h
index 89cde62..a3806f1 100644
--- a/include/asm-i386/mach-xen/asm/hypervisor.h
+++ b/include/asm-i386/mach-xen/asm/hypervisor.h
@@ -143,6 +143,15 @@ HYPERVISOR_yield(
 }
 
 static inline int
+HYPERVISOR_yield_to(uint vcpu)
+{
+        struct sched_yield_to yield_to = { .version = 1, .vcpu_id = vcpu };
+	int rc = HYPERVISOR_sched_op(SCHEDOP_yield_to, &yield_to);
+        /* TBD: compat */
+	return rc;
+}
+
+static inline int
 HYPERVISOR_block(
 	void)
 {
diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h
index a8e3d89..c76e20f 100644
--- a/include/asm-x86_64/spinlock.h
+++ b/include/asm-x86_64/spinlock.h
@@ -16,6 +16,81 @@
  * (the type definitions are in asm/spinlock_types.h)
  */
 
+#ifdef _XEN_SPIN_LOCK
+#include <asm/hypervisor.h>
+
+#define __raw_spin_is_locked(x) \
+		(*(volatile signed int *)(&(x)->slock) >= 0)
+
+static inline int _attempt_raw_spin_lock(raw_spinlock_t *lock)
+{
+    const int COUNTMAX = 10000, myid=read_pda(cpunumber);
+    int oldval;
+
+    asm volatile 
+        ("1:  movsxl %1, %%rax              \n"
+         "    cmpq $0, %%rax             \n"
+         "    jge 4f                      \n"
+         "2:                              \n"
+         LOCK_PREFIX " cmpxchgl %k2, %1   \n"
+         "    jnz 4f                      \n"
+         "3:  /* exit */                  \n" 
+         LOCK_SECTION_START("")
+         "4:  xor %%rdx, %%rdx            \n"
+         "6:  inc %%rdx                   \n"
+         "    cmpl %k3,   %%edx           \n"
+         "    jge 3b                      \n"
+         "    pause                       \n"
+         "    movsxl %1, %%rax              \n"
+         "    cmpq $0, %%rax             \n"
+         "    jge 6b                      \n"
+         "    jmp 2b                       \n"
+        LOCK_SECTION_END
+
+         : "=&a" (oldval)
+         : "m" (lock->slock), "c" (myid), "g" (COUNTMAX)
+         : "rdx", "memory", "cc"
+        );
+        return oldval;
+}
+
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+    int rc, old_lock_holder;
+
+    do {
+        old_lock_holder = _attempt_raw_spin_lock(lock);
+
+        if (old_lock_holder >= 0)
+            if ((rc=HYPERVISOR_yield_to(old_lock_holder)) != 0)
+                printk("XEN: Yield failed. rc:%d\n", rc);
+    } while (old_lock_holder != -1);
+}
+
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+        int oldval, myid = read_pda(cpunumber);
+
+        __asm__ __volatile__ (
+                "movl $-1, %%eax                \n"
+                LOCK_PREFIX " cmpxchgl %k2, %1  \n"
+                : "=&a" (oldval) 
+                : "m" (lock->slock), "c" (myid)
+                : "memory", "cc"
+        );
+
+	return (oldval == -1);
+}
+
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+	__asm__ __volatile__ ("movl $-1, %0" : "=m"(lock->slock) : : "memory");
+}
+
+#else
+
 #define __raw_spin_is_locked(x) \
 		(*(volatile signed int *)(&(x)->slock) <= 0)
 
@@ -64,6 +139,8 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 	);
 }
 
+#endif
+
 #define __raw_spin_unlock_wait(lock) \
 	do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
 
@@ -124,4 +201,5 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
 				: "=m" (rw->lock) : : "memory");
 }
 
+
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-x86_64/spinlock_types.h b/include/asm-x86_64/spinlock_types.h
index 59efe84..6fb8da0 100644
--- a/include/asm-x86_64/spinlock_types.h
+++ b/include/asm-x86_64/spinlock_types.h
@@ -9,7 +9,11 @@ typedef struct {
 	volatile unsigned int slock;
 } raw_spinlock_t;
 
+#ifdef _XEN_SPIN_LOCK
+#define __RAW_SPIN_LOCK_UNLOCKED	{ -1 }
+#else
 #define __RAW_SPIN_LOCK_UNLOCKED	{ 1 }
+#endif
 
 typedef struct {
 	volatile unsigned int lock;
diff --git a/include/xen/interface/sched.h b/include/xen/interface/sched.h
index abf11cc..dc60001 100644
--- a/include/xen/interface/sched.h
+++ b/include/xen/interface/sched.h
@@ -90,6 +90,17 @@ DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t);
 #define SHUTDOWN_suspend    2  /* Clean up, save suspend info, kill.         */
 #define SHUTDOWN_crash      3  /* Tell controller we've crashed.             */
 
+
+/*
+ * Voluntarily yield the CPU to another given vcpu
+ * @arg == vcpu info.
+ */
+#define SCHEDOP_yield_to      5
+struct sched_yield_to {
+    unsigned int version;
+    unsigned int vcpu_id;
+};
+
 #endif /* __XEN_PUBLIC_SCHED_H__ */
 
 /*

[-- Attachment #3: spin-xen.diff --]
[-- Type: text/x-patch, Size: 1540 bytes --]

diff -r c840095b9359 xen/common/schedule.c
--- a/xen/common/schedule.c	Mon Jul 26 03:55:45 2010 -0700
+++ b/xen/common/schedule.c	Mon Aug 16 18:33:07 2010 -0700
@@ -627,6 +627,30 @@ ret_t do_sched_op(int cmd, XEN_GUEST_HAN
         break;
     }
 
+    case SCHEDOP_yield_to:
+    {
+        struct sched_yield_to yld_s;
+        struct vcpu *vp;
+        struct domain *dp = current->domain;
+
+        ret = -EFAULT;
+        if ( copy_from_guest(&yld_s, arg, 1) )
+            break;
+
+        ret = -EINVAL;
+        if (is_idle_vcpu(current) || yld_s.vcpu_id >= dp->max_vcpus)
+            break;
+
+        vp = dp->vcpu[yld_s.vcpu_id];
+        if (!vp->is_running) {
+            vcpu_kick(dp->vcpu[yld_s.vcpu_id]);
+            ret = do_yield();
+        } else
+            ret = 0;
+
+        break;
+    }
+
     case SCHEDOP_block:
     {
         ret = do_block();
diff -r c840095b9359 xen/include/public/sched.h
--- a/xen/include/public/sched.h	Mon Jul 26 03:55:45 2010 -0700
+++ b/xen/include/public/sched.h	Mon Aug 16 18:33:07 2010 -0700
@@ -108,6 +108,17 @@ DEFINE_XEN_GUEST_HANDLE(sched_remote_shu
 #define SHUTDOWN_suspend    2  /* Clean up, save suspend info, kill.         */
 #define SHUTDOWN_crash      3  /* Tell controller we've crashed.             */
 
+
+/*
+ * Voluntarily yield the CPU to another given vcpu
+ * @arg == vcpu info.
+ */
+#define SCHEDOP_yield_to      5
+struct sched_yield_to {
+    unsigned int version;
+    unsigned int vcpu_id;
+};
+
 #endif /* __XEN_PUBLIC_SCHED_H__ */
 
 /*



* Re: Linux spin lock enhancement on xen
  2010-08-17  1:33 Linux spin lock enhancement on xen Mukesh Rathor
@ 2010-08-17  7:33 ` Keir Fraser
  2010-08-17  7:53 ` Jan Beulich
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 23+ messages in thread
From: Keir Fraser @ 2010-08-17  7:33 UTC (permalink / raw)
  To: Mukesh Rathor, Xen-devel; +Cc: Jeremy Fitzhardinge

How does this compare with Jeremy's existing paravirtualised spinlocks in
pv_ops? They required no hypervisor changes. Cc'ing Jeremy.

 -- Keir

On 17/08/2010 02:33, "Mukesh Rathor" <mukesh.rathor@oracle.com> wrote:

> Hi guys,
> 
> Check out the attached patches. I changed the spin lock semantics so the
> lock contains the vcpu id of the vcpu holding it. This then tells xen
> to make that vcpu runnable if not already running:
> 
> Linux:
>    spin_lock()
>        if (try_lock() == failed)
>            loop X times
>            if (try_lock() == failed)
>                sched_op_yield_to(vcpu_num of holder)
>                start again;
>            endif
>        endif
> 
> Xen:
>      sched_op_yield_to:
>           if (vcpu_running(vcpu_num arg))
>               do nothing
>           else
>               vcpu_kick(vcpu_num arg)
>               do_yield()
>           endif
> 
> 
> In my worst case test scenario, I get about 20-36% improvement when the
> system is two to three times over provisioned.
> 
> Please provide any feedback. I would like to submit official patch for
> SCHEDOP_yield_to in xen.
> 
> thanks,
> Mukesh
> 


* Re: Linux spin lock enhancement on xen
  2010-08-17  1:33 Linux spin lock enhancement on xen Mukesh Rathor
  2010-08-17  7:33 ` Keir Fraser
@ 2010-08-17  7:53 ` Jan Beulich
  2010-08-18  1:58   ` Mukesh Rathor
  2010-08-17 14:34 ` Ky Srinivasan
  2010-08-17 17:43 ` Jeremy Fitzhardinge
  3 siblings, 1 reply; 23+ messages in thread
From: Jan Beulich @ 2010-08-17  7:53 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: Xen-devel

>>> On 17.08.10 at 03:33, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:

A mere vcpu_kick()+do_yield() seems pretty simplistic to me - if the
current vCPU still has higher priority than the one kicked you'll achieve
nothing. Instead, I think you really want to offer the current vCPU's
time slice to the target, making sure the target yields back as soon as
it released the lock (thus transferring the borrowed time slice back to
where it belongs).

And then, without using ticket locks, you likely increase unfairness
(as any other actively running vCPU going for the same lock will
have much better chances of acquiring it than the vCPU that
originally tried to and yielded), including the risk of starvation.

Still, I'm glad to see we're not the only ones wanting a directed
yield capability in Xen.

>+struct sched_yield_to {
>+    unsigned int version;
>+    unsigned int vcpu_id;
>+};

Why do you need a version field here, especially as it doesn't
appear to get read by the hypervisor?

Jan


* Re: Linux spin lock enhancement on xen
  2010-08-17  1:33 Linux spin lock enhancement on xen Mukesh Rathor
  2010-08-17  7:33 ` Keir Fraser
  2010-08-17  7:53 ` Jan Beulich
@ 2010-08-17 14:34 ` Ky Srinivasan
  2010-08-18  1:58   ` Mukesh Rathor
  2010-08-17 17:43 ` Jeremy Fitzhardinge
  3 siblings, 1 reply; 23+ messages in thread
From: Ky Srinivasan @ 2010-08-17 14:34 UTC (permalink / raw)
  To: Xen-devel, Mukesh Rathor



>>> On 8/16/2010 at  9:33 PM, in message
<20100816183357.08623c4c@mantra.us.oracle.com>, Mukesh Rathor
<mukesh.rathor@oracle.com> wrote: 
> Hi guys,
> 
> Check out the attached patches. I changed the spin lock semantics so the
> lock contains the vcpu id of the vcpu holding it. This then tells xen
> to make that vcpu runnable if not already running:
> 
> Linux:
>    spin_lock()
>        if (try_lock() == failed)
>            loop X times
>            if (try_lock() == failed)
>                sched_op_yield_to(vcpu_num of holder)
>                start again;
>            endif
>        endif
> 
> Xen:
>      sched_op_yield_to:
>           if (vcpu_running(vcpu_num arg))
>               do nothing
>           else
>               vcpu_kick(vcpu_num arg)
>               do_yield()
>           endif
> 
> 
> In my worst case test scenario, I get about 20-36% improvement when the
> system is two to three times over provisioned. 
> 
> Please provide any feedback. I would like to submit official patch for
> SCHEDOP_yield_to in xen.
While I agree that a directed yield is a useful construct, I am not sure
how this protocol would deal with ticket spin locks, as you would want
to implement some form of priority inheritance - if the vcpu you are
yielding to is currently blocked on another (ticket) spin lock, you
would want to yield to the owner of that other spin lock. Clearly, this
dependency information is only available in the guest, and that is where
we would need to implement this logic. I think Jan's "enlightened" spin
locks implemented this kind of logic.

Perhaps another way to deal with this generic problem of inopportune
guest preemption might be to coordinate guest preemption - allow the
guest to notify the hypervisor that it is in a critical section. If the
no-preempt guest state is set, the hypervisor can choose to defer the
preemption by giving the guest vcpu in question an additional time
quantum to run. In this case, the hypervisor would post the fact that a
preemption is pending on the guest, and the guest vcpu is expected to
relinquish control to the hypervisor as part of exiting the critical
section. Since guest preemption is not a "correctness" issue, the
hypervisor can choose not to honor the "no-preempt" state the guest may
post if it detects that the guest is buggy (or malicious).

Much of what we have been discussing with "enlightened" spin locks is
how to recover from the situation that results when we have an
inopportune guest preemption. The coordinated preemption protocol
described here attempts to avoid getting into such pathological
situations in the first place. If I recall correctly, I think there were
some patches for doing this form of preemption management.
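
For concreteness, the guest side of such a coordinated-preemption
protocol might look roughly like this; the shared fields and their names
are hypothetical, purely to make the idea concrete:

    /* Hypothetical per-vcpu flags, shared between the guest and the hypervisor. */
    struct preempt_ctrl {
        volatile unsigned int no_preempt;       /* guest: "I'm in a critical section" */
        volatile unsigned int preempt_pending;  /* Xen: "I deferred a preemption for you" */
    };

    static void critical_enter(struct preempt_ctrl *pc)
    {
        pc->no_preempt = 1;                     /* ask the hypervisor to defer preemption */
        barrier();
    }

    static void critical_exit(struct preempt_ctrl *pc)
    {
        barrier();
        pc->no_preempt = 0;
        if (pc->preempt_pending)                /* a preemption was deferred for us ... */
            HYPERVISOR_yield();                 /* ... so give the cpu back right away */
    }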

Regards,

K. Y
 


> 
> thanks,
> Mukesh


* Re: Linux spin lock enhancement on xen
  2010-08-17  1:33 Linux spin lock enhancement on xen Mukesh Rathor
                   ` (2 preceding siblings ...)
  2010-08-17 14:34 ` Ky Srinivasan
@ 2010-08-17 17:43 ` Jeremy Fitzhardinge
  2010-08-18  1:58   ` Mukesh Rathor
  3 siblings, 1 reply; 23+ messages in thread
From: Jeremy Fitzhardinge @ 2010-08-17 17:43 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: Xen-devel

 On 08/16/2010 06:33 PM, Mukesh Rathor wrote:
> In my worst case test scenario, I get about 20-36% improvement when the
> system is two to three times over provisioned. 
>
> Please provide any feedback. I would like to submit official patch for
> SCHEDOP_yield_to in xen.

This approach only works for old-style spinlocks.  Ticketlocks also have
the problem of making sure the next vcpu gets scheduled on unlock.

Have you looked at the pv spinlocks I have upstream in the pvops
kernels, which use the (existing) poll hypercall to block the waiting
vcpu until the lock is free?
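
Roughly, the mechanism is the sketch below. The array names are
illustrative stand-ins for the real per-cpu variables, not the actual
arch/x86/xen/spinlock.c code:

    static raw_spinlock_t *spinning[NR_CPUS];       /* what each vcpu waits on */
    static evtchn_port_t lock_kick_port[NR_CPUS];   /* per-vcpu "kick me" event channel */

    static void spin_lock_slow(raw_spinlock_t *lock)
    {
        int cpu = smp_processor_id();
        evtchn_port_t port = lock_kick_port[cpu];
        struct sched_poll poll;

        spinning[cpu] = lock;                       /* advertise what we block on */
        /* (the real code re-checks the lock here to close the wakeup race) */
        set_xen_guest_handle(poll.ports, &port);
        poll.nr_ports = 1;
        poll.timeout = 0;                           /* block until the port is kicked */
        HYPERVISOR_sched_op(SCHEDOP_poll, &poll);   /* existing hypercall, no Xen change */
        spinning[cpu] = NULL;
    }

    static void spin_unlock_slow(raw_spinlock_t *lock)
    {
        int cpu;

        /* Kick a vcpu recorded as waiting on this particular lock. */
        for_each_online_cpu(cpu)
            if (spinning[cpu] == lock) {
                notify_remote_via_evtchn(lock_kick_port[cpu]);
                break;                              /* wake just one waiter */
            }
    }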

    J


* Re: Linux spin lock enhancement on xen
  2010-08-17 17:43 ` Jeremy Fitzhardinge
@ 2010-08-18  1:58   ` Mukesh Rathor
  2010-08-18 16:37     ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 23+ messages in thread
From: Mukesh Rathor @ 2010-08-18  1:58 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: Keir, Xen-devel, Fraser

On Tue, 17 Aug 2010 10:43:04 -0700
Jeremy Fitzhardinge <jeremy@goop.org> wrote:

>  On 08/16/2010 06:33 PM, Mukesh Rathor wrote:
> > In my worst case test scenario, I get about 20-36% improvement when
> > the system is two to three times over provisioned. 
> >
> > Please provide any feedback. I would like to submit official patch
> > for SCHEDOP_yield_to in xen.
> 
> This approach only works for old-style spinlocks.  Ticketlocks also
> have the problem of making sure the next vcpu gets scheduled on
> unlock.

Well, unfortunately, looks like old-style spinlocks are gonna be around 
for a very long time. I've heard there are customers still on EL3!


> Have you looked at the pv spinlocks I have upstream in the pvops
> kernels, which use the (existing) poll hypercall to block the waiting
> vcpu until the lock is free?
>     J

>How does this compare with Jeremy's existing paravirtualised spinlocks
>in pv_ops? They required no hypervisor changes. Cc'ing Jeremy.
> -- Keir

Yeah, I looked at it today. What pv-ops is doing is forcing a yield
via a fake irq/event channel poll, after storing the lock pointer in
a per-cpu area. The unlocker then IPIs the waiting vcpus. The lock
holder may not be running, though, and there is no hint to the hypervisor
to run it. So you may have many waiters come and leave for no
reason.

To me this is more of an overhead than needed in a guest. In my
approach, the hypervisor is hinted exactly which vcpu is the 
lock holder. Often many VCPUs are pinned to a set of physical cpus
due to licensing and other reasons. So this really helps a vcpu
that is holding a spin lock, wanting to do some possibly real
time work, get scheduled and move on. Moreover, number of vcpus is
going up pretty fast.

Thanks,
Mukesh


* Re: Linux spin lock enhancement on xen
  2010-08-17  7:53 ` Jan Beulich
@ 2010-08-18  1:58   ` Mukesh Rathor
  0 siblings, 0 replies; 23+ messages in thread
From: Mukesh Rathor @ 2010-08-18  1:58 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Xen-devel

On Tue, 17 Aug 2010 08:53:32 +0100
"Jan Beulich" <JBeulich@novell.com> wrote:

> >>> On 17.08.10 at 03:33, Mukesh Rathor <mukesh.rathor@oracle.com>
> >>> wrote:
> 
> A mere vcpu_kick()+do_yield() seems pretty simplistic to me - if the
> current vCPU still has higher priority than the one kicked you'll
> achieve nothing. Instead, I think you really want to offer the
> current vCPU's time slice to the target, making sure the target
> yields back as soon as it released the lock (thus transferring the
> borrowed time slice back to where it belongs).

True, that is a phase II enhancement.

> And then, without using ticket locks, you likely increase unfairness
> (as any other actively running vCPU going for the same lock will
> have much better chances of acquiring it than the vCPU that
> originally tried to and yielded), including the risk of starvation.

Please see other thread on my thoughts on this.

> Still, I'm glad to see we're not the only ones wanting a directed
> yield capability in Xen.
> 
> >+struct sched_yield_to {
> >+    unsigned int version;
> >+    unsigned int vcpu_id;
> >+};
> 
> Why do you need a version field here, especially as it doesn't
> appear to get read by the hypervisor?

No reason, just forgot to remove it.

thanks,
Mukesh


* Re: Linux spin lock enhancement on xen
  2010-08-17 14:34 ` Ky Srinivasan
@ 2010-08-18  1:58   ` Mukesh Rathor
  0 siblings, 0 replies; 23+ messages in thread
From: Mukesh Rathor @ 2010-08-18  1:58 UTC (permalink / raw)
  To: Ky Srinivasan; +Cc: Jan, Xen-devel

On Tue, 17 Aug 2010 08:34:49 -0600
"Ky Srinivasan" <ksrinivasan@novell.com> wrote:
..
> While I agree that a directed yield is a useful construct, I am not
> sure how this protocol would deal with ticket spin locks as you would
> want to implement some form of priority inheritance - if the vcpu you
> are yielding to is currently blocked on another (ticket) spin lock,
> you would want to yield to the owner of that other spin lock.
> Clearly, this dependency information is only available in the guest
> and that is where we would need to implement this logic. I think
> Jan's "enlightened" spin locks implemented this kind of logic.
 
Frankly, I'm opposed to ticket spin locks. IMO, starvation and fairness
are scheduler problems, not spin lock problems. If a vcpu has higher
priority, it is for a reason, and I'd like it to get prioritized.
Imagine a cluster stack in a 128-vcpu environment: the thread doing
heartbeat definitely needs the priority it deserves.

Having said that, my proposal can be enhanced to take ticket spin
locks into consideration by having unlock make sure the next vcpu
in line gets a temporary priority boost.

thanks,
Mukesh


* Re: Linux spin lock enhancement on xen
  2010-08-18  1:58   ` Mukesh Rathor
@ 2010-08-18 16:37     ` Jeremy Fitzhardinge
  2010-08-18 17:09       ` Keir Fraser
  2010-08-19  2:52       ` Mukesh Rathor
  0 siblings, 2 replies; 23+ messages in thread
From: Jeremy Fitzhardinge @ 2010-08-18 16:37 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: Xen-devel, Keir Fraser

 On 08/17/2010 06:58 PM, Mukesh Rathor wrote:
>> How does this compare with Jeremy's existing paravirtualised spinlocks
>> in pv_ops? They required no hypervisor changes. Cc'ing Jeremy.
>> -- Keir
> Yeah, I looked at it today. What pv-ops is doing is forcing a yield
> via a fake irq/event channel poll, after storing the lock pointer in
> a per-cpu area. The unlocker then IPIs the waiting vcpus. The lock
> holder may not be running, though, and there is no hint to the hypervisor
> to run it. So you may have many waiters come and leave for no
> reason.

(They don't leave for no reason; they leave when they're told they can
take the lock next.)

I don't see why the guest should micromanage Xen's scheduler decisions. 
If a VCPU is waiting for another VCPU and can put itself to sleep in the
meantime, then its up to Xen to take advantage of that newly freed PCPU
to schedule something.  It may decide to run something in your domain
that's runnable, or it may decide to run something else.  There's no
reason why the spinlock holder is the best VCPU to run overall, or even
the best VCPU in your domain.

My view is you should just put any VCPU which has nothing to do to
sleep, and let Xen sort out the scheduling of the remainder.

> To me this is more of an overhead than needed in a guest. In my
> approach, the hypervisor is hinted exactly which vcpu is the 
> lock holder.

The slow path should be rare.  In general locks should be taken
uncontended, or with brief contention.  Locks should be held for a short
period of time, so risk of being preempted while holding the lock should
be low.  The effects of the preemption are pretty disastrous, so we need
to handle it, but the slow path will be rare, so the time spent handling
it is not a critical factor (and can be compensated for by tuning the
timeout before dropping into the slow path).

>  Often many VCPUs are pinned to a set of physical cpus
> due to licensing and other reasons. So this really helps a vcpu
> that is holding a spin lock, wanting to do some possibly real
> time work, get scheduled and move on.

I'm not sure I understand this point.  If you're pinning vcpus to pcpus,
then presumably you're not going to share a pcpu among many, or any
vcpus, so the lock holder will be able to run any time it wants.  And a
directed yield will only help if the lock waiter is sharing the same
pcpu as the lock holder, so it can hand over its timeslice (since making
the directed yield preempt something already running in order to run
your target vcpu seems rude and ripe for abuse).

>  Moreover, number of vcpus is
> going up pretty fast.

Presumably the number of pcpus are also going up, so the amount of
per-pcpu overcommit is about the same.

    J


* Re: Linux spin lock enhancement on xen
  2010-08-18 16:37     ` Jeremy Fitzhardinge
@ 2010-08-18 17:09       ` Keir Fraser
  2010-08-19  2:52         ` Mukesh Rathor
  2010-08-24  8:08         ` George Dunlap
  2010-08-19  2:52       ` Mukesh Rathor
  1 sibling, 2 replies; 23+ messages in thread
From: Keir Fraser @ 2010-08-18 17:09 UTC (permalink / raw)
  To: Jeremy Fitzhardinge, Mukesh Rathor; +Cc: Xen-devel

On 18/08/2010 17:37, "Jeremy Fitzhardinge" <jeremy@goop.org> wrote:

> I don't see why the guest should micromanage Xen's scheduler decisions.
> If a VCPU is waiting for another VCPU and can put itself to sleep in the
> meantime, then its up to Xen to take advantage of that newly freed PCPU
> to schedule something.  It may decide to run something in your domain
> that's runnable, or it may decide to run something else.  There's no
> reason why the spinlock holder is the best VCPU to run overall, or even
> the best VCPU in your domain.
> 
> My view is you should just put any VCPU which has nothing to do to
> sleep, and let Xen sort out the scheduling of the remainder.

Yeah, I'm no fan of yield or yield-to type operations. I'd reserve the right
to implement both of them as no-op.

 -- Keir


* Re: Linux spin lock enhancement on xen
  2010-08-18 17:09       ` Keir Fraser
@ 2010-08-19  2:52         ` Mukesh Rathor
  2010-08-24  8:08         ` George Dunlap
  1 sibling, 0 replies; 23+ messages in thread
From: Mukesh Rathor @ 2010-08-19  2:52 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Jeremy Fitzhardinge, Xen-devel

On Wed, 18 Aug 2010 18:09:22 +0100
Keir Fraser <keir.fraser@eu.citrix.com> wrote:

> On 18/08/2010 17:37, "Jeremy Fitzhardinge" <jeremy@goop.org> wrote:
> 
> > I don't see why the guest should micromanage Xen's scheduler
> > decisions. If a VCPU is waiting for another VCPU and can put itself
> > to sleep in the meantime, then its up to Xen to take advantage of
> > that newly freed PCPU to schedule something.  It may decide to run
> > something in your domain that's runnable, or it may decide to run
> > something else.  There's no reason why the spinlock holder is the
> > best VCPU to run overall, or even the best VCPU in your domain.
> > 
> > My view is you should just put any VCPU which has nothing to do to
> > sleep, and let Xen sort out the scheduling of the remainder.
> 
> Yeah, I'm no fan of yield or yield-to type operations. I'd reserve
> the right to implement both of them as no-op.
> 
>  -- Keir
> 

I think making them advisory makes sense. Ultimately xen decides. 

thanks,
Mukesh


* Re: Linux spin lock enhancement on xen
  2010-08-18 16:37     ` Jeremy Fitzhardinge
  2010-08-18 17:09       ` Keir Fraser
@ 2010-08-19  2:52       ` Mukesh Rathor
  2010-08-23 21:33         ` Jeremy Fitzhardinge
  1 sibling, 1 reply; 23+ messages in thread
From: Mukesh Rathor @ 2010-08-19  2:52 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: Keir, Xen-devel, Fraser

On Wed, 18 Aug 2010 09:37:17 -0700
Jeremy Fitzhardinge <jeremy@goop.org> wrote:

> (They don't leave for no reason; they leave when they're told they can
> take the lock next.)
> 
> I don't see why the guest should micromanage Xen's scheduler
> decisions. If a VCPU is waiting for another VCPU and can put itself
> to sleep in the meantime, then its up to Xen to take advantage of
> that newly freed PCPU to schedule something.  It may decide to run
> something in your domain that's runnable, or it may decide to run
> something else.  There's no reason why the spinlock holder is the
> best VCPU to run overall, or even the best VCPU in your domain.
> 
> My view is you should just put any VCPU which has nothing to do to
> sleep, and let Xen sort out the scheduling of the remainder.

Agree for the most part. But if we can spare the cost of a vcpu coming
on a cpu, realizing it has nothing to do and putting itself to sleep, by a
simple solution, we've just saved cycles. Often we are looking for tiny
gains in the benchmarks against competition. 

Yes, we don't want to micromanage xen's scheduler. But if a guest knows
something that the scheduler does not, and has no way of knowing it,
then it would be nice to be able to exploit that. I didn't think a vcpu
telling xen that it's not making forward progress was intrusive.

Another approach, perhaps better, is a hypercall that allows a vcpu's
priority to be temporarily boosted.  What do you guys think about that?
This would be akin to a system call allowing a process to boost its
priority, or to some kernels where a thread holding a lock gets a
temporary bump in priority because a waiter tells the kernel to.
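
Purely to illustrate what that interface could look like (nothing like
this exists today; the sub-op number and the fields are made up):

    /* Hypothetical interface, for discussion only. */
    #define SCHEDOP_temp_boost    6             /* made-up sub-op number */
    struct sched_temp_boost {
        unsigned int vcpu_id;                   /* vcpu whose priority to bump */
        unsigned int max_usecs;                 /* cap on how long the boost lasts */
    };

    static inline int HYPERVISOR_temp_boost(unsigned int vcpu, unsigned int usecs)
    {
        struct sched_temp_boost b = { .vcpu_id = vcpu, .max_usecs = usecs };
        return HYPERVISOR_sched_op(SCHEDOP_temp_boost, &b);
    }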


> I'm not sure I understand this point.  If you're pinning vcpus to
> pcpus, then presumably you're not going to share a pcpu among many,
> or any vcpus, so the lock holder will be able to run any time it
> wants.  And a directed yield will only help if the lock waiter is
> sharing the same pcpu as the lock holder, so it can hand over its
> timeslice (since making the directed yield preempt something already
> running in order to run your target vcpu seems rude and ripe for
> abuse).

No: if a customer licenses 4 cpus and runs a guest with 12 vcpus,
you now have 12 vcpus confined to the 4 physical cpus.

> Presumably the number of pcpus are also going up, so the amount of
> per-pcpu overcommit is about the same.

Unless vcpus are going up faster than pcpus :)


Thanks,
Mukesh


* Re: Linux spin lock enhancement on xen
  2010-08-19  2:52       ` Mukesh Rathor
@ 2010-08-23 21:33         ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 23+ messages in thread
From: Jeremy Fitzhardinge @ 2010-08-23 21:33 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: Xen-devel, Keir Fraser

 On 08/18/2010 07:52 PM, Mukesh Rathor wrote:
>> My view is you should just put any VCPU which has nothing to do to
>> sleep, and let Xen sort out the scheduling of the remainder.
> Agree for the most part. But if we can spare the cost of a vcpu coming
> on a cpu, realizing it has nothing to do and putting itself to sleep, by a
> simple solution, we've just saved cycles. Often we are looking for tiny
> gains in the benchmarks against competition. 

Well, how does your proposal compare to mine?  Is it more efficient?

> Yes, we don't want to micromanage xen's scheduler. But if a guest knows
> something that the scheduler does not, and has no way of knowing it,
> then it would be nice to be able to exploit that. I didn't think a vcpu
> telling xen that it's not making forward progress was intrusive.

Well, blocking on an event channel is a good hint.  And what's more, it
allows the guest even more control because it can choose which vcpu to
wake up when.

> Another approach, perhaps better, is a hypercall that allows a vcpu's
> priority to be temporarily boosted.  What do you guys think about that?
> This would be akin to a system call allowing a process to boost its
> priority, or to some kernels where a thread holding a lock gets a
> temporary bump in priority because a waiter tells the kernel to.

That kind of thing has many pitfalls - not least, how do you make sure
it doesn't get abused?  A "proper" mechanism to deal with this is to expose
some kind of complete vcpu blocking dependency graph to Xen to inform
its scheduling decisions, but that's probably overkill...

>> I'm not sure I understand this point.  If you're pinning vcpus to
>> pcpus, then presumably you're not going to share a pcpu among many,
>> or any vcpus, so the lock holder will be able to run any time it
>> wants.  And a directed yield will only help if the lock waiter is
>> sharing the same pcpu as the lock holder, so it can hand over its
>> timeslice (since making the directed yield preempt something already
>> running in order to run your target vcpu seems rude and ripe for
>> abuse).
> No: if a customer licenses 4 cpus and runs a guest with 12 vcpus,
> you now have 12 vcpus confined to the 4 physical cpus.

In one domain?  Why would they do that?

    J


* Re: Linux spin lock enhancement on xen
  2010-08-18 17:09       ` Keir Fraser
  2010-08-19  2:52         ` Mukesh Rathor
@ 2010-08-24  8:08         ` George Dunlap
  2010-08-24  8:20           ` Keir Fraser
                             ` (2 more replies)
  1 sibling, 3 replies; 23+ messages in thread
From: George Dunlap @ 2010-08-24  8:08 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Jeremy Fitzhardinge, Xen-devel

Wow, I totally missed this thread.

A couple of thoughts;

Complicated solutions for the scheduler are a really bad idea.  It's
hard enough to predict and debug the side-effects of simple
mechanisms; a complex mechanism is doomed to failure at the outset.

I agree with Jeremy, that the guest shouldn't tell Xen to run a
specific VCPU.  At most it should be something along the lines of, "If
you're going to run any vcpu from this domain, please run vcpu X."

Jeremy, do you think that changes to the HV are necessary, or do you
think that the existing solution is sufficient?  It seems to me like
hinting to the HV to do a directed yield makes more sense than making
the same thing happen via blocking and event channels.  OTOH, that
gives the guest a lot more control over when and how things happen.

Mukesh, did you see the patch by Xiantao Zhang a few days ago,
regarding what to do on an HVM pause instruction?  I thought the
solution he had was interesting: when yielding due to a spinlock,
rather than going to the back of the queue, just go behind one person.
 I think an implementation of "yield_to" that might make sense in the
credit scheduler is (rough sketch below):
* Put the yielding vcpu behind one cpu
* If the yield-to vcpu is not running, pull it to the front within its
priority.  (I.e., if it's UNDER, put it at the front so it runs next;
if it's OVER, make it the first OVER cpu.)
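
In toy form (nothing here is actual sched_credit.c code; the runqueue is
just an array of pointers, and all the names are made up), those two
rules would be:

    enum toy_pri { UNDER, OVER };

    struct toy_vcpu { int id; enum toy_pri pri; int running; };

    struct toy_runq {
        struct toy_vcpu *q[16];     /* runnable vcpus, front of queue first */
        int len;
    };

    static void rq_remove(struct toy_runq *rq, struct toy_vcpu *v)
    {
        int i, j;

        for (i = 0; i < rq->len; i++)
            if (rq->q[i] == v) {
                for (j = i; j < rq->len - 1; j++)
                    rq->q[j] = rq->q[j + 1];
                rq->len--;
                return;
            }
    }

    static void rq_insert_at(struct toy_runq *rq, int pos, struct toy_vcpu *v)
    {
        int j;

        for (j = rq->len; j > pos; j--)     /* assumes there is room */
            rq->q[j] = rq->q[j - 1];
        rq->q[pos] = v;
        rq->len++;
    }

    static void toy_yield_to(struct toy_runq *rq, struct toy_vcpu *yielder,
                             struct toy_vcpu *target)
    {
        int i;

        /* Rule 1: the yielder goes behind at least one other runnable
         * vcpu, even if that vcpu has lower priority. */
        rq_remove(rq, yielder);
        rq_insert_at(rq, rq->len ? 1 : 0, yielder);

        /* Rule 2: if the target isn't running, pull it to the front of
         * its own priority class: very front if UNDER, first OVER slot
         * otherwise. */
        if (!target->running) {
            rq_remove(rq, target);
            if (target->pri == UNDER) {
                rq_insert_at(rq, 0, target);
            } else {
                for (i = 0; i < rq->len && rq->q[i]->pri == UNDER; i++)
                    ;
                rq_insert_at(rq, i, target);
            }
        }
    }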

Thoughts?

 -George


On Wed, Aug 18, 2010 at 6:09 PM, Keir Fraser <keir.fraser@eu.citrix.com> wrote:
> On 18/08/2010 17:37, "Jeremy Fitzhardinge" <jeremy@goop.org> wrote:
>
>> I don't see why the guest should micromanage Xen's scheduler decisions.
>> If a VCPU is waiting for another VCPU and can put itself to sleep in the
>> meantime, then its up to Xen to take advantage of that newly freed PCPU
>> to schedule something.  It may decide to run something in your domain
>> that's runnable, or it may decide to run something else.  There's no
>> reason why the spinlock holder is the best VCPU to run overall, or even
>> the best VCPU in your domain.
>>
>> My view is you should just put any VCPU which has nothing to do to
>> sleep, and let Xen sort out the scheduling of the remainder.
>
> Yeah, I'm no fan of yield or yield-to type operations. I'd reserve the right
> to implement both of them as no-op.
>
>  -- Keir


* Re: Linux spin lock enhancement on xen
  2010-08-24  8:08         ` George Dunlap
@ 2010-08-24  8:20           ` Keir Fraser
  2010-08-24  8:43             ` George Dunlap
  2010-08-24  8:48             ` Jan Beulich
  2010-08-25  1:03           ` Dong, Eddie
  2010-08-26  2:13           ` Mukesh Rathor
  2 siblings, 2 replies; 23+ messages in thread
From: Keir Fraser @ 2010-08-24  8:20 UTC (permalink / raw)
  To: George Dunlap; +Cc: Jeremy Fitzhardinge, Xen-devel

On 24/08/2010 09:08, "George Dunlap" <dunlapg@umich.edu> wrote:

> Jeremy, do you think that changes to the HV are necessary, or do you
> think that the existing solution is sufficient?  It seems to me like
> hinting to the HV to do a directed yield makes more sense than making
> the same thing happen via blocking and event channels.  OTOH, that
> gives the guest a lot more control over when and how things happen.
> 
> Mukesh, did you see the patch by Xiantao Zhang a few days ago,
> regarding what to do on an HVM pause instruction?

I think there's a difference between providing some kind of yield_to as a
private interface within the hypervisor as some kind of heuristic for
emulating something like PAUSE, versus providing such an operation as a
public guest interface.

It seems to me that Jeremy's spinlock implementation provides all the info a
scheduler would require: vcpus trying to acquire a lock are blocked, the
lock holder wakes just the next vcpu in turn when it releases the lock. The
scheduler at that point may have a decision to make as to whether to run the
lock releaser, or the new lock holder, or both, but how can the guest help
with that when its a system-wide scheduling decision? Obviously the guest
would presumably like all its runnable vcpus to run all of the time!

 - Keir

>  I thought the
> solution he had was interesting: when yielding due to a spinlock,
> rather than going to the back of the queue, just go behind one person.
>  I think an implementation of "yield_to" that might make sense in the
> credit scheduler is:
> * Put the yielding vcpu behind one cpu
> * If the yield-to vcpu is not running, pull it to the front within its
> priority.  (I.e., if it's UNDER, put it at the front so it runs next;
> if it's OVER, make it the first OVER cpu.)


* Re: Linux spin lock enhancement on xen
  2010-08-24  8:20           ` Keir Fraser
@ 2010-08-24  8:43             ` George Dunlap
  2010-08-24  8:48             ` Jan Beulich
  1 sibling, 0 replies; 23+ messages in thread
From: George Dunlap @ 2010-08-24  8:43 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Jeremy Fitzhardinge, Xen-devel

On Tue, Aug 24, 2010 at 9:20 AM, Keir Fraser <keir.fraser@eu.citrix.com> wrote:
> I think there's a difference between providing some kind of yield_to as a
> private interface within the hypervisor as some kind of heuristic for
> emulating something like PAUSE, versus providing such an operation as a
> public guest interface.

I agree that any "yield_to" should be strictly a hint, and not a
guarantee by the HV.  If that's the case, I don't actually see a
difference between a malicous guest knowing that "yield_to" happens to
behave a certain way, and a malicious guest knowing that "PAUSE"
behaves a certain way.

> It seems to me that Jeremy's spinlock implementation provides all the info a
> scheduler would require: vcpus trying to acquire a lock are blocked, the
> lock holder wakes just the next vcpu in turn when it releases the lock. The
> scheduler at that point may have a decision to make as to whether to run the
> lock releaser, or the new lock holder, or both, but how can the guest help
> with that when its a system-wide scheduling decision? Obviously the guest
> would presumably like all its runnable vcpus to run all of the time!

I think that makes sense, but leaves out one important factor: that
the credit scheduler, as it is, is essentially round-robin within a
priority; and round-robin schedulers are known to discriminate against
vcpus that yield in favor of those that burn up their whole timeslice.
 I think it makes sense to give yielding guests a bit of an advantage
to compensate for that.

That said, this whole thing needs measurement: any yield_to
implementation would need to show that:
* The performance is significantly better than either Jeremy's
patches, or simple yield (with, perhaps, boost-peers, as Xiantao
suggests)
* It does not give a spin-locking workload a cpu advantage over other
workloads, such as specjbb (cpu-bound) or scp (very
latency-sensitive).

 -George


* Re: Linux spin lock enhancement on xen
  2010-08-24  8:20           ` Keir Fraser
  2010-08-24  8:43             ` George Dunlap
@ 2010-08-24  8:48             ` Jan Beulich
  2010-08-24  9:09               ` George Dunlap
  1 sibling, 1 reply; 23+ messages in thread
From: Jan Beulich @ 2010-08-24  8:48 UTC (permalink / raw)
  To: Keir Fraser, George Dunlap; +Cc: Jeremy Fitzhardinge, Xen-devel

>>> On 24.08.10 at 10:20, Keir Fraser <keir.fraser@eu.citrix.com> wrote:
> On 24/08/2010 09:08, "George Dunlap" <dunlapg@umich.edu> wrote:
> It seems to me that Jeremy's spinlock implementation provides all the info a
> scheduler would require: vcpus trying to acquire a lock are blocked, the
> lock holder wakes just the next vcpu in turn when it releases the lock. The
> scheduler at that point may have a decision to make as to whether to run the
> lock releaser, or the new lock holder, or both, but how can the guest help
> with that when its a system-wide scheduling decision? Obviously the guest
> would presumably like all its runnable vcpus to run all of the time!

Blocking on an unavailable lock is somewhat different imo: If the blocked
vCPU didn't exhaust its time slice, I think it is very valid for it to
expect to not penalize the whole VM, and rather donate (part of) its
remaining time slice to the lock holder. That keeps other domains
unaffected, while allowing the subject domain to make better use of
its resources.

>>  I thought the
>> solution he had was interesting: when yielding due to a spinlock,
>> rather than going to the back of the queue, just go behind one person.
>>  I think an implementation of "yield_to" that might make sense in the
>> credit scheduler is:
>> * Put the yielding vcpu behind one cpu

Which clearly has the potential of burning more cycles without
allowing the vCPU to actually make progress.

>> * If the yield-to vcpu is not running, pull it to the front within its
>> priority.  (I.e., if it's UNDER, put it at the front so it runs next;
>> if it's OVER, make it the first OVER cpu.)

At the risk of fairness wrt other domains, or even within the
domain. As said above, I think it would be better to temporarily
merge the priorities and location in the run queue of the yielding
and yielded-to vCPU-s, to have the yielded-to one get the
better of both (with a way to revert to the original settings
under the control of the guest, or enforced when the borrowed
time quantum expires).

The one more difficult case I would see in this model is what
needs to happen when the yielding vCPU has event delivery
enabled and receives an event, making it runnable again: In
this situation, the swapping of priority and/or run queue
placement might need to be forcibly reversed immediately,
not so much for fairness reasons than for keeping event
servicing latency reasonable. This includes the fact that in
such a case the vCPU wouldn't be able to do what it wants
with the waited for lock acquired, but would rather run the
event handling code first anyway, and hence the need for
boosting the lock holder went away.

Jan


* Re: Linux spin lock enhancement on xen
  2010-08-24  8:48             ` Jan Beulich
@ 2010-08-24  9:09               ` George Dunlap
  2010-08-24 13:25                 ` Jan Beulich
  0 siblings, 1 reply; 23+ messages in thread
From: George Dunlap @ 2010-08-24  9:09 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Jeremy Fitzhardinge, Xen-devel, Keir Fraser

On Tue, Aug 24, 2010 at 9:48 AM, Jan Beulich <JBeulich@novell.com> wrote:
>>>  I thought the
>>> solution he had was interesting: when yielding due to a spinlock,
>>> rather than going to the back of the queue, just go behind one person.
>>>  I think an implementation of "yield_to" that might make sense in the
>>> credit scheduler is:
>>> * Put the yielding vcpu behind one cpu
>
> Which clearly has the potential of burning more cycles without
> allowing the vCPU to actually make progress.

I think you may misunderstand; the yielding vcpu goes behind at least
one vcpu on the runqueue, even if the next vcpu is lower priority.  If
there's another vcpu on the runqueue, the other vcpu always runs.

I posted some scheduler patches implementing this yield a week or two
ago, and included some numbers.  The numbers were with Windows Server
2008, which has queued spinlocks (equivalent of ticketed spinlocks).
The throughput remained high even when highly over-committed.  So a
simple yield does have a significant effect.  In the unlikely even
that it is scheduled again, it will simply yield again when it sees
that it's still waiting for the spinlock.

In fact, undirected-yield is one of yield-to's competitors: I don't
think we should accept a "yield-to" patch unless it has significant
performance gains over undirected-yield.

> At the risk of fairness wrt other domains, or even within the
> domain. As said above, I think it would be better to temporarily
> merge the priorities and location in the run queue of the yielding
> and yielded-to vCPU-s, to have the yielded-to one get the
> better of both (with a way to revert to the original settings
> under the control of the guest, or enforced when the borrowed
> time quantum expires).

I think doing tricks with priorities is too complicated.  Complicated
mechanisms are very difficult to predict and prone to nasty,
hard-to-debug corner cases.  I don't think it's worth exploring this
kind of solution until it's clear that a simple solution cannot get
reasonable performance.  And I would oppose accepting any
priority-inheritance solution into the tree unless there were
repeatable measurements that showed that it had significant
performance gain over a simpler solution.

 -George


* Re: Linux spin lock enhancement on xen
  2010-08-24  9:09               ` George Dunlap
@ 2010-08-24 13:25                 ` Jan Beulich
  2010-08-24 16:11                   ` George Dunlap
  0 siblings, 1 reply; 23+ messages in thread
From: Jan Beulich @ 2010-08-24 13:25 UTC (permalink / raw)
  To: George Dunlap; +Cc: Jeremy Fitzhardinge, Xen-devel, Keir Fraser

>>> On 24.08.10 at 11:09, George Dunlap <dunlapg@umich.edu> wrote:
> On Tue, Aug 24, 2010 at 9:48 AM, Jan Beulich <JBeulich@novell.com> wrote:
>>>>  I thought the
>>>> solution he had was interesting: when yielding due to a spinlock,
>>>> rather than going to the back of the queue, just go behind one person.
>>>>  I think an implementation of "yield_to" that might make sense in the
>>>> credit scheduler is:
>>>> * Put the yielding vcpu behind one cpu
>>
>> Which clearly has the potential of burning more cycles without
>> allowing the vCPU to actually make progress.
> 
> I think you may misunderstand; the yielding vcpu goes behind at least
> one vcpu on the runqueue, even if the next vcpu is lower priority.  If
> there's another vcpu on the runqueue, the other vcpu always runs.

No, I understood it that way. What I was referring to is (as an
example) the case where two vCPU-s on the same pCPU's run queue
both yield: They will each move after the other in the run queue in
close succession, but neither will really make progress, and neither
will really increase the likelihood of the respective lock holder to
get a chance to run.

> I posted some scheduler patches implementing this yield a week or two
> ago, and included some numbers.  The numbers were with Windows Server
> 2008, which has queued spinlocks (equivalent of ticketed spinlocks).
> The throughput remained high even when highly over-committed.  So a
> simple yield does have a significant effect.  In the unlikely event
> that it is scheduled again, it will simply yield again when it sees
> that it's still waiting for the spinlock.

Immediately, or after a few (hundred) spin cycles?

> In fact, undirected-yield is one of yield-to's competitors: I don't
> think we should accept a "yield-to" patch unless it has significant
> performance gains over undirected-yield.

This position I agree with.

>> At the risk of fairness wrt other domains, or even within the
>> domain. As said above, I think it would be better to temporarily
>> merge the priorities and location in the run queue of the yielding
>> and yielded-to vCPU-s, to have the yielded-to one get the
>> better of both (with a way to revert to the original settings
>> under the control of the guest, or enforced when the borrowed
>> time quantum expires).
> 
> I think doing tricks with priorities is too complicated.  Complicated
> mechanisms are very difficult to predict and prone to nasty,
> hard-to-debug corner cases.  I don't think it's worth exploring this
> kind of solution until it's clear that a simple solution cannot get
> reasonable performance.  And I would oppose accepting any
> priority-inheritance solution into the tree unless there were
> repeatable measurements that showed that it had significant
> performance gain over a simpler solution.

And so I do with this. Apart from suspecting fairness issues with
your yield_to proposal (as I wrote), my point just is - we won't
know if a "complicated" solution outperforms a "simple" one if we
don't try it.

Jan


* Re: Linux spin lock enhancement on xen
  2010-08-24 13:25                 ` Jan Beulich
@ 2010-08-24 16:11                   ` George Dunlap
  2010-08-26 14:08                     ` Tim Deegan
  0 siblings, 1 reply; 23+ messages in thread
From: George Dunlap @ 2010-08-24 16:11 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Jeremy Fitzhardinge, Xen-devel, Keir Fraser

On Tue, Aug 24, 2010 at 2:25 PM, Jan Beulich <JBeulich@novell.com> wrote:
> No, I understood it that way. What I was referring to is (as an
> example) the case where two vCPU-s on the sam pCPU's run queue
> both yield: They will each move after the other in the run queue in
> close succession, but neither will really make progress, and neither
> will really increase the likelihood of the respective lock holder to
> get a chance to run.

Ah, I see.  In order for this to be a waste, it needs to be the case that:
* Two vcpus from different domains grab a spinlock and are then preempted
* Two vcpus from different domains then fail to grab the spinlock
* The two vcpus holding the locks are kept from getting cpu by
{another vcpu, other vcpus} which uses a long time-slice
* The two waiting for the lock share a cpu with each other and no one else

Of course in this situation, it would be nice if Xen could migrate one
of the other vcpus to the cpu of the two yielding vcpus.  That
shouldn't be too hard to implement, at least to see if it has a
measurable impact on aggregate throughput.

> Immediately, or after a few (hundred) spin cycles?

It depends on the implementation.  The Citrix guest tools do binary
patching of spinlock routines for w2k3 and XP; I believe they spin for
1000 cycles or so.  The viridian enlightenments I believe would yield
immediately.  I think the pause instruction causes a yield immediately
as well.

Yielding immediately when the host is not overloaded is actually
probably not optimal: if the vcpu holding the lock is currently
running, it's likely that by the time the vcpu makes it to the
scheduler, the lock it's waiting for has already been released.
(Which is part of the reason it's a spinlock and not a semaphore.)

> And so I do with this. Apart from suspecting fairness issues with
> your yield_to proposal (as I wrote), my point just is - we won't
> know if a "complicated" solution outperforms a "simple" one if we
> don't try it.

Are you volunteering? :-)

 -George


* RE: Linux spin lock enhancement on xen
  2010-08-24  8:08         ` George Dunlap
  2010-08-24  8:20           ` Keir Fraser
@ 2010-08-25  1:03           ` Dong, Eddie
  2010-08-26  2:13           ` Mukesh Rathor
  2 siblings, 0 replies; 23+ messages in thread
From: Dong, Eddie @ 2010-08-25  1:03 UTC (permalink / raw)
  To: George Dunlap, Keir Fraser; +Cc: Jeremy Fitzhardinge, Xen-devel, Dong, Eddie

George Dunlap wrote:
> Wow, I totally missed this thread.
> 
> A couple of thoughts;
> 
> Complicated solutions for the scheduler are a really bad idea.  It's
> hard enough to predict and debug the side-effects of simple
> mechanisms; a complex mechanism is doomed to failure at the outset.
> 
> I agree with Jeremy, that the guest shouldn't tell Xen to run a
> specific VCPU.  At most it should be something along the lines of, "If
> you're going to run any vcpu from this domain, please run vcpu X."
> 
> Jeremy, do you think that changes to the HV are necessary, or do you
> think that the existing solution is sufficient?  It seems to me like
> hinting to the HV to do a directed yield makes more sense than making
> the same thing happen via blocking and event channels.  OTOH, that
> gives the guest a lot more control over when and how things happen.
> 
> Mukesh, did you see the patch by Xiantao Zhang a few days ago,
> regarding what to do on an HVM pause instruction?  I thought the
> solution he had was interesting: when yielding due to a spinlock,
> rather than going to the back of the queue, just go behind one person.
>  I think an implementation of "yield_to" that might make sense in the
> credit scheduler is:
> * Put the yielding vcpu behind one cpu
> * If the yield-to vcpu is not running, pull it to the front within its
> priority.  (I.e., if it's UNDER, put it at the front so it runs next;
> if it's OVER, make it the first OVER cpu.)
> 
> Thoughts?
> 

What Xiantao (and I, internally) proposed is to implement temporary
coscheduling to address spin-lock issues, for FIFO spin locks as well as
ordinary spin locks, utilizing the PLE exit (it can of course work with
PV spin locks too). Here is our thinking (please refer to Xiantao's mail
as well):

There are two typical solutions to improve spin-lock efficiency in
virtualization: A) lock holder preemption avoidance (or co-scheduling),
and B) helping locks, which donate the spinning CPU cycles for overall
system utilization.

#A solves the spin-lock issue best; however, it requires either hardware
assistance to detect the lock holder, which is impractical, or
coscheduling, which is hard to implement efficiently and sacrifices a
lot of scheduler flexibility. Neither Xen nor KVM has implemented that.

#B (the current Xen policy with PLE yielding) may help system
performance, but it may not help the performance of the spinning guest.
In some cases the guest may even get worse due to the long wait (yield)
on the spin lock. In other cases it may get back additional CPU cycles
(and performance) from the VMM scheduler, complementing its earlier CPU
cycle donation. In general, #B may help system performance when the
system is indeed overcommitted, but it can also hurt single-guest
"speed".

An additional issue with #B is that it may hurt FIFO spin locks (ticket
spin locks in Linux and queued spin locks in Windows since Windows
2000), where by OS design only the first-in waiting VCPU is able to take
the lock. Current PLE cannot tell which VCPU is the next (first-in)
waiter and which one is the lock holder.

[Proposed optimization]
Lock holder preemption avoidance is the right way to fully utilize the
hardware PLE capability; the current solution simply hurts performance,
and we need to improve it along the lines of solution #A.

Given that current hardware is unable to tell which VCPU is the lock
holder or which one is the next (first-in) waiter, coscheduling may be
the choice. However, coscheduling has many side effects as well
(reportedly another company using co-scheduling is going to give it up).
This proposal is to do temporary coscheduling on top of the existing VMM
scheduling. The details are:

When one or more VCPUs of a guest are waiting for a spin lock, we
temporarily increase the priority of all VCPUs of the same guest so that
they get scheduled in for a short period. The period is kept small to
limit the impact of the "coscheduling" on the overall VMM scheduler. The
current Xen patch simply "boosts" those VCPUs, which already shows a
great gain, but the parameters of this algorithm can likely be tuned
further.

I believe this will be a perfect solution to the spin-lock issue with
PLE for now (when the VCPU count is not dramatically large).
vConsolidate (a mix of Linux and Windows guests) shows a 19%
consolidation performance gain, which is almost too good to believe, but
it is true :)  We are investigating more workloads and will post a new
patch soon.

Of course, if a PV guest is running in a PVM container, the PV'ed spin
lock is still needed. But I doubt its necessity if PVM is running on top
of an HVM container :)


Thx, Eddie


* Re: Linux spin lock enhancement on xen
  2010-08-24  8:08         ` George Dunlap
  2010-08-24  8:20           ` Keir Fraser
  2010-08-25  1:03           ` Dong, Eddie
@ 2010-08-26  2:13           ` Mukesh Rathor
  2 siblings, 0 replies; 23+ messages in thread
From: Mukesh Rathor @ 2010-08-26  2:13 UTC (permalink / raw)
  To: George Dunlap; +Cc: Jeremy Fitzhardinge, Xen-devel, Keir Fraser


>  I think an implementation of "yield_to" that might make sense in the
> credit scheduler is:
> * Put the yielding vcpu behind one cpu
> * If the yield-to vcpu is not running, pull it to the front within its
> priority.  (I.e., if it's UNDER, put it at the front so it runs next;
> if it's OVER, make it the first OVER cpu.)
> 

Yup, I second it.

thanks,
Mukesh


* Re: Linux spin lock enhancement on xen
  2010-08-24 16:11                   ` George Dunlap
@ 2010-08-26 14:08                     ` Tim Deegan
  0 siblings, 0 replies; 23+ messages in thread
From: Tim Deegan @ 2010-08-26 14:08 UTC (permalink / raw)
  To: George Dunlap; +Cc: Jeremy Fitzhardinge, Xen-devel, Keir Fraser, Jan Beulich

At 17:11 +0100 on 24 Aug (1282669892), George Dunlap wrote:
> It depends on the implementation.  The Citrix guest tools do binary
> patching of spinlock routines for w2k3 and XP; I believe they spin for
> 1000 cycles or so.  The viridian enlightenments I believe would yield
> immediately.

IIRC the Viridian interface includes a parameter that the hypervisor
passes to the guest to tell it how long to spin for before yielding. 

>  I think the pause instruction causes a yield immediately
> as well.

This is where the PLE hardware assist comes in - it effectively does the
same as the Viridian interfaces by counting PAUSEs.

FWIW (and I am definitely not a scheduler expert) I'm against anything
that gives a priority boost to a domain's VCPUs based on perceived
locking behaviour, and in favour of keeping things dead simple.
Targeted scheduler "improvements" have bitten us more than once.  When
George's scheduler regression tests can give us a more rounded picture
of the overall effect of scheduler tweaks (esp. on fairness) maybe that
will change.

Cheers,

Tim.

> Yielding immediately when the host is not overloaded is actually
> probably not optimal: if the vcpu holding the lock is currently
> running, it's likely that by the time the vcpu makes it to the
> scheduler, the lock it's waiting for has already been released.
> (Which is part of the reason it's a spinlock and not a semaphore.)
> 
> > And so I do with this. Apart from suspecting fairness issues with
> > your yield_to proposal (as I wrote), my point just is - we won't
> > know if a "complicated" solution outperforms a "simple" one if we
> > don't try it.
> 
> Are you volunteering? :-)
> 
>  -George
> 

-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)
