All of lore.kernel.org
 help / color / mirror / Atom feed
* [Patch] continue_hypercall_on_cpu rework using tasklets
@ 2010-04-13 13:19 Juergen Gross
  2010-04-13 13:40 ` Jan Beulich
  2010-04-13 14:41 ` Keir Fraser
  0 siblings, 2 replies; 44+ messages in thread
From: Juergen Gross @ 2010-04-13 13:19 UTC (permalink / raw)
  To: xen-devel

[-- Attachment #1: Type: text/plain, Size: 1187 bytes --]

Hi,

attached is a patch to use tasklets for continue_hypercall_on_cpu instead of
temporarily pinning the vcpu to the target physical cpu.

This is thought as base for cpupools as Keir requested to get rid of the
"borrow cpu" stuff in my original solution.

This is a rework of the patch I sent before based on some comments by Jan
Beulich.

Keir, if you agree to take this patch, I'll send out the cpupool stuff based
on this patch again. There is just one question pending:

cpupools are using continue_hypercall_on_cpu, but this function is defined for
x86 only, not for ia64. I see 2 possible solutions:

1. make continue_hypercall_on_cpu available on ia64, too
2. make cpupools a pure x86 feature

I think the first solution would be better, but I would need help on this as I
don't have any test machine.


Juergen

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

[-- Attachment #2: tasklet.patch --]
[-- Type: text/x-patch, Size: 15037 bytes --]

# HG changeset patch
# User juergen.gross@ts.fujitsu.com
# Date 1271164572 -7200
# Node ID 580130fe415ce646861d1a544f9b93fc1e81835f
# Parent  aae7cb2f18411492c720d3b60adaa979858a63df

Signed-off-by: juergen.gross@ts.fujitsu.com

continue_hypercall_on_cpu rework using tasklets

diff -r aae7cb2f1841 -r 580130fe415c xen/arch/x86/acpi/power.c
--- a/xen/arch/x86/acpi/power.c	Fri Apr 09 08:54:25 2010 +0100
+++ b/xen/arch/x86/acpi/power.c	Tue Apr 13 15:16:12 2010 +0200
@@ -234,7 +234,7 @@
     return error;
 }
 
-static long enter_state_helper(void *data)
+static long enter_state_helper(void *hdl, void *data)
 {
     struct acpi_sleep_info *sinfo = (struct acpi_sleep_info *)data;
     return enter_state(sinfo->sleep_state);
@@ -265,7 +265,7 @@
     acpi_sinfo.pm1b_cnt_val = sleep->pm1b_cnt_val;
     acpi_sinfo.sleep_state = sleep->sleep_state;
 
-    return continue_hypercall_on_cpu(0, enter_state_helper, &acpi_sinfo);
+    return continue_hypercall_on_cpu(0, NULL, enter_state_helper, &acpi_sinfo);
 }
 
 static int acpi_get_wake_status(void)
diff -r aae7cb2f1841 -r 580130fe415c xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c	Fri Apr 09 08:54:25 2010 +0100
+++ b/xen/arch/x86/domain.c	Tue Apr 13 15:16:12 2010 +0200
@@ -1518,42 +1518,52 @@
 }
 
 struct migrate_info {
-    long (*func)(void *data);
+    struct tasklet tasklet;
+    long (*func)(void *hdl, void *data);
     void *data;
     void (*saved_schedule_tail)(struct vcpu *);
-    cpumask_t saved_affinity;
-    unsigned int nest;
+    volatile int nest;
+    long ret;
+    struct vcpu *v;
 };
 
 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
 {
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct migrate_info *info = v->arch.continue_info;
-    cpumask_t mask = info->saved_affinity;
     void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
 
-    regs->eax = info->func(info->data);
+    regs->eax = info->ret;
 
-    if ( info->nest-- == 0 )
-    {
-        xfree(info);
-        v->arch.schedule_tail = saved_schedule_tail;
-        v->arch.continue_info = NULL;
-        vcpu_unlock_affinity(v, &mask);
-    }
+    tasklet_kill(&info->tasklet);
+    xfree(info);
+    v->arch.schedule_tail = saved_schedule_tail;
+    v->arch.continue_info = NULL;
 
     (*saved_schedule_tail)(v);
 }
 
-int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
+static void continue_hypercall_on_cpu_tasklet(struct migrate_info *info)
+{
+    info->ret = info->func((void *)info, info->data);
+
+    if ( info->nest-- == 0 )
+        vcpu_unpause(info->v);
+
+    return;
+}
+
+int continue_hypercall_on_cpu(int cpu, void *hdl,
+                              long (*func)(void *hdl, void *data), void *data)
 {
     struct vcpu *v = current;
-    struct migrate_info *info;
-    cpumask_t mask = cpumask_of_cpu(cpu);
-    int rc;
+    struct migrate_info *info = (struct migrate_info *)hdl;
 
     if ( cpu == smp_processor_id() )
-        return func(data);
+        return func(info, data);
+
+    if ( info != NULL )
+        v = info->v;
 
     info = v->arch.continue_info;
     if ( info == NULL )
@@ -1562,34 +1572,30 @@
         if ( info == NULL )
             return -ENOMEM;
 
-        rc = vcpu_lock_affinity(v, &mask);
-        if ( rc )
-        {
-            xfree(info);
-            return rc;
-        }
-
         info->saved_schedule_tail = v->arch.schedule_tail;
-        info->saved_affinity = mask;
         info->nest = 0;
+        info->v = v;
+        tasklet_init(&info->tasklet,
+                     (void(*)(unsigned long))continue_hypercall_on_cpu_tasklet,
+                     (unsigned long)info);
 
         v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
         v->arch.continue_info = info;
+        vcpu_pause_nosync(v);
     }
     else
     {
         BUG_ON(info->nest != 0);
-        rc = vcpu_locked_change_affinity(v, &mask);
-        if ( rc )
-            return rc;
         info->nest++;
     }
 
     info->func = func;
     info->data = data;
 
+    tasklet_schedule_cpu(&info->tasklet, cpu);
+    raise_softirq(SCHEDULE_SOFTIRQ);
+
     /* Dummy return value will be overwritten by new schedule_tail. */
-    BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
     return 0;
 }
 
diff -r aae7cb2f1841 -r 580130fe415c xen/arch/x86/microcode.c
--- a/xen/arch/x86/microcode.c	Fri Apr 09 08:54:25 2010 +0100
+++ b/xen/arch/x86/microcode.c	Tue Apr 13 15:16:12 2010 +0200
@@ -116,7 +116,7 @@
     return err;
 }
 
-static long do_microcode_update(void *_info)
+static long do_microcode_update(void *hdl, void *_info)
 {
     struct microcode_info *info = _info;
     int error;
@@ -129,7 +129,8 @@
 
     info->cpu = next_cpu(info->cpu, cpu_online_map);
     if ( info->cpu < NR_CPUS )
-        return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
+        return continue_hypercall_on_cpu(info->cpu, hdl,
+                                         do_microcode_update, info);
 
     error = info->error;
     xfree(info);
@@ -162,5 +163,6 @@
     info->error = 0;
     info->cpu = first_cpu(cpu_online_map);
 
-    return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
+    return continue_hypercall_on_cpu(info->cpu, NULL,
+                                     do_microcode_update, info);
 }
diff -r aae7cb2f1841 -r 580130fe415c xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c	Fri Apr 09 08:54:25 2010 +0100
+++ b/xen/arch/x86/platform_hypercall.c	Tue Apr 13 15:16:12 2010 +0200
@@ -48,12 +48,12 @@
 extern int set_px_pminfo(uint32_t cpu, struct xen_processor_performance *perf);
 extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
 
-static long cpu_frequency_change_helper(void *data)
+static long cpu_frequency_change_helper(void *hdl, void *data)
 {
     return cpu_frequency_change(this_cpu(freq));
 }
 
-static long cpu_down_helper(void *data)
+static long cpu_down_helper(void *hdl, void *data)
 {
     int cpu = (unsigned long)data;
     return cpu_down(cpu);
@@ -314,7 +314,7 @@
         if ( op->u.change_freq.flags || !cpu_online(op->u.change_freq.cpu) )
             break;
         per_cpu(freq, op->u.change_freq.cpu) = op->u.change_freq.freq;
-        ret = continue_hypercall_on_cpu(op->u.change_freq.cpu,
+        ret = continue_hypercall_on_cpu(op->u.change_freq.cpu, NULL,
                                         cpu_frequency_change_helper,
                                         NULL);
         break;
@@ -467,7 +467,7 @@
             break;
         }
         ret = continue_hypercall_on_cpu(
-          0, cpu_down_helper, (void *)(unsigned long)cpu);
+          0, NULL, cpu_down_helper, (void *)(unsigned long)cpu);
         break;
     }
     break;
diff -r aae7cb2f1841 -r 580130fe415c xen/arch/x86/sysctl.c
--- a/xen/arch/x86/sysctl.c	Fri Apr 09 08:54:25 2010 +0100
+++ b/xen/arch/x86/sysctl.c	Tue Apr 13 15:16:12 2010 +0200
@@ -29,7 +29,7 @@
 
 #define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
 
-static long cpu_down_helper(void *data)
+static long cpu_down_helper(void *hdl, void *data)
 {
     int cpu = (unsigned long)data;
     return cpu_down(cpu);
@@ -241,7 +241,7 @@
             break;
         case XEN_SYSCTL_CPU_HOTPLUG_OFFLINE:
             ret = continue_hypercall_on_cpu(
-                0, cpu_down_helper, (void *)(unsigned long)cpu);
+                0, NULL, cpu_down_helper, (void *)(unsigned long)cpu);
             break;
         case XEN_SYSCTL_CPU_HOTPLUG_STATUS:
             ret = 0;
diff -r aae7cb2f1841 -r 580130fe415c xen/common/schedule.c
--- a/xen/common/schedule.c	Fri Apr 09 08:54:25 2010 +0100
+++ b/xen/common/schedule.c	Tue Apr 13 15:16:12 2010 +0200
@@ -408,26 +408,18 @@
     }
 }
 
-static int __vcpu_set_affinity(
-    struct vcpu *v, cpumask_t *affinity,
-    bool_t old_lock_status, bool_t new_lock_status)
+int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
 {
     cpumask_t online_affinity, old_affinity;
+
+    if ( v->domain->is_pinned )
+        return -EINVAL;
 
     cpus_and(online_affinity, *affinity, cpu_online_map);
     if ( cpus_empty(online_affinity) )
         return -EINVAL;
 
     vcpu_schedule_lock_irq(v);
-
-    if ( v->affinity_locked != old_lock_status )
-    {
-        BUG_ON(!v->affinity_locked);
-        vcpu_schedule_unlock_irq(v);
-        return -EBUSY;
-    }
-
-    v->affinity_locked = new_lock_status;
 
     old_affinity = v->cpu_affinity;
     v->cpu_affinity = *affinity;
@@ -444,36 +436,6 @@
     }
 
     return 0;
-}
-
-int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    if ( v->domain->is_pinned )
-        return -EINVAL;
-    return __vcpu_set_affinity(v, affinity, 0, 0);
-}
-
-int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    return __vcpu_set_affinity(v, affinity, 0, 1);
-}
-
-int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    return __vcpu_set_affinity(v, affinity, 1, 1);
-}
-
-void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    cpumask_t online_affinity;
-
-    /* Do not fail if no CPU in old affinity mask is online. */
-    cpus_and(online_affinity, *affinity, cpu_online_map);
-    if ( cpus_empty(online_affinity) )
-        *affinity = cpu_online_map;
-
-    if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 )
-        BUG();
 }
 
 /* Block the currently-executing domain until a pertinent event occurs. */
diff -r aae7cb2f1841 -r 580130fe415c xen/common/softirq.c
--- a/xen/common/softirq.c	Fri Apr 09 08:54:25 2010 +0100
+++ b/xen/common/softirq.c	Tue Apr 13 15:16:12 2010 +0200
@@ -88,9 +88,11 @@
 }
 
 static LIST_HEAD(tasklet_list);
+static DEFINE_PER_CPU(struct list_head, tasklet_list_pcpu);
 static DEFINE_SPINLOCK(tasklet_lock);
 
-void tasklet_schedule(struct tasklet *t)
+static void tasklet_schedule_list(struct tasklet *t, struct list_head *tlist,
+    int cpu)
 {
     unsigned long flags;
 
@@ -101,28 +103,47 @@
         if ( !t->is_scheduled && !t->is_running )
         {
             BUG_ON(!list_empty(&t->list));
-            list_add_tail(&t->list, &tasklet_list);
+            list_add_tail(&t->list, tlist);
+            t->scheduled_on = NR_CPUS;
         }
         t->is_scheduled = 1;
-        raise_softirq(TASKLET_SOFTIRQ);
+        if ( cpu == smp_processor_id() )
+            raise_softirq(TASKLET_SOFTIRQ);
+        else if ( !t->is_running )
+            cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
+        else
+            t->scheduled_on = cpu;
     }
 
     spin_unlock_irqrestore(&tasklet_lock, flags);
 }
 
+void tasklet_schedule(struct tasklet *t)
+{
+    tasklet_schedule_list(t, &tasklet_list, smp_processor_id());
+}
+
+void tasklet_schedule_cpu(struct tasklet *t, int cpu)
+{
+    tasklet_schedule_list(t, &per_cpu(tasklet_list_pcpu, cpu), cpu);
+}
+
 static void tasklet_action(void)
 {
+    struct list_head *tlist;
     struct tasklet *t;
 
     spin_lock_irq(&tasklet_lock);
 
-    if ( list_empty(&tasklet_list) )
+    tlist = ( list_empty(&this_cpu(tasklet_list_pcpu)) ) ? &tasklet_list :
+        &this_cpu(tasklet_list_pcpu);
+    if ( list_empty(tlist) )
     {
         spin_unlock_irq(&tasklet_lock);
         return;
     }
 
-    t = list_entry(tasklet_list.next, struct tasklet, list);
+    t = list_entry(tlist->next, struct tasklet, list);
     list_del_init(&t->list);
 
     BUG_ON(t->is_dead || t->is_running || !t->is_scheduled);
@@ -138,14 +159,23 @@
     if ( t->is_scheduled )
     {
         BUG_ON(t->is_dead || !list_empty(&t->list));
-        list_add_tail(&t->list, &tasklet_list);
+        if ( t->scheduled_on >= NR_CPUS )
+            list_add_tail(&t->list, tlist);
+        else
+        {
+            unsigned int cpu = t->scheduled_on;
+
+            list_add_tail(&t->list, &per_cpu(tasklet_list_pcpu, cpu));
+            cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
+        }
     }
 
     /*
      * If there is more work to do then reschedule. We don't grab more work
      * immediately as we want to allow other softirq work to happen first.
      */
-    if ( !list_empty(&tasklet_list) )
+    if ( !list_empty(&tasklet_list) ||
+        !list_empty(&this_cpu(tasklet_list_pcpu)) )
         raise_softirq(TASKLET_SOFTIRQ);
 
     spin_unlock_irq(&tasklet_lock);
@@ -186,6 +216,12 @@
 
 void __init softirq_init(void)
 {
+    int i;
+
+    for_each_possible_cpu ( i )
+    {
+        INIT_LIST_HEAD(&per_cpu(tasklet_list_pcpu, i));
+    }
     open_softirq(TASKLET_SOFTIRQ, tasklet_action);
 }
 
diff -r aae7cb2f1841 -r 580130fe415c xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h	Fri Apr 09 08:54:25 2010 +0100
+++ b/xen/include/asm-x86/domain.h	Tue Apr 13 15:16:12 2010 +0200
@@ -452,7 +452,8 @@
 #define hvm_svm         hvm_vcpu.u.svm
 
 /* Continue the current hypercall via func(data) on specified cpu. */
-int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data);
+int continue_hypercall_on_cpu(int cpu, void *hdl,
+                              long (*func)(void *hdl, void *data), void *data);
 
 void vcpu_show_execution_state(struct vcpu *);
 void vcpu_show_registers(const struct vcpu *);
diff -r aae7cb2f1841 -r 580130fe415c xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Fri Apr 09 08:54:25 2010 +0100
+++ b/xen/include/xen/sched.h	Tue Apr 13 15:16:12 2010 +0200
@@ -132,8 +132,6 @@
     bool_t           defer_shutdown;
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
     bool_t           paused_for_shutdown;
-    /* VCPU affinity is temporarily locked from controller changes? */
-    bool_t           affinity_locked;
 
     /*
      * > 0: a single port is being polled;
@@ -581,9 +579,6 @@
 void vcpu_force_reschedule(struct vcpu *v);
 void cpu_disable_scheduler(void);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
-int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
-int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity);
-void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
 uint64_t get_cpu_idle_time(unsigned int cpu);
diff -r aae7cb2f1841 -r 580130fe415c xen/include/xen/softirq.h
--- a/xen/include/xen/softirq.h	Fri Apr 09 08:54:25 2010 +0100
+++ b/xen/include/xen/softirq.h	Tue Apr 13 15:16:12 2010 +0200
@@ -50,14 +50,17 @@
     bool_t is_scheduled;
     bool_t is_running;
     bool_t is_dead;
+    unsigned int scheduled_on;
     void (*func)(unsigned long);
     unsigned long data;
 };
 
 #define DECLARE_TASKLET(name, func, data) \
-    struct tasklet name = { LIST_HEAD_INIT(name.list), 0, 0, 0, func, data }
+    struct tasklet name = { LIST_HEAD_INIT(name.list), 0, 0, 0, NR_CPUS, \
+                            func, data }
 
 void tasklet_schedule(struct tasklet *t);
+void tasklet_schedule_cpu(struct tasklet *t, int cpu);
 void tasklet_kill(struct tasklet *t);
 void tasklet_init(
     struct tasklet *t, void (*func)(unsigned long), unsigned long data);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-13 13:19 [Patch] continue_hypercall_on_cpu rework using tasklets Juergen Gross
@ 2010-04-13 13:40 ` Jan Beulich
  2010-04-13 13:49   ` Juergen Gross
  2010-04-13 14:41 ` Keir Fraser
  1 sibling, 1 reply; 44+ messages in thread
From: Jan Beulich @ 2010-04-13 13:40 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel

>>> Juergen Gross <juergen.gross@ts.fujitsu.com> 13.04.10 15:19 >>>
>cpupools are using continue_hypercall_on_cpu, but this function is defined for
>x86 only, not for ia64. I see 2 possible solutions:
>
>1. make continue_hypercall_on_cpu available on ia64, too
>2. make cpupools a pure x86 feature

Is there anything meaningful in the new code that's really x86-specific
(i.e. can't the whole code chunk be moved to xen/common/)?

Jan

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-13 13:40 ` Jan Beulich
@ 2010-04-13 13:49   ` Juergen Gross
  2010-04-13 15:08     ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Juergen Gross @ 2010-04-13 13:49 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

Jan Beulich wrote:
>>>> Juergen Gross <juergen.gross@ts.fujitsu.com> 13.04.10 15:19 >>>
>> cpupools are using continue_hypercall_on_cpu, but this function is defined for
>> x86 only, not for ia64. I see 2 possible solutions:
>>
>> 1. make continue_hypercall_on_cpu available on ia64, too
>> 2. make cpupools a pure x86 feature
> 
> Is there anything meaningful in the new code that's really x86-specific
> (i.e. can't the whole code chunk be moved to xen/common/)?

My main concern is the difference in the handling of schedule_tail. I just
can't tell how to change the ia64 variant to make it work correctly.

The other x86-specific part is the setting of the return register, but this
is rather easy doable by using an architecture specific macro.


Juergen

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-13 13:19 [Patch] continue_hypercall_on_cpu rework using tasklets Juergen Gross
  2010-04-13 13:40 ` Jan Beulich
@ 2010-04-13 14:41 ` Keir Fraser
  2010-04-14  4:26   ` Juergen Gross
  1 sibling, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2010-04-13 14:41 UTC (permalink / raw)
  To: Juergen Gross, xen-devel

On 13/04/2010 14:19, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:

> attached is a patch to use tasklets for continue_hypercall_on_cpu instead of
> temporarily pinning the vcpu to the target physical cpu.
> 
> This is thought as base for cpupools as Keir requested to get rid of the
> "borrow cpu" stuff in my original solution.

Why do you change the interface of continue_hypercall_on_cpu()? What's a
'hdl' anyway?

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-13 13:49   ` Juergen Gross
@ 2010-04-13 15:08     ` Keir Fraser
  2010-04-14  6:54       ` Juergen Gross
  0 siblings, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2010-04-13 15:08 UTC (permalink / raw)
  To: Juergen Gross, Jan Beulich; +Cc: xen-devel

On 13/04/2010 14:49, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:

>>> 1. make continue_hypercall_on_cpu available on ia64, too
>>> 2. make cpupools a pure x86 feature
>> 
>> Is there anything meaningful in the new code that's really x86-specific
>> (i.e. can't the whole code chunk be moved to xen/common/)?
> 
> My main concern is the difference in the handling of schedule_tail. I just
> can't tell how to change the ia64 variant to make it work correctly.

We were messing with schedule_tail in order to get control into the
hypercall continuation in the context of the same vcpu but on a different
pcpu. Now that you just execute the continuation in a straightforward
softirq/tasklet context, you don't need to mess with schedule_tail at all...
Right?

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-13 14:41 ` Keir Fraser
@ 2010-04-14  4:26   ` Juergen Gross
  2010-04-14  6:46     ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Juergen Gross @ 2010-04-14  4:26 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

Keir Fraser wrote:
> On 13/04/2010 14:19, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:
> 
>> attached is a patch to use tasklets for continue_hypercall_on_cpu instead of
>> temporarily pinning the vcpu to the target physical cpu.
>>
>> This is thought as base for cpupools as Keir requested to get rid of the
>> "borrow cpu" stuff in my original solution.
> 
> Why do you change the interface of continue_hypercall_on_cpu()? What's a
> 'hdl' anyway?

I need a way to find the migrate_info structure in case of nested calls of
continue_hypercall_on_cpu(). Originally this was done by storing it in the
vcpu structure, but this can't be done any more using tasklets. In my first
attempt I saved it in the per-cpu area, but this approach isn't working if
continue_hypercall_on_cpu() is called concurrently. So the cleanest way is
to pass it via a parameter.
'hdl' is just a 'handle'.


Juergen

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-14  4:26   ` Juergen Gross
@ 2010-04-14  6:46     ` Keir Fraser
  2010-04-14  6:58       ` Juergen Gross
  0 siblings, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2010-04-14  6:46 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel

On 14/04/2010 05:26, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:

>> Why do you change the interface of continue_hypercall_on_cpu()? What's a
>> 'hdl' anyway?
> 
> I need a way to find the migrate_info structure in case of nested calls of
> continue_hypercall_on_cpu(). Originally this was done by storing it in the
> vcpu structure, but this can't be done any more using tasklets. In my first
> attempt I saved it in the per-cpu area, but this approach isn't working if
> continue_hypercall_on_cpu() is called concurrently. So the cleanest way is
> to pass it via a parameter.

The per-cpu area method should work fine, since Xen is non-preemptive? I
don't think the concurrency you are concerned about can happen.

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-13 15:08     ` Keir Fraser
@ 2010-04-14  6:54       ` Juergen Gross
  0 siblings, 0 replies; 44+ messages in thread
From: Juergen Gross @ 2010-04-14  6:54 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel, Jan Beulich

[-- Attachment #1: Type: text/plain, Size: 1226 bytes --]

Keir Fraser wrote:
> On 13/04/2010 14:49, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:
> 
>>>> 1. make continue_hypercall_on_cpu available on ia64, too
>>>> 2. make cpupools a pure x86 feature
>>> Is there anything meaningful in the new code that's really x86-specific
>>> (i.e. can't the whole code chunk be moved to xen/common/)?
>> My main concern is the difference in the handling of schedule_tail. I just
>> can't tell how to change the ia64 variant to make it work correctly.
> 
> We were messing with schedule_tail in order to get control into the
> hypercall continuation in the context of the same vcpu but on a different
> pcpu. Now that you just execute the continuation in a straightforward
> softirq/tasklet context, you don't need to mess with schedule_tail at all...
> Right?

Aah, you mean something like the attached patch?


Juergen

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

[-- Attachment #2: tasklet.patch --]
[-- Type: text/x-patch, Size: 18066 bytes --]

Signed-off-by: juergen.gross@ts.fujitsu.com

# HG changeset patch
# User juergen.gross@ts.fujitsu.com
# Date 1271227921 -7200
# Node ID bbb8e0f142df0f8a26df90964c1b27414b9839ea
# Parent  c02cc832cb2d88c383d33c1ba50c381fae703308
rework of continue_hypercall_on_cpu to use tasklets, make architecture independent

diff -r c02cc832cb2d -r bbb8e0f142df xen/arch/x86/acpi/power.c
--- a/xen/arch/x86/acpi/power.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/arch/x86/acpi/power.c	Wed Apr 14 08:52:01 2010 +0200
@@ -234,7 +234,7 @@
     return error;
 }
 
-static long enter_state_helper(void *data)
+static long enter_state_helper(void *hdl, void *data)
 {
     struct acpi_sleep_info *sinfo = (struct acpi_sleep_info *)data;
     return enter_state(sinfo->sleep_state);
@@ -265,7 +265,7 @@
     acpi_sinfo.pm1b_cnt_val = sleep->pm1b_cnt_val;
     acpi_sinfo.sleep_state = sleep->sleep_state;
 
-    return continue_hypercall_on_cpu(0, enter_state_helper, &acpi_sinfo);
+    return continue_hypercall_on_cpu(0, NULL, enter_state_helper, &acpi_sinfo);
 }
 
 static int acpi_get_wake_status(void)
diff -r c02cc832cb2d -r bbb8e0f142df xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/arch/x86/domain.c	Wed Apr 14 08:52:01 2010 +0200
@@ -1517,82 +1517,6 @@
     flush_tlb_mask(&v->vcpu_dirty_cpumask);
 }
 
-struct migrate_info {
-    long (*func)(void *data);
-    void *data;
-    void (*saved_schedule_tail)(struct vcpu *);
-    cpumask_t saved_affinity;
-    unsigned int nest;
-};
-
-static void continue_hypercall_on_cpu_helper(struct vcpu *v)
-{
-    struct cpu_user_regs *regs = guest_cpu_user_regs();
-    struct migrate_info *info = v->arch.continue_info;
-    cpumask_t mask = info->saved_affinity;
-    void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
-
-    regs->eax = info->func(info->data);
-
-    if ( info->nest-- == 0 )
-    {
-        xfree(info);
-        v->arch.schedule_tail = saved_schedule_tail;
-        v->arch.continue_info = NULL;
-        vcpu_unlock_affinity(v, &mask);
-    }
-
-    (*saved_schedule_tail)(v);
-}
-
-int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
-{
-    struct vcpu *v = current;
-    struct migrate_info *info;
-    cpumask_t mask = cpumask_of_cpu(cpu);
-    int rc;
-
-    if ( cpu == smp_processor_id() )
-        return func(data);
-
-    info = v->arch.continue_info;
-    if ( info == NULL )
-    {
-        info = xmalloc(struct migrate_info);
-        if ( info == NULL )
-            return -ENOMEM;
-
-        rc = vcpu_lock_affinity(v, &mask);
-        if ( rc )
-        {
-            xfree(info);
-            return rc;
-        }
-
-        info->saved_schedule_tail = v->arch.schedule_tail;
-        info->saved_affinity = mask;
-        info->nest = 0;
-
-        v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
-        v->arch.continue_info = info;
-    }
-    else
-    {
-        BUG_ON(info->nest != 0);
-        rc = vcpu_locked_change_affinity(v, &mask);
-        if ( rc )
-            return rc;
-        info->nest++;
-    }
-
-    info->func = func;
-    info->data = data;
-
-    /* Dummy return value will be overwritten by new schedule_tail. */
-    BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
-    return 0;
-}
-
 #define next_arg(fmt, args) ({                                              \
     unsigned long __arg;                                                    \
     switch ( *(fmt)++ )                                                     \
diff -r c02cc832cb2d -r bbb8e0f142df xen/arch/x86/microcode.c
--- a/xen/arch/x86/microcode.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/arch/x86/microcode.c	Wed Apr 14 08:52:01 2010 +0200
@@ -28,6 +28,7 @@
 #include <xen/sched.h>
 #include <xen/smp.h>
 #include <xen/spinlock.h>
+#include <xen/domain.h>
 #include <xen/guest_access.h>
 
 #include <asm/current.h>
@@ -116,7 +117,7 @@
     return err;
 }
 
-static long do_microcode_update(void *_info)
+static long do_microcode_update(void *hdl, void *_info)
 {
     struct microcode_info *info = _info;
     int error;
@@ -129,7 +130,8 @@
 
     info->cpu = next_cpu(info->cpu, cpu_online_map);
     if ( info->cpu < NR_CPUS )
-        return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
+        return continue_hypercall_on_cpu(info->cpu, hdl,
+                                         do_microcode_update, info);
 
     error = info->error;
     xfree(info);
@@ -162,5 +164,6 @@
     info->error = 0;
     info->cpu = first_cpu(cpu_online_map);
 
-    return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
+    return continue_hypercall_on_cpu(info->cpu, NULL,
+                                     do_microcode_update, info);
 }
diff -r c02cc832cb2d -r bbb8e0f142df xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/arch/x86/platform_hypercall.c	Wed Apr 14 08:52:01 2010 +0200
@@ -48,12 +48,12 @@
 extern int set_px_pminfo(uint32_t cpu, struct xen_processor_performance *perf);
 extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
 
-static long cpu_frequency_change_helper(void *data)
+static long cpu_frequency_change_helper(void *hdl, void *data)
 {
     return cpu_frequency_change(this_cpu(freq));
 }
 
-static long cpu_down_helper(void *data)
+static long cpu_down_helper(void *hdl, void *data)
 {
     int cpu = (unsigned long)data;
     return cpu_down(cpu);
@@ -314,7 +314,7 @@
         if ( op->u.change_freq.flags || !cpu_online(op->u.change_freq.cpu) )
             break;
         per_cpu(freq, op->u.change_freq.cpu) = op->u.change_freq.freq;
-        ret = continue_hypercall_on_cpu(op->u.change_freq.cpu,
+        ret = continue_hypercall_on_cpu(op->u.change_freq.cpu, NULL,
                                         cpu_frequency_change_helper,
                                         NULL);
         break;
@@ -467,7 +467,7 @@
             break;
         }
         ret = continue_hypercall_on_cpu(
-          0, cpu_down_helper, (void *)(unsigned long)cpu);
+          0, NULL, cpu_down_helper, (void *)(unsigned long)cpu);
         break;
     }
     break;
diff -r c02cc832cb2d -r bbb8e0f142df xen/arch/x86/sysctl.c
--- a/xen/arch/x86/sysctl.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/arch/x86/sysctl.c	Wed Apr 14 08:52:01 2010 +0200
@@ -14,6 +14,7 @@
 #include <public/sysctl.h>
 #include <xen/sched.h>
 #include <xen/event.h>
+#include <xen/domain.h>
 #include <xen/domain_page.h>
 #include <asm/msr.h>
 #include <xen/trace.h>
@@ -29,7 +30,7 @@
 
 #define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
 
-static long cpu_down_helper(void *data)
+static long cpu_down_helper(void *hdl, void *data)
 {
     int cpu = (unsigned long)data;
     return cpu_down(cpu);
@@ -186,7 +187,7 @@
             break;
         case XEN_SYSCTL_CPU_HOTPLUG_OFFLINE:
             ret = continue_hypercall_on_cpu(
-                0, cpu_down_helper, (void *)(unsigned long)cpu);
+                0, NULL, cpu_down_helper, (void *)(unsigned long)cpu);
             break;
         case XEN_SYSCTL_CPU_HOTPLUG_STATUS:
             ret = 0;
diff -r c02cc832cb2d -r bbb8e0f142df xen/common/domain.c
--- a/xen/common/domain.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/common/domain.c	Wed Apr 14 08:52:01 2010 +0200
@@ -898,6 +898,76 @@
     return -ENOSYS;
 }
 
+struct migrate_info {
+    long (*func)(void *hdl, void *data);
+    void *data;
+    volatile int nest;
+    struct cpu_user_regs volatile *regs;
+    struct vcpu *v;
+};
+
+static void continue_hypercall_on_cpu_tasklet(struct migrate_info *info)
+{
+    struct vcpu *v = info->v;
+
+    while ( get_return_reg(info->regs) )
+        cpu_relax();
+
+    /* vcpu is paused now, return value can be set */
+    set_return_reg(info->regs, info->func((void *)info, info->data));
+
+    if ( info->nest-- == 0 )
+    {
+        xfree(info);
+        vcpu_unpause(v);
+    }
+
+    return;
+}
+
+int continue_hypercall_on_cpu(int cpu, void *hdl,
+                              long (*func)(void *hdl, void *data), void *data)
+{
+    struct vcpu *v = current;
+    struct migrate_info *info = (struct migrate_info *)hdl;
+
+    if ( cpu == smp_processor_id() )
+        return func(info, data);
+
+    if ( info == NULL )
+    {
+        info = xmalloc(struct migrate_info);
+        if ( info == NULL )
+            return -ENOMEM;
+
+        info->nest = 0;
+        info->v = v;
+        info->regs = guest_cpu_user_regs();
+        tasklet_init(&v->cont_tasklet,
+                     (void(*)(unsigned long))continue_hypercall_on_cpu_tasklet,
+                     (unsigned long)info);
+
+        vcpu_pause_nosync(v);
+    }
+    else
+    {
+        v = info->v;
+        BUG_ON(info->nest != 0);
+        info->nest++;
+    }
+
+    info->func = func;
+    info->data = data;
+
+    set_return_reg(info->regs, -EBUSY);
+
+    tasklet_schedule_cpu(&v->cont_tasklet, cpu);
+    raise_softirq(SCHEDULE_SOFTIRQ);
+
+    /* Dummy return value will be overwritten by tasklet. */
+    return 0;
+}
+
 /*
  * Local variables:
  * mode: C
diff -r c02cc832cb2d -r bbb8e0f142df xen/common/schedule.c
--- a/xen/common/schedule.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/common/schedule.c	Wed Apr 14 08:52:01 2010 +0200
@@ -408,26 +408,18 @@
     }
 }
 
-static int __vcpu_set_affinity(
-    struct vcpu *v, cpumask_t *affinity,
-    bool_t old_lock_status, bool_t new_lock_status)
+int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
 {
     cpumask_t online_affinity, old_affinity;
+
+    if ( v->domain->is_pinned )
+        return -EINVAL;
 
     cpus_and(online_affinity, *affinity, cpu_online_map);
     if ( cpus_empty(online_affinity) )
         return -EINVAL;
 
     vcpu_schedule_lock_irq(v);
-
-    if ( v->affinity_locked != old_lock_status )
-    {
-        BUG_ON(!v->affinity_locked);
-        vcpu_schedule_unlock_irq(v);
-        return -EBUSY;
-    }
-
-    v->affinity_locked = new_lock_status;
 
     old_affinity = v->cpu_affinity;
     v->cpu_affinity = *affinity;
@@ -444,36 +436,6 @@
     }
 
     return 0;
-}
-
-int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    if ( v->domain->is_pinned )
-        return -EINVAL;
-    return __vcpu_set_affinity(v, affinity, 0, 0);
-}
-
-int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    return __vcpu_set_affinity(v, affinity, 0, 1);
-}
-
-int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    return __vcpu_set_affinity(v, affinity, 1, 1);
-}
-
-void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    cpumask_t online_affinity;
-
-    /* Do not fail if no CPU in old affinity mask is online. */
-    cpus_and(online_affinity, *affinity, cpu_online_map);
-    if ( cpus_empty(online_affinity) )
-        *affinity = cpu_online_map;
-
-    if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 )
-        BUG();
 }
 
 /* Block the currently-executing domain until a pertinent event occurs. */
diff -r c02cc832cb2d -r bbb8e0f142df xen/common/softirq.c
--- a/xen/common/softirq.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/common/softirq.c	Wed Apr 14 08:52:01 2010 +0200
@@ -88,9 +88,11 @@
 }
 
 static LIST_HEAD(tasklet_list);
+static DEFINE_PER_CPU(struct list_head, tasklet_list_pcpu);
 static DEFINE_SPINLOCK(tasklet_lock);
 
-void tasklet_schedule(struct tasklet *t)
+static void tasklet_schedule_list(struct tasklet *t, struct list_head *tlist,
+    int cpu)
 {
     unsigned long flags;
 
@@ -101,28 +103,47 @@
         if ( !t->is_scheduled && !t->is_running )
         {
             BUG_ON(!list_empty(&t->list));
-            list_add_tail(&t->list, &tasklet_list);
+            list_add_tail(&t->list, tlist);
+            t->scheduled_on = NR_CPUS;
         }
         t->is_scheduled = 1;
-        raise_softirq(TASKLET_SOFTIRQ);
+        if ( cpu == smp_processor_id() )
+            raise_softirq(TASKLET_SOFTIRQ);
+        else if ( !t->is_running )
+            cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
+        else
+            t->scheduled_on = cpu;
     }
 
     spin_unlock_irqrestore(&tasklet_lock, flags);
 }
 
+void tasklet_schedule(struct tasklet *t)
+{
+    tasklet_schedule_list(t, &tasklet_list, smp_processor_id());
+}
+
+void tasklet_schedule_cpu(struct tasklet *t, int cpu)
+{
+    tasklet_schedule_list(t, &per_cpu(tasklet_list_pcpu, cpu), cpu);
+}
+
 static void tasklet_action(void)
 {
+    struct list_head *tlist;
     struct tasklet *t;
 
     spin_lock_irq(&tasklet_lock);
 
-    if ( list_empty(&tasklet_list) )
+    tlist = ( list_empty(&this_cpu(tasklet_list_pcpu)) ) ? &tasklet_list :
+        &this_cpu(tasklet_list_pcpu);
+    if ( list_empty(tlist) )
     {
         spin_unlock_irq(&tasklet_lock);
         return;
     }
 
-    t = list_entry(tasklet_list.next, struct tasklet, list);
+    t = list_entry(tlist->next, struct tasklet, list);
     list_del_init(&t->list);
 
     BUG_ON(t->is_dead || t->is_running || !t->is_scheduled);
@@ -138,14 +159,23 @@
     if ( t->is_scheduled )
     {
         BUG_ON(t->is_dead || !list_empty(&t->list));
-        list_add_tail(&t->list, &tasklet_list);
+        if ( t->scheduled_on >= NR_CPUS )
+            list_add_tail(&t->list, tlist);
+        else
+        {
+            unsigned int cpu = t->scheduled_on;
+
+            list_add_tail(&t->list, &per_cpu(tasklet_list_pcpu, cpu));
+            cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
+        }
     }
 
     /*
      * If there is more work to do then reschedule. We don't grab more work
      * immediately as we want to allow other softirq work to happen first.
      */
-    if ( !list_empty(&tasklet_list) )
+    if ( !list_empty(&tasklet_list) ||
+        !list_empty(&this_cpu(tasklet_list_pcpu)) )
         raise_softirq(TASKLET_SOFTIRQ);
 
     spin_unlock_irq(&tasklet_lock);
@@ -186,6 +216,12 @@
 
 void __init softirq_init(void)
 {
+    int i;
+
+    for_each_possible_cpu ( i )
+    {
+        INIT_LIST_HEAD(&per_cpu(tasklet_list_pcpu, i));
+    }
     open_softirq(TASKLET_SOFTIRQ, tasklet_action);
 }
 
diff -r c02cc832cb2d -r bbb8e0f142df xen/include/asm-ia64/regs.h
--- a/xen/include/asm-ia64/regs.h	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/include/asm-ia64/regs.h	Wed Apr 14 08:52:01 2010 +0200
@@ -1,1 +1,4 @@
 #include <asm/ptrace.h>
+
+#define get_return_reg(r)      r->r8
+#define set_return_reg(r, val) r->r8 = val
diff -r c02cc832cb2d -r bbb8e0f142df xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/include/asm-x86/domain.h	Wed Apr 14 08:52:01 2010 +0200
@@ -451,9 +451,6 @@
 #define hvm_vmx         hvm_vcpu.u.vmx
 #define hvm_svm         hvm_vcpu.u.svm
 
-/* Continue the current hypercall via func(data) on specified cpu. */
-int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data);
-
 void vcpu_show_execution_state(struct vcpu *);
 void vcpu_show_registers(const struct vcpu *);
 
diff -r c02cc832cb2d -r bbb8e0f142df xen/include/asm-x86/regs.h
--- a/xen/include/asm-x86/regs.h	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/include/asm-x86/regs.h	Wed Apr 14 08:52:01 2010 +0200
@@ -19,4 +19,7 @@
     (diff == 0);                                                              \
 })
 
+#define get_return_reg(r)      r->eax
+#define set_return_reg(r, val) r->eax = val
+
 #endif /* __X86_REGS_H__ */
diff -r c02cc832cb2d -r bbb8e0f142df xen/include/xen/domain.h
--- a/xen/include/xen/domain.h	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/include/xen/domain.h	Wed Apr 14 08:52:01 2010 +0200
@@ -62,6 +62,10 @@
 bool_t domctl_lock_acquire(void);
 void domctl_lock_release(void);
 
+/* Continue the current hypercall via func(data) on specified cpu. */
+int continue_hypercall_on_cpu(int cpu, void *hdl,
+                              long (*func)(void *hdl, void *data), void *data);
+
 extern unsigned int xen_processor_pmbits;
 
 #endif /* __XEN_DOMAIN_H__ */
diff -r c02cc832cb2d -r bbb8e0f142df xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/include/xen/sched.h	Wed Apr 14 08:52:01 2010 +0200
@@ -132,8 +132,6 @@
     bool_t           defer_shutdown;
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
     bool_t           paused_for_shutdown;
-    /* VCPU affinity is temporarily locked from controller changes? */
-    bool_t           affinity_locked;
 
     /*
      * > 0: a single port is being polled;
@@ -156,6 +154,9 @@
 
     /* Bitmask of CPUs which are holding onto this VCPU's state. */
     cpumask_t        vcpu_dirty_cpumask;
+
+    /* tasklet for continue_hypercall_on_cpu() */
+    struct tasklet   cont_tasklet;
 
     struct arch_vcpu arch;
 };
@@ -581,9 +582,6 @@
 void vcpu_force_reschedule(struct vcpu *v);
 void cpu_disable_scheduler(void);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
-int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
-int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity);
-void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
 uint64_t get_cpu_idle_time(unsigned int cpu);
diff -r c02cc832cb2d -r bbb8e0f142df xen/include/xen/softirq.h
--- a/xen/include/xen/softirq.h	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/include/xen/softirq.h	Wed Apr 14 08:52:01 2010 +0200
@@ -50,14 +50,17 @@
     bool_t is_scheduled;
     bool_t is_running;
     bool_t is_dead;
+    unsigned int scheduled_on;
     void (*func)(unsigned long);
     unsigned long data;
 };
 
 #define DECLARE_TASKLET(name, func, data) \
-    struct tasklet name = { LIST_HEAD_INIT(name.list), 0, 0, 0, func, data }
+    struct tasklet name = { LIST_HEAD_INIT(name.list), 0, 0, 0, NR_CPUS, \
+                            func, data }
 
 void tasklet_schedule(struct tasklet *t);
+void tasklet_schedule_cpu(struct tasklet *t, int cpu);
 void tasklet_kill(struct tasklet *t);
 void tasklet_init(
     struct tasklet *t, void (*func)(unsigned long), unsigned long data);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-14  6:46     ` Keir Fraser
@ 2010-04-14  6:58       ` Juergen Gross
  2010-04-14  7:15         ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Juergen Gross @ 2010-04-14  6:58 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

Keir Fraser wrote:
> On 14/04/2010 05:26, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:
> 
>>> Why do you change the interface of continue_hypercall_on_cpu()? What's a
>>> 'hdl' anyway?
>> I need a way to find the migrate_info structure in case of nested calls of
>> continue_hypercall_on_cpu(). Originally this was done by storing it in the
>> vcpu structure, but this can't be done any more using tasklets. In my first
>> attempt I saved it in the per-cpu area, but this approach isn't working if
>> continue_hypercall_on_cpu() is called concurrently. So the cleanest way is
>> to pass it via a parameter.
> 
> The per-cpu area method should work fine, since Xen is non-preemptive? I
> don't think the concurrency you are concerned about can happen.

The tasklet knows only on which cpu it is running, so the data has to be
stored on the target cpu. And one pcpu can be the target of concurrent calls
from different calling cpus...


Juergen

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-14  6:58       ` Juergen Gross
@ 2010-04-14  7:15         ` Keir Fraser
  2010-04-14  7:25           ` Juergen Gross
  0 siblings, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2010-04-14  7:15 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel

On 14/04/2010 07:58, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:

>> The per-cpu area method should work fine, since Xen is non-preemptive? I
>> don't think the concurrency you are concerned about can happen.
> 
> The tasklet knows only on which cpu it is running, so the data has to be
> stored on the target cpu. And one pcpu can be the target of concurrent calls
> from different calling cpus...

A tasklet also takes an arbitrary ulong parameter, which you can cast to a
pointer to your informational structure. The parameter is specified via
tasklet_init(). That should suffice.

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-14  7:15         ` Keir Fraser
@ 2010-04-14  7:25           ` Juergen Gross
  2010-04-14  7:35             ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Juergen Gross @ 2010-04-14  7:25 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

Keir Fraser wrote:
> On 14/04/2010 07:58, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:
> 
>>> The per-cpu area method should work fine, since Xen is non-preemptive? I
>>> don't think the concurrency you are concerned about can happen.
>> The tasklet knows only on which cpu it is running, so the data has to be
>> stored on the target cpu. And one pcpu can be the target of concurrent calls
>> from different calling cpus...
> 
> A tasklet also takes an arbitrary ulong parameter, which you can cast to a
> pointer to your informational structure. The parameter is specified via
> tasklet_init(). That should suffice.

I'm already using this. The problem is to find the original calling vcpu in
case of a nested call of continue_hypercall_on_cpu() while not conflicting
with concurrent calls from other vcpus which happen to address the same pcpu.


Juergen

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-14  7:25           ` Juergen Gross
@ 2010-04-14  7:35             ` Keir Fraser
  2010-04-14  8:04               ` Juergen Gross
  0 siblings, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2010-04-14  7:35 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel

On 14/04/2010 08:25, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:

>> A tasklet also takes an arbitrary ulong parameter, which you can cast to a
>> pointer to your informational structure. The parameter is specified via
>> tasklet_init(). That should suffice.
> 
> I'm already using this. The problem is to find the original calling vcpu in
> case of a nested call of continue_hypercall_on_cpu() while not conflicting
> with concurrent calls from other vcpus which happen to address the same pcpu.

There can be only one nested invocation on any given pcpu, since a running
invocation is never preempted. Hence on entry to c_h_o_c() you can check a
per-cpu variable to see whether this invocation is nesting, or not. And if
it is, that variable can be a pointer to an info structure which includes a
pointer to the invoking vcpu.

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-14  7:35             ` Keir Fraser
@ 2010-04-14  8:04               ` Juergen Gross
  2010-04-14 10:30                 ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Juergen Gross @ 2010-04-14  8:04 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

[-- Attachment #1: Type: text/plain, Size: 1298 bytes --]

Keir Fraser wrote:
> On 14/04/2010 08:25, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:
> 
>>> A tasklet also takes an arbitrary ulong parameter, which you can cast to a
>>> pointer to your informational structure. The parameter is specified via
>>> tasklet_init(). That should suffice.
>> I'm already using this. The problem is to find the original calling vcpu in
>> case of a nested call of continue_hypercall_on_cpu() while not conflicting
>> with concurrent calls from other vcpus which happen to address the same pcpu.
> 
> There can be only one nested invocation on any given pcpu, since a running
> invocation is never preempted. Hence on entry to c_h_o_c() you can check a
> per-cpu variable to see whether this invocation is nesting, or not. And if
> it is, that variable can be a pointer to an info structure which includes a
> pointer to the invoking vcpu.

Okay, attached is the modified patch again.


Juergen

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

[-- Attachment #2: tasklet.patch --]
[-- Type: text/x-patch, Size: 14443 bytes --]

Signed-off-by: juergen.gross@ts.fujitsu.com

# HG changeset patch
# User juergen.gross@ts.fujitsu.com
# Date 1271232093 -7200
# Node ID 4f01d2bf496cbd95649775deb004e26e74008227
# Parent  c02cc832cb2d88c383d33c1ba50c381fae703308
rework of continue_hypercall_on_cpu to use tasklets, make architecture independent

diff -r c02cc832cb2d -r 4f01d2bf496c xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/arch/x86/domain.c	Wed Apr 14 10:01:33 2010 +0200
@@ -1517,82 +1517,6 @@
     flush_tlb_mask(&v->vcpu_dirty_cpumask);
 }
 
-struct migrate_info {
-    long (*func)(void *data);
-    void *data;
-    void (*saved_schedule_tail)(struct vcpu *);
-    cpumask_t saved_affinity;
-    unsigned int nest;
-};
-
-static void continue_hypercall_on_cpu_helper(struct vcpu *v)
-{
-    struct cpu_user_regs *regs = guest_cpu_user_regs();
-    struct migrate_info *info = v->arch.continue_info;
-    cpumask_t mask = info->saved_affinity;
-    void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
-
-    regs->eax = info->func(info->data);
-
-    if ( info->nest-- == 0 )
-    {
-        xfree(info);
-        v->arch.schedule_tail = saved_schedule_tail;
-        v->arch.continue_info = NULL;
-        vcpu_unlock_affinity(v, &mask);
-    }
-
-    (*saved_schedule_tail)(v);
-}
-
-int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
-{
-    struct vcpu *v = current;
-    struct migrate_info *info;
-    cpumask_t mask = cpumask_of_cpu(cpu);
-    int rc;
-
-    if ( cpu == smp_processor_id() )
-        return func(data);
-
-    info = v->arch.continue_info;
-    if ( info == NULL )
-    {
-        info = xmalloc(struct migrate_info);
-        if ( info == NULL )
-            return -ENOMEM;
-
-        rc = vcpu_lock_affinity(v, &mask);
-        if ( rc )
-        {
-            xfree(info);
-            return rc;
-        }
-
-        info->saved_schedule_tail = v->arch.schedule_tail;
-        info->saved_affinity = mask;
-        info->nest = 0;
-
-        v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
-        v->arch.continue_info = info;
-    }
-    else
-    {
-        BUG_ON(info->nest != 0);
-        rc = vcpu_locked_change_affinity(v, &mask);
-        if ( rc )
-            return rc;
-        info->nest++;
-    }
-
-    info->func = func;
-    info->data = data;
-
-    /* Dummy return value will be overwritten by new schedule_tail. */
-    BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
-    return 0;
-}
-
 #define next_arg(fmt, args) ({                                              \
     unsigned long __arg;                                                    \
     switch ( *(fmt)++ )                                                     \
diff -r c02cc832cb2d -r 4f01d2bf496c xen/arch/x86/microcode.c
--- a/xen/arch/x86/microcode.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/arch/x86/microcode.c	Wed Apr 14 10:01:33 2010 +0200
@@ -28,6 +28,7 @@
 #include <xen/sched.h>
 #include <xen/smp.h>
 #include <xen/spinlock.h>
+#include <xen/domain.h>
 #include <xen/guest_access.h>
 
 #include <asm/current.h>
diff -r c02cc832cb2d -r 4f01d2bf496c xen/arch/x86/sysctl.c
--- a/xen/arch/x86/sysctl.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/arch/x86/sysctl.c	Wed Apr 14 10:01:33 2010 +0200
@@ -14,6 +14,7 @@
 #include <public/sysctl.h>
 #include <xen/sched.h>
 #include <xen/event.h>
+#include <xen/domain.h>
 #include <xen/domain_page.h>
 #include <asm/msr.h>
 #include <xen/trace.h>
diff -r c02cc832cb2d -r 4f01d2bf496c xen/common/domain.c
--- a/xen/common/domain.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/common/domain.c	Wed Apr 14 10:01:33 2010 +0200
@@ -898,6 +898,81 @@
     return -ENOSYS;
 }
 
+struct migrate_info {
+    long (*func)(void *data);
+    void *data;
+    volatile int nest;
+    struct cpu_user_regs volatile *regs;
+    struct vcpu *v;
+};
+
+static DEFINE_PER_CPU(struct migrate_info *, continue_info);
+
+static void continue_hypercall_on_cpu_tasklet(struct migrate_info *info)
+{
+    struct vcpu *v = info->v;
+
+    while ( get_return_reg(info->regs) )
+        cpu_relax();
+
+    per_cpu(continue_info, smp_processor_id()) = info;
+
+    /* vcpu is paused now, return value can be set */
+    set_return_reg(info->regs, info->func(info->data));
+
+    per_cpu(continue_info, smp_processor_id()) = NULL;
+
+    if ( info->nest-- == 0 )
+    {
+        xfree(info);
+        vcpu_unpause(v);
+    }
+
+    return;
+}
+
+int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
+{
+    struct vcpu *v = current;
+    struct migrate_info *info = per_cpu(continue_info, smp_processor_id());
+
+    if ( cpu == smp_processor_id() )
+        return func(data);
+
+    if ( info == NULL )
+    {
+        info = xmalloc(struct migrate_info);
+        if ( info == NULL )
+            return -ENOMEM;
+
+        info->nest = 0;
+        info->v = v;
+        info->regs = guest_cpu_user_regs();
+        tasklet_init(&v->cont_tasklet,
+                     (void(*)(unsigned long))continue_hypercall_on_cpu_tasklet,
+                     (unsigned long)info);
+
+        vcpu_pause_nosync(v);
+    }
+    else
+    {
+        v = info->v;
+        BUG_ON(info->nest != 0);
+        info->nest++;
+    }
+
+    info->func = func;
+    info->data = data;
+
+    set_return_reg(info->regs, -EBUSY);
+
+    tasklet_schedule_cpu(&v->cont_tasklet, cpu);
+    raise_softirq(SCHEDULE_SOFTIRQ);
+
+    /* Dummy return value will be overwritten by tasklet. */
+    return 0;
+}
+
 /*
  * Local variables:
  * mode: C
diff -r c02cc832cb2d -r 4f01d2bf496c xen/common/schedule.c
--- a/xen/common/schedule.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/common/schedule.c	Wed Apr 14 10:01:33 2010 +0200
@@ -408,26 +408,18 @@
     }
 }
 
-static int __vcpu_set_affinity(
-    struct vcpu *v, cpumask_t *affinity,
-    bool_t old_lock_status, bool_t new_lock_status)
+int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
 {
     cpumask_t online_affinity, old_affinity;
+
+    if ( v->domain->is_pinned )
+        return -EINVAL;
 
     cpus_and(online_affinity, *affinity, cpu_online_map);
     if ( cpus_empty(online_affinity) )
         return -EINVAL;
 
     vcpu_schedule_lock_irq(v);
-
-    if ( v->affinity_locked != old_lock_status )
-    {
-        BUG_ON(!v->affinity_locked);
-        vcpu_schedule_unlock_irq(v);
-        return -EBUSY;
-    }
-
-    v->affinity_locked = new_lock_status;
 
     old_affinity = v->cpu_affinity;
     v->cpu_affinity = *affinity;
@@ -444,36 +436,6 @@
     }
 
     return 0;
-}
-
-int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    if ( v->domain->is_pinned )
-        return -EINVAL;
-    return __vcpu_set_affinity(v, affinity, 0, 0);
-}
-
-int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    return __vcpu_set_affinity(v, affinity, 0, 1);
-}
-
-int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    return __vcpu_set_affinity(v, affinity, 1, 1);
-}
-
-void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    cpumask_t online_affinity;
-
-    /* Do not fail if no CPU in old affinity mask is online. */
-    cpus_and(online_affinity, *affinity, cpu_online_map);
-    if ( cpus_empty(online_affinity) )
-        *affinity = cpu_online_map;
-
-    if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 )
-        BUG();
 }
 
 /* Block the currently-executing domain until a pertinent event occurs. */
diff -r c02cc832cb2d -r 4f01d2bf496c xen/common/softirq.c
--- a/xen/common/softirq.c	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/common/softirq.c	Wed Apr 14 10:01:33 2010 +0200
@@ -88,9 +88,11 @@
 }
 
 static LIST_HEAD(tasklet_list);
+static DEFINE_PER_CPU(struct list_head, tasklet_list_pcpu);
 static DEFINE_SPINLOCK(tasklet_lock);
 
-void tasklet_schedule(struct tasklet *t)
+static void tasklet_schedule_list(struct tasklet *t, struct list_head *tlist,
+    int cpu)
 {
     unsigned long flags;
 
@@ -101,28 +103,47 @@
         if ( !t->is_scheduled && !t->is_running )
         {
             BUG_ON(!list_empty(&t->list));
-            list_add_tail(&t->list, &tasklet_list);
+            list_add_tail(&t->list, tlist);
+            t->scheduled_on = NR_CPUS;
         }
         t->is_scheduled = 1;
-        raise_softirq(TASKLET_SOFTIRQ);
+        if ( cpu == smp_processor_id() )
+            raise_softirq(TASKLET_SOFTIRQ);
+        else if ( !t->is_running )
+            cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
+        else
+            t->scheduled_on = cpu;
     }
 
     spin_unlock_irqrestore(&tasklet_lock, flags);
 }
 
+void tasklet_schedule(struct tasklet *t)
+{
+    tasklet_schedule_list(t, &tasklet_list, smp_processor_id());
+}
+
+void tasklet_schedule_cpu(struct tasklet *t, int cpu)
+{
+    tasklet_schedule_list(t, &per_cpu(tasklet_list_pcpu, cpu), cpu);
+}
+
 static void tasklet_action(void)
 {
+    struct list_head *tlist;
     struct tasklet *t;
 
     spin_lock_irq(&tasklet_lock);
 
-    if ( list_empty(&tasklet_list) )
+    tlist = ( list_empty(&this_cpu(tasklet_list_pcpu)) ) ? &tasklet_list :
+        &this_cpu(tasklet_list_pcpu);
+    if ( list_empty(tlist) )
     {
         spin_unlock_irq(&tasklet_lock);
         return;
     }
 
-    t = list_entry(tasklet_list.next, struct tasklet, list);
+    t = list_entry(tlist->next, struct tasklet, list);
     list_del_init(&t->list);
 
     BUG_ON(t->is_dead || t->is_running || !t->is_scheduled);
@@ -138,14 +159,23 @@
     if ( t->is_scheduled )
     {
         BUG_ON(t->is_dead || !list_empty(&t->list));
-        list_add_tail(&t->list, &tasklet_list);
+        if ( t->scheduled_on >= NR_CPUS )
+            list_add_tail(&t->list, tlist);
+        else
+        {
+            unsigned int cpu = t->scheduled_on;
+
+            list_add_tail(&t->list, &per_cpu(tasklet_list_pcpu, cpu));
+            cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
+        }
     }
 
     /*
      * If there is more work to do then reschedule. We don't grab more work
      * immediately as we want to allow other softirq work to happen first.
      */
-    if ( !list_empty(&tasklet_list) )
+    if ( !list_empty(&tasklet_list) ||
+        !list_empty(&this_cpu(tasklet_list_pcpu)) )
         raise_softirq(TASKLET_SOFTIRQ);
 
     spin_unlock_irq(&tasklet_lock);
@@ -186,6 +216,12 @@
 
 void __init softirq_init(void)
 {
+    int i;
+
+    for_each_possible_cpu ( i )
+    {
+        INIT_LIST_HEAD(&per_cpu(tasklet_list_pcpu, i));
+    }
     open_softirq(TASKLET_SOFTIRQ, tasklet_action);
 }
 
diff -r c02cc832cb2d -r 4f01d2bf496c xen/include/asm-ia64/regs.h
--- a/xen/include/asm-ia64/regs.h	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/include/asm-ia64/regs.h	Wed Apr 14 10:01:33 2010 +0200
@@ -1,1 +1,4 @@
 #include <asm/ptrace.h>
+
+#define get_return_reg(r)      r->r8
+#define set_return_reg(r, val) r->r8 = val
diff -r c02cc832cb2d -r 4f01d2bf496c xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/include/asm-x86/domain.h	Wed Apr 14 10:01:33 2010 +0200
@@ -451,9 +451,6 @@
 #define hvm_vmx         hvm_vcpu.u.vmx
 #define hvm_svm         hvm_vcpu.u.svm
 
-/* Continue the current hypercall via func(data) on specified cpu. */
-int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data);
-
 void vcpu_show_execution_state(struct vcpu *);
 void vcpu_show_registers(const struct vcpu *);
 
diff -r c02cc832cb2d -r 4f01d2bf496c xen/include/asm-x86/regs.h
--- a/xen/include/asm-x86/regs.h	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/include/asm-x86/regs.h	Wed Apr 14 10:01:33 2010 +0200
@@ -19,4 +19,7 @@
     (diff == 0);                                                              \
 })
 
+#define get_return_reg(r)      r->eax
+#define set_return_reg(r, val) r->eax = val
+
 #endif /* __X86_REGS_H__ */
diff -r c02cc832cb2d -r 4f01d2bf496c xen/include/xen/domain.h
--- a/xen/include/xen/domain.h	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/include/xen/domain.h	Wed Apr 14 10:01:33 2010 +0200
@@ -62,6 +62,9 @@
 bool_t domctl_lock_acquire(void);
 void domctl_lock_release(void);
 
+/* Continue the current hypercall via func(data) on specified cpu. */
+int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data);
+
 extern unsigned int xen_processor_pmbits;
 
 #endif /* __XEN_DOMAIN_H__ */
diff -r c02cc832cb2d -r 4f01d2bf496c xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/include/xen/sched.h	Wed Apr 14 10:01:33 2010 +0200
@@ -132,8 +132,6 @@
     bool_t           defer_shutdown;
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
     bool_t           paused_for_shutdown;
-    /* VCPU affinity is temporarily locked from controller changes? */
-    bool_t           affinity_locked;
 
     /*
      * > 0: a single port is being polled;
@@ -156,6 +154,9 @@
 
     /* Bitmask of CPUs which are holding onto this VCPU's state. */
     cpumask_t        vcpu_dirty_cpumask;
+
+    /* tasklet for continue_hypercall_on_cpu() */
+    struct tasklet   cont_tasklet;
 
     struct arch_vcpu arch;
 };
@@ -581,9 +582,6 @@
 void vcpu_force_reschedule(struct vcpu *v);
 void cpu_disable_scheduler(void);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
-int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
-int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity);
-void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
 uint64_t get_cpu_idle_time(unsigned int cpu);
diff -r c02cc832cb2d -r 4f01d2bf496c xen/include/xen/softirq.h
--- a/xen/include/xen/softirq.h	Tue Apr 13 18:19:33 2010 +0100
+++ b/xen/include/xen/softirq.h	Wed Apr 14 10:01:33 2010 +0200
@@ -50,14 +50,17 @@
     bool_t is_scheduled;
     bool_t is_running;
     bool_t is_dead;
+    unsigned int scheduled_on;
     void (*func)(unsigned long);
     unsigned long data;
 };
 
 #define DECLARE_TASKLET(name, func, data) \
-    struct tasklet name = { LIST_HEAD_INIT(name.list), 0, 0, 0, func, data }
+    struct tasklet name = { LIST_HEAD_INIT(name.list), 0, 0, 0, NR_CPUS, \
+                            func, data }
 
 void tasklet_schedule(struct tasklet *t);
+void tasklet_schedule_cpu(struct tasklet *t, int cpu);
 void tasklet_kill(struct tasklet *t);
 void tasklet_init(
     struct tasklet *t, void (*func)(unsigned long), unsigned long data);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-14  8:04               ` Juergen Gross
@ 2010-04-14 10:30                 ` Keir Fraser
  2010-04-15  6:31                   ` Juergen Gross
  0 siblings, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2010-04-14 10:30 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel

On 14/04/2010 09:04, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:

>> 
>> There can be only one nested invocation on any given pcpu, since a running
>> invocation is never preempted. Hence on entry to c_h_o_c() you can check a
>> per-cpu variable to see whether this invocation is nesting, or not. And if
>> it is, that variable can be a pointer to an info structure which includes a
>> pointer to the invoking vcpu.
> 
> Okay, attached is the modified patch again.

I cleaned up some more and applied as xen-unstable:21165 and
xen-unstable:21166.

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-14 10:30                 ` Keir Fraser
@ 2010-04-15  6:31                   ` Juergen Gross
  2010-04-15  6:39                     ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Juergen Gross @ 2010-04-15  6:31 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

Keir Fraser wrote:
> On 14/04/2010 09:04, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:
> 
>>> There can be only one nested invocation on any given pcpu, since a running
>>> invocation is never preempted. Hence on entry to c_h_o_c() you can check a
>>> per-cpu variable to see whether this invocation is nesting, or not. And if
>>> it is, that variable can be a pointer to an info structure which includes a
>>> pointer to the invoking vcpu.
>> Okay, attached is the modified patch again.
> 
> I cleaned up some more and applied as xen-unstable:21165 and
> xen-unstable:21166.

Thanks. Unfortunately, your modifications are not working. Microcode update
hangs. My version worked without problems.

Juergen

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-15  6:31                   ` Juergen Gross
@ 2010-04-15  6:39                     ` Keir Fraser
  2010-04-15  7:57                       ` Juergen Gross
  0 siblings, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2010-04-15  6:39 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel

On 15/04/2010 07:31, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:

>>> Okay, attached is the modified patch again.
>> 
>> I cleaned up some more and applied as xen-unstable:21165 and
>> xen-unstable:21166.
> 
> Thanks. Unfortunately, your modifications are not working. Microcode update
> hangs. My version worked without problems.

What revision did you test? I put in some fixes as c/s 21173.

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-15  6:39                     ` Keir Fraser
@ 2010-04-15  7:57                       ` Juergen Gross
  2010-04-15  8:13                         ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Juergen Gross @ 2010-04-15  7:57 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

Keir Fraser wrote:
> On 15/04/2010 07:31, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:
> 
>>>> Okay, attached is the modified patch again.
>>> I cleaned up some more and applied as xen-unstable:21165 and
>>> xen-unstable:21166.
>> Thanks. Unfortunately, your modifications are not working. Microcode update
>> hangs. My version worked without problems.
> 
> What revision did you test? I put in some fixes as c/s 21173.

My highest c/s was 21167.
c/s 21173 is hanging, too (sorry for the delay, but I had to remove my cpupool
stuff due to the scheduler changes for credit2).

Is a call of sync_vcpu_execstate() from a tasklet really allowed? I don't
think the ASSERTs in __sync_lazy_execstate() are all fulfilled in this case.


Juergen

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-15  7:57                       ` Juergen Gross
@ 2010-04-15  8:13                         ` Keir Fraser
  2010-04-15  8:22                           ` Keir Fraser
  2010-04-15  8:29                           ` Juergen Gross
  0 siblings, 2 replies; 44+ messages in thread
From: Keir Fraser @ 2010-04-15  8:13 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel

On 15/04/2010 08:57, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:

>> What revision did you test? I put in some fixes as c/s 21173.
> 
> My highest c/s was 21167.
> c/s 21173 is hanging, too (sorry for the delay, but I had to remove my cpupool
> stuff due to the scheduler changes for credit2).

Ah yes, just done a test myself (clearly my dom0 setup is not doing
microcode updates) and I've now fixed it as c/s 21176. Thanks!

> Is a call of sync_vcpu_execstate() fro a tasklet really allowed? I don't
> think the ASSERTs in __sync_lazy_execstate() are all fulfilled in this case.

Better hope so or e.g.,
acpi_enter_sleep
->continue_hypercall_on_cpu(enter_state_helper)
->enter_state
->freeze_domains
->domain_pause
->vcpu_sleep_sync
->sync_vcpu_execstate
Also wouldn't work.

There is only one ASSERT in __sync_lazy_execstate, and it's safe for this
case. Bear in mind that our softirqs always run in the context of whatever
domain happens to be running on that cpu currently -- they don't have their
own proper vcpu context.

By the by, your original attempt at synchronisation (spin on return value in
regs changing) was risky as it could be unbounded time before the vcpu
registers get copied out of the original cpu's stack. Especially during
early dom0 boot, when the system is very idle.

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-15  8:13                         ` Keir Fraser
@ 2010-04-15  8:22                           ` Keir Fraser
  2010-04-15  9:59                             ` Jiang, Yunhong
  2010-04-15  8:29                           ` Juergen Gross
  1 sibling, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2010-04-15  8:22 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel, Jiang, Yunhong

On 15/04/2010 09:13, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:

> Better hope so or e.g.,
> acpi_enter_sleep
> ->continue_hypercall_on_cpu(enter_state_helper)
> ->enter_state
> ->freeze_domains
> ->domain_pause
> ->vcpu_sleep_sync
> ->sync_vcpu_execstate
> Also wouldn't work.

Actually that's a good example because it now won't work, but for other
reasons! The hypercall continuation can interrupt another vcpu's execution,
and then try to synchronously pause that vcpu. Which will deadlock.

Luckily I think we can re-jig this code to freeze_domains() before doing the
continue_hypercall_on_cpu(). I've cc'ed one of the CPU RAS guys. :-)

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-15  8:13                         ` Keir Fraser
  2010-04-15  8:22                           ` Keir Fraser
@ 2010-04-15  8:29                           ` Juergen Gross
  2010-04-15  9:08                             ` Keir Fraser
  1 sibling, 1 reply; 44+ messages in thread
From: Juergen Gross @ 2010-04-15  8:29 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

Keir Fraser wrote:
> On 15/04/2010 08:57, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:
> 
>>> What revision did you test? I put in some fixes as c/s 21173.
>> My highest c/s was 21167.
>> c/s 21173 is hanging, too (sorry for the delay, but I had to remove my cpupool
>> stuff due to the scheduler changes for credit2).
> 
> Ah yes, just done a test myself (clearly my dom0 setup is not doing
> microcode updates) and I've now fixed it as c/s 21176. Thanks!

Great! Works for me, too.

> 
>> Is a call of sync_vcpu_execstate() fro a tasklet really allowed? I don't
>> think the ASSERTs in __sync_lazy_execstate() are all fulfilled in this case.
> 
> Better hope so or e.g.,
> acpi_enter_sleep
> ->continue_hypercall_on_cpu(enter_state_helper)
> ->enter_state
> ->freeze_domains
> ->domain_pause
> ->vcpu_sleep_sync
> ->sync_vcpu_execstate
> Also wouldn't work.
> 
> There is only one ASSERT in __sync_lazy_execstate, and it's safe for this
> case. Bear in mind that our softirqs always run in the context of whatever
> domain happens to be running on that cpu currently -- they don't have their
> own proper vcpu context.

I don't see how it should be guaranteed that the current vcpu MUST be an idle
one...

> 
> By the by, your original attempt at synchronisation (spin on return value in
> regs changing) was risky as it could be unbounded time before the vcpu
> registers get copied out of the original cpu's stack. Especially during
> early dom0 boot, when the system is very idle.

Indeed. I realized my version had another flaw in case of nested calls of
c_h_o_c: if one call would return a non-zero value and issue another call
of c_h_o_c, the tasklet would hang for ever...
Your version is cleaner.


Juergen

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-15  8:29                           ` Juergen Gross
@ 2010-04-15  9:08                             ` Keir Fraser
  0 siblings, 0 replies; 44+ messages in thread
From: Keir Fraser @ 2010-04-15  9:08 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel

On 15/04/2010 09:29, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:

>> There is only one ASSERT in __sync_lazy_execstate, and it's safe for this
>> case. Bear in mind that our softirqs always run in the context of whatever
>> domain happens to be running on that cpu currently -- they don't have their
>> own proper vcpu context.
> 
> I don't see how it should be guaranteed that the current vcpu MUST be an idle
> one...

It's odd because someone else asked this exact same question, so the code
must be more subtle than I thought. Note that the ASSERT is inside
if(switch_required), and switch_required is true only if the vcpu whose
state is loaded on this CPU is not the same as the currently-scheduled vcpu.
This odd situation is only possible if we are lazy-syncing the former vcpu's
state, and that only occurs if the currently-running vcpu is the idle vcpu
(as all other vcpus would need their state loaded immediately, so lazy sync
does not apply).

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-15  8:22                           ` Keir Fraser
@ 2010-04-15  9:59                             ` Jiang, Yunhong
  2010-04-15 11:06                               ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Jiang, Yunhong @ 2010-04-15  9:59 UTC (permalink / raw)
  To: Keir Fraser, Juergen Gross; +Cc: xen-devel, Yu, Ke



>-----Original Message-----
>From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>Sent: Thursday, April 15, 2010 4:22 PM
>To: Juergen Gross
>Cc: xen-devel@lists.xensource.com; Jiang, Yunhong
>Subject: Re: [Xen-devel] [Patch] continue_hypercall_on_cpu rework using tasklets
>
>On 15/04/2010 09:13, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
>
>> Better hope so or e.g.,
>> acpi_enter_sleep
>> ->continue_hypercall_on_cpu(enter_state_helper)
>> ->enter_state
>> ->freeze_domains
>> ->domain_pause
>> ->vcpu_sleep_sync
>> ->sync_vcpu_execstate
>> Also wouldn't work.
>
>Actually that's a good example because it now won't work, but for other
>reasons! The hypercall continuation can interrupt another vcpu's execution,
>and then try to synchronously pause that vcpu. Which will deadlock.
>
>Luckily I think we can re-jig this code to freeze_domains() before doing the
>continue_hypercall_on_cpu(). I've cc'ed one of the CPU RAS guys. :-)

Hmm, I have cc'ed one of the PM guys because it is enter_state :-)
Can we add a check in vcpu_sleep_sync() for current? It is meaningless to cpu_relax for the current vcpu in that situation, especially if we are not in irq context.
I'm not sure why freeze_domains only checks dom0's vcpus for current, instead of all domains.

--jyh

>
> -- Keir
>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-15  9:59                             ` Jiang, Yunhong
@ 2010-04-15 11:06                               ` Keir Fraser
  2010-04-15 11:22                                 ` Keir Fraser
                                                   ` (2 more replies)
  0 siblings, 3 replies; 44+ messages in thread
From: Keir Fraser @ 2010-04-15 11:06 UTC (permalink / raw)
  To: Jiang, Yunhong, Juergen Gross; +Cc: xen-devel, Yu, Ke

On 15/04/2010 10:59, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:

>> Actually that's a good example because it now won't work, but for other
>> reasons! The hypercall continuation can interrupt another vcpu's execution,
>> and then try to synchronously pause that vcpu. Which will deadlock.
>> 
>> Luckily I think we can re-jig this code to freeze_domains() before doing the
>> continue_hypercall_on_cpu(). I've cc'ed one of the CPU RAS guys. :-)
> 
> Hmm, I have cc'ed one of the PM guys because it is enter_state :-)
> Can we add check in vcpu_sleep_sync() for current? It is meaningless to
> cpu_relax for current vcpu in that situation, especially if we are not in irq
> context.
> I'm not sure why in freeze_domains it only checkes dom0's vcpu for current,
> instead of all domains.

Well actually pausing any vcpu from within the hypercall continuation is
dangerous. The softirq handler running the hypercall continuation may have
interrupted some running VCPU X. And the VCPU Y that the continuation is
currently trying to pause may itself be trying to pause X. So we can get a
deadlock that way. The freeze_domains() *has* to be pulled outside of the
hypercall continuation.

It's a little bit similar to the super-subtle stop_machine_run deadlock
possibility I just emailed to you a second ago. :-)

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-15 11:06                               ` Keir Fraser
@ 2010-04-15 11:22                                 ` Keir Fraser
  2010-04-16  9:20                                   ` Yu, Ke
  2010-04-16  6:42                                 ` Jiang, Yunhong
  2010-04-16  6:45                                 ` Jiang, Yunhong
  2 siblings, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2010-04-15 11:22 UTC (permalink / raw)
  To: Jiang, Yunhong; +Cc: xen-devel, Yu, Ke

On 15/04/2010 12:06, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:

>> Hmm, I have cc'ed one of the PM guys because it is enter_state :-)
>> Can we add check in vcpu_sleep_sync() for current? It is meaningless to
>> cpu_relax for current vcpu in that situation, especially if we are not in irq
>> context.
>> I'm not sure why in freeze_domains it only checkes dom0's vcpu for current,
>> instead of all domains.
> 
> Well actually pausing any vcpu from within the hypercall continuation is
> dangerous. The softirq handler running the hypercall continuation may have
> interrupted some running VCPU X. And the VCPU Y that the continuation is
> currently trying to pause may itself be trying to pause X. So we can get a
> deadlock that way. The freeze_domains() *has* to be pulled outside of the
> hypercall continuation.

Changesets 21180 and 21181 in xen-unstable are my attempts to fix this. Yu
Ke, please let me know if these look and test okay for you guys.

 Thanks,
 Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-15 11:06                               ` Keir Fraser
  2010-04-15 11:22                                 ` Keir Fraser
@ 2010-04-16  6:42                                 ` Jiang, Yunhong
  2010-04-16  6:55                                   ` Juergen Gross
  2010-04-16  7:13                                   ` Keir Fraser
  2010-04-16  6:45                                 ` Jiang, Yunhong
  2 siblings, 2 replies; 44+ messages in thread
From: Jiang, Yunhong @ 2010-04-16  6:42 UTC (permalink / raw)
  To: Keir Fraser, Juergen Gross; +Cc: xen-devel, Yu, Ke



>-----Original Message-----
>From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>Sent: Thursday, April 15, 2010 7:07 PM
>To: Jiang, Yunhong; Juergen Gross
>Cc: xen-devel@lists.xensource.com; Yu, Ke
>Subject: Re: [Xen-devel] [Patch] continue_hypercall_on_cpu rework using tasklets
>
>On 15/04/2010 10:59, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:
>
>>> Actually that's a good example because it now won't work, but for other
>>> reasons! The hypercall continuation can interrupt another vcpu's execution,
>>> and then try to synchronously pause that vcpu. Which will deadlock.
>>>
>>> Luckily I think we can re-jig this code to freeze_domains() before doing the
>>> continue_hypercall_on_cpu(). I've cc'ed one of the CPU RAS guys. :-)
>>
>> Hmm, I have cc'ed one of the PM guys because it is enter_state :-)
>> Can we add check in vcpu_sleep_sync() for current? It is meaningless to
>> cpu_relax for current vcpu in that situation, especially if we are not in irq
>> context.
>> I'm not sure why in freeze_domains it only checkes dom0's vcpu for current,
>> instead of all domains.
>
>Well actually pausing any vcpu from within the hypercall continuation is
>dangerous. The softirq handler running the hypercall continuation may have
>interrupted some running VCPU X. And the VCPU Y that the continuation is
>currently trying to pause may itself be trying to pause X. So we can get a
>deadlock that way. The freeze_domains() *has* to be pulled outside of the
>hypercall continuation.
>
>It's a little bit similar to the super-subtle stop_machine_run deadlock
>possibility I just emailed to you a second ago. :-)

Thanks for pointing out the stop_machine_run deadlock issue.

After more consideration and internal discussion, it seems the key point is, the tasklet softirq is something like getting a lock for the current vcpu's state(i.e. no one else could change that state unless this softirq is finished). So any blocking action in softirq context, not just vcpu_pause_sync, is dangerous and should be avoided (we can't get a lock and do a blocking action per my understanding).
This is because a vcpu's state can only be changed by the schedule softirq (am I right on this?), while the schedule softirq can't preempt other softirqs. So, more generally, anything that will be updated by a softirq context, and will be synchronized/blocking-waited in xen's vcpu context, is in fact an implicit lock held by the softirq.

To the tricky bug on the stop_machine_run(), I think it is in fact similar to the cpu_add_remove_lock. The stop_machine_run() is a block action, so we must make sure no one will be blocking to get the lock that is held by stop_machine_run() already. At that time, we change all components that try to get the cpu_add_remove_lock to be try_lock.

The changes caused by the tasklet is, a new implicit lock is added, i.e. the vcpu's state.
The straightforward method is like the cpu_add_remove_lock: make everything that waiting for the vcpu state change to do softirq between the checking. Maybe the cleaner way is your previous suggestion, that is, put the stop_machine_run() in the idle_vcpu(), another way is, turn back to the original method, i.e. do it in the schedule_tail.

Also we are not sure why continue_hypercall_on_cpu was changed to use a tasklet. What's the benefit of it? At least I think this is quite confusing, because per our understanding, a hypercall is usually assumed to execute in the current context, while this change breaks the assumption. So any hypercall that may use this _c_h_o_c, and any function called by that hypercall, should be aware of this. I'm not sure if this is really so correct; at least it may cause trouble if someone uses this without realizing the limitation. From Juergen Gross's mail, it seems to be for cpupool, but I have no idea of the cpupool :-(

--jyh


>
> -- Keir
>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-15 11:06                               ` Keir Fraser
  2010-04-15 11:22                                 ` Keir Fraser
  2010-04-16  6:42                                 ` Jiang, Yunhong
@ 2010-04-16  6:45                                 ` Jiang, Yunhong
  2 siblings, 0 replies; 44+ messages in thread
From: Jiang, Yunhong @ 2010-04-16  6:45 UTC (permalink / raw)
  To: Jiang, Yunhong, Keir Fraser, Juergen Gross; +Cc: xen-devel, Yu, Ke

BTW, I suspect if cpu online/offline lock should really wrap the stop_machine_run(). I remember Criping questioned this also. Will have a look on it.

--jyh

>-----Original Message-----
>From: Jiang, Yunhong
>Sent: Friday, April 16, 2010 2:43 PM
>To: Keir Fraser; Juergen Gross
>Cc: xen-devel@lists.xensource.com; Yu, Ke
>Subject: RE: [Xen-devel] [Patch] continue_hypercall_on_cpu rework using tasklets
>
>
>
>>-----Original Message-----
>>From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>>Sent: Thursday, April 15, 2010 7:07 PM
>>To: Jiang, Yunhong; Juergen Gross
>>Cc: xen-devel@lists.xensource.com; Yu, Ke
>>Subject: Re: [Xen-devel] [Patch] continue_hypercall_on_cpu rework using tasklets
>>
>>On 15/04/2010 10:59, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:
>>
>>>> Actually that's a good example because it now won't work, but for other
>>>> reasons! The hypercall continuation can interrupt another vcpu's execution,
>>>> and then try to synchronously pause that vcpu. Which will deadlock.
>>>>
>>>> Luckily I think we can re-jig this code to freeze_domains() before doing the
>>>> continue_hypercall_on_cpu(). I've cc'ed one of the CPU RAS guys. :-)
>>>
>>> Hmm, I have cc'ed one of the PM guys because it is enter_state :-)
>>> Can we add check in vcpu_sleep_sync() for current? It is meaningless to
>>> cpu_relax for current vcpu in that situation, especially if we are not in irq
>>> context.
>>> I'm not sure why in freeze_domains it only checkes dom0's vcpu for current,
>>> instead of all domains.
>>
>>Well actually pausing any vcpu from within the hypercall continuation is
>>dangerous. The softirq handler running the hypercall continuation may have
>>interrupted some running VCPU X. And the VCPU Y that the continuation is
>>currently trying to pause may itself be trying to pause X. So we can get a
>>deadlock that way. The freeze_domains() *has* to be pulled outside of the
>>hypercall continuation.
>>
>>It's a little bit similar to the super-subtle stop_machine_run deadlock
>>possibility I just emailed to you a second ago. :-)
>
>Thanks for pointing out the stop_machine_run deadlock issue.
>
>After more consideration and internally discussion, seems the key point is, the
>tasklet softirq is something like getting a lock for the current vcpu's state(i.e. no one
>else could change that state unless this softirq is finished). So any block action in
>softirq context, not just vcpu_pause_sync, is dangerous and should be avoided (we
>can't get a lock and do block action per my understanding).
>This is because vcpu's state can only be changed by schedule softirq (am I right on
>this?), while schedule softirq can't prempt other softirq. So, more generally, anything
>that will be updated by a softirq context, and will be syncrhozed/blocking waitied in
>xen's vcpu context is in fact a implicit lock held by the softirq.
>
>To the tricky bug on the stop_machine_run(), I think it is in fact similar to the
>cpu_add_remove_lock. The stop_machine_run() is a block action, so we must make
>sure no one will be blocking to get the lock that is held by stop_machine_run()
>already. At that time, we change all components that try to get the
>cpu_add_remove_lock to be try_lock.
>
>The changes caused by the tasklet is, a new implicit lock is added, i.e. the vcpu's
>state.
>The straightforward method is like the cpu_add_remove_lock: make everything that
>waiting for the vcpu state change to do softirq between the checking. Maybe the
>cleaner way is your previous suggestion, that is, put the stop_machine_run() in the
>idle_vcpu(), another way is, turn back to the original method, i.e. do it in the
>schedule_tail.
>
>Also We are not sure why the continue_hypercall_on_cpu is changed to use tasklet.
>What's the benifit for it? At least I think this is quite confusing, because per our
>understanding, usually hypercall is assumed to execut in current context, while this
>change break the assumption. So any hypercall that may use this _c_h_o_c, and any
>function called by that hypercall, should be aware of this, I'm not sure if this is really
>so correct, at least it may cause trouble if someone use this without realize the
>limitation. From Juergen Gross's mail, it seems for cpupool, but I have no idea of the
>cpupool :-(
>
>--jyh
>
>
>>
>> -- Keir
>>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-16  6:42                                 ` Jiang, Yunhong
@ 2010-04-16  6:55                                   ` Juergen Gross
  2010-04-16  8:30                                     ` Jiang, Yunhong
  2010-04-16  7:13                                   ` Keir Fraser
  1 sibling, 1 reply; 44+ messages in thread
From: Juergen Gross @ 2010-04-16  6:55 UTC (permalink / raw)
  To: Jiang, Yunhong; +Cc: xen-devel, Keir Fraser, Yu, Ke

Jiang, Yunhong wrote:
> 
>> -----Original Message-----
>> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>> Sent: Thursday, April 15, 2010 7:07 PM
>> To: Jiang, Yunhong; Juergen Gross
>> Cc: xen-devel@lists.xensource.com; Yu, Ke
>> Subject: Re: [Xen-devel] [Patch] continue_hypercall_on_cpu rework using tasklets
>>
>> On 15/04/2010 10:59, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:
>>
>>>> Actually that's a good example because it now won't work, but for other
>>>> reasons! The hypercall continuation can interrupt another vcpu's execution,
>>>> and then try to synchronously pause that vcpu. Which will deadlock.
>>>>
>>>> Luckily I think we can re-jig this code to freeze_domains() before doing the
>>>> continue_hypercall_on_cpu(). I've cc'ed one of the CPU RAS guys. :-)
>>> Hmm, I have cc'ed one of the PM guys because it is enter_state :-)
>>> Can we add check in vcpu_sleep_sync() for current? It is meaningless to
>>> cpu_relax for current vcpu in that situation, especially if we are not in irq
>>> context.
>>> I'm not sure why in freeze_domains it only checkes dom0's vcpu for current,
>>> instead of all domains.
>> Well actually pausing any vcpu from within the hypercall continuation is
>> dangerous. The softirq handler running the hypercall continuation may have
>> interrupted some running VCPU X. And the VCPU Y that the continuation is
>> currently trying to pause may itself be trying to pause X. So we can get a
>> deadlock that way. The freeze_domains() *has* to be pulled outside of the
>> hypercall continuation.
>>
>> It's a little bit similar to the super-subtle stop_machine_run deadlock
>> possibility I just emailed to you a second ago. :-)
> 
> Thanks for pointing out the stop_machine_run deadlock issue.
> 
> After more consideration and internally discussion, seems the key point is, the tasklet softirq is something like getting a lock for the current vcpu's state(i.e. no one else could change that state unless this softirq is finished). So any block action in softirq context, not just vcpu_pause_sync, is dangerous and should be avoided (we can't get a lock and do block action per my understanding).
> This is because vcpu's state can only be changed by schedule softirq (am I right on this?), while schedule softirq can't prempt other softirq. So, more generally, anything that will be updated by a softirq context, and will be syncrhozed/blocking waitied in xen's vcpu context is in fact a implicit lock held by the softirq.
> 
> To the tricky bug on the stop_machine_run(), I think it is in fact similar to the cpu_add_remove_lock. The stop_machine_run() is a block action, so we must make sure no one will be blocking to get the lock that is held by stop_machine_run() already. At that time, we change all components that try to get the cpu_add_remove_lock to be try_lock.
> 
> The changes caused by the tasklet is, a new implicit lock is added, i.e. the vcpu's state.
> The straightforward method is like the cpu_add_remove_lock: make everything that waiting for the vcpu state change to do softirq between the checking. Maybe the cleaner way is your previous suggestion, that is, put the stop_machine_run() in the idle_vcpu(), another way is, turn back to the original method, i.e. do it in the schedule_tail.
> 
> Also We are not sure why the continue_hypercall_on_cpu is changed to use tasklet. What's the benifit for it? At least I think this is quite confusing, because per our understanding, usually hypercall is assumed to execut in current context, while this change break the assumption. So any hypercall that may use this _c_h_o_c, and any function called by that hypercall, should be aware of this, I'm not sure if this is really so correct, at least it may cause trouble if someone use this without realize the limitation. From Juergen Gross's mail, it seems for cpupool, but I have no idea of the cpupool :-(

Cpupools introduce something like "scheduling domains" in xen. Each cpupool
owns a set of physical cpus and has an own scheduler. Each domain is member
of a cpupool.

It is possible to move cpus or domains between pools, but a domain is always
limited to the physical cpus being in the cpupool of the domain.

This limitation makes it impossible to use continue_hypercall_on_cpu with
cpupools for any physical cpu without changing it. My original solution
temporarily moved the target cpu into the cpupool of the issuing domain,
but this was regarded as an ugly hack.


Juergen

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-16  6:42                                 ` Jiang, Yunhong
  2010-04-16  6:55                                   ` Juergen Gross
@ 2010-04-16  7:13                                   ` Keir Fraser
  2010-04-16  8:16                                     ` Jiang, Yunhong
  1 sibling, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2010-04-16  7:13 UTC (permalink / raw)
  To: Jiang, Yunhong, Juergen Gross; +Cc: xen-devel, Yu, Ke

On 16/04/2010 07:42, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:

> Thanks for pointing out the stop_machine_run deadlock issue.
> 
> After more consideration and internally discussion, seems the key point is,
> the tasklet softirq is something like getting a lock for the current vcpu's
> state(i.e. no one else could change that state unless this softirq is
> finished).
>
> So any block action in softirq context, not just vcpu_pause_sync,
> is dangerous and should be avoided (we can't get a lock and do block action
> per my understanding).
> This is because a vcpu's state can only be changed by the schedule softirq (am I
> right on this?), while the schedule softirq can't preempt other softirqs. So, more
> generally, anything that will be updated by a softirq context, and will be
> synchronized/blocking-waited on in xen's vcpu context, is in fact an implicit lock
> held by the softirq.

I think you're at risk of over-analysing the situation. You cannot safely
synchronously pause vcpus from within softirq context. I'm not sure we can
extrapolate further than that.

> To the tricky bug on the stop_machine_run(), I think it is in fact similar to
> the cpu_add_remove_lock. The stop_machine_run() is a block action, so we must
> make sure no one will be blocking to get the lock that is held by
> stop_machine_run() already. At that time, we change all components that try to
> get the cpu_add_remove_lock to be try_lock.
> 
> The changes caused by the tasklet is, a new implicit lock is added, i.e. the
> vcpu's state.

Let me be clear: the deadlock I described in stop_machine_run() is *not*
introduced by the c_h_o_c reimplementation. It has been there all along.
Nothing in my description of the deadlock depended on the implementation of
c_h_o_c: it depended only on the behaviour of stop_machine_run itself, which
does not internally use c_h_o_c.

> The straightforward method is like the cpu_add_remove_lock: make everything
> that waiting for the vcpu state change to do softirq between the checking.

Possible, could have impacts of its own of course. We couldn't do
SCHEDULE_SOFTIRQ as we would lose the caller's context, and the caller could
not hold locks when pausing others (although I suppose they generally can't
anyway).

> Maybe the cleaner way is your previous suggestion, that is, put the
> stop_machine_run() in the idle_vcpu(), another way is, turn back to the
> original method, i.e. do it in the schedule_tail.

Argh. That would be annoying!

Another possibility is to... shudder... introduce a timeout. Wait only e.g.,
1ms for all vcpus to enter the STOPMACHINE_PREPARE state and if they don't,
cancel the operation and return -EBUSY. There are already a bunch of
opportunities to return 'spurious' -EBUSY already in the cpu-offline paths,
so tools already need to know to retry at least a certain number of times.
Undoubtedly this is the dirty solution, but it is easy-ish to implement and
should only be allowing us to avoid an extremely rare deadlock possibility.
It would just be critical to pick a large enough timeout!

> Also We are not sure why the continue_hypercall_on_cpu is changed to use
> tasklet. What's the benifit for it? At least I think this is quite confusing,
> because per our understanding, usually hypercall is assumed to execut in
> current context, while this change break the assumption. So any hypercall that
> may use this _c_h_o_c, and any function called by that hypercall, should be
> aware of this, I'm not sure if this is really so correct, at least it may
> cause trouble if someone use this without realize the limitation. From Juergen
> Gross's mail, it seems for cpupool, but I have no idea of the cpupool :-(

The implementation is simpler and lets us get rid of the complexity around
vcpu affinity logic. There aren't that many users of c_h_o_c and most are
still trivially correct. I don't think the changes to c_h_o_c have
introduced any more deadlocks, apart from the freeze_domains issue (which I
believe I have now fixed).

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-16  7:13                                   ` Keir Fraser
@ 2010-04-16  8:16                                     ` Jiang, Yunhong
  2010-04-16 17:57                                       ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Jiang, Yunhong @ 2010-04-16  8:16 UTC (permalink / raw)
  To: Keir Fraser, Juergen Gross; +Cc: xen-devel, Yu, Ke



>-----Original Message-----
>From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>Sent: Friday, April 16, 2010 3:14 PM
>To: Jiang, Yunhong; Juergen Gross
>Cc: xen-devel@lists.xensource.com; Yu, Ke
>Subject: Re: [Xen-devel] [Patch] continue_hypercall_on_cpu rework using tasklets
>
>On 16/04/2010 07:42, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:
>
>> Thanks for pointing out the stop_machine_run deadlock issue.
>>
>> After more consideration and internally discussion, seems the key point is,
>> the tasklet softirq is something like getting a lock for the current vcpu's
>> state(i.e. no one else could change that state unless this softirq is
>> finished).
>>
>> So any block action in softirq context, not just vcpu_pause_sync,
>> is dangerous and should be avoided (we can't get a lock and do block action
>> per my understanding).
>> This is because vcpu's state can only be changed by schedule softirq (am I
>> right on this?), while schedule softirq can't prempt other softirq. So, more
>> generally, anything that will be updated by a softirq context, and will be
>> syncrhozed/blocking waitied in xen's vcpu context is in fact a implicit lock
>> held by the softirq.
>
>I think you're at risk of over-analysing the situation. You cannot safely
>synchronously pause vcpus from within softirq context. I'm not sure we can
>extrapolate further than that.
>
>> To the tricky bug on the stop_machine_run(), I think it is in fact similar to
>> the cpu_add_remove_lock. The stop_machine_run() is a block action, so we must
>> make sure no one will be blocking to get the lock that is held by
>> stop_machine_run() already. At that time, we change all components that try to
>> get the cpu_add_remove_lock to be try_lock.
>>
>> The changes caused by the tasklet is, a new implicit lock is added, i.e. the
>> vcpu's state.
>
>Let me be clear: the deadlock I described in stop_machine_run() is *not*
>introduced by the c_h_o_c reimplementation. It has been there all along.
>Nothing in my description of the deadlock depended on the implementation of
>c_h_o_c: it depended only on the baheviour of stop_machine_run itself, which
>does not internally use c_h_o_c.

Oops, yes, this should be there already, without c_h_o_c.
So, if another vcpu is trying to vcpu_pause() this vcpu, it will deadlock because this vcpu can't be paused, right? (I assume the same reason applies to hvmop_flush_tlb_all.)

If this is true, then how about checking !vcpu_runnable(current) in stop_machine_run()'s stopmachine_set_state()? If this check fails, it means someone is trying to change our state and a potential deadlock may happen, so we can cancel this stop_machine_run(). This is at least a bit cleaner than the timeout mechanism.

>
>> The straightforward method is like the cpu_add_remove_lock: make everything
>> that waiting for the vcpu state change to do softirq between the checking.
>
>Possible, could have impacts of its own of course. We couldn't do
>SCHEDULE_SOFTIRQ as we would lose the caller's context, and the caller could
>not hold locks when pausing others (although I suppose they generally can't
>anyway).
>
>> Maybe the cleaner way is your previous suggestion, that is, put the
>> stop_machine_run() in the idle_vcpu(), another way is, turn back to the
>> original method, i.e. do it in the schedule_tail.
>
>Argh. That would be annoying!

It seems doing it in the schedule_tail will not help with this deadlock issue.

>
>Another possibility is to... shudder... introduce a timeout. Wait only e.g.,
>1ms for all vcpus to enter the STOPMACHINE_PREPARE state and if they don't,
>cancel the operation and return -EBUSY. There are already a bunch of
>opportunities to return 'spurious' -EBUSY already in the cpu-offline paths,
>so tools already need to know to retry at least a certain number of times.
>Undoubtedly this is the dirty solution, but it is easy-ish to implement and
>should only be allowing us to avoid an extremely rare deadlock possibility.
>It would just be critical to pick a large enough timeout!
>
>> Also We are not sure why the continue_hypercall_on_cpu is changed to use
>> tasklet. What's the benifit for it? At least I think this is quite confusing,
>> because per our understanding, usually hypercall is assumed to execut in
>> current context, while this change break the assumption. So any hypercall that
>> may use this _c_h_o_c, and any function called by that hypercall, should be
>> aware of this, I'm not sure if this is really so correct, at least it may
>> cause trouble if someone use this without realize the limitation. From Juergen
>> Gross's mail, it seems for cpupool, but I have no idea of the cpupool :-(
>
>The implementation is simpler and lets us get rid of the complexity around
>vcpu affinity logic. There aren't that many users of c_h_o_c and most are
>still trivially correct. I don't think the changes to c_h_o_c have
>introduced any more deadlocks, apart from the freeze_domains issue (which I
>believe I have now fixed).

Yes, c_h_o_c does not introduce any more deadlocks. My concern is that its name may be confusing if one tries to use it for another hypercall. After all, if a hypercall, or any function used by the hypercall, depends on current, it should not use this c_h_o_c.

--jyh

>
> -- Keir
>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-16  6:55                                   ` Juergen Gross
@ 2010-04-16  8:30                                     ` Jiang, Yunhong
  0 siblings, 0 replies; 44+ messages in thread
From: Jiang, Yunhong @ 2010-04-16  8:30 UTC (permalink / raw)
  To: Juergen Gross; +Cc: xen-devel, Keir Fraser, Yu, Ke

Juergen, thanks for your explanation.

--jyh

>-----Original Message-----
>From: Juergen Gross [mailto:juergen.gross@ts.fujitsu.com]
>Sent: Friday, April 16, 2010 2:56 PM
>To: Jiang, Yunhong
>Cc: Keir Fraser; xen-devel@lists.xensource.com; Yu, Ke
>Subject: Re: [Xen-devel] [Patch] continue_hypercall_on_cpu rework using tasklets
>
>Jiang, Yunhong wrote:
>>
>>> -----Original Message-----
>>> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>>> Sent: Thursday, April 15, 2010 7:07 PM
>>> To: Jiang, Yunhong; Juergen Gross
>>> Cc: xen-devel@lists.xensource.com; Yu, Ke
>>> Subject: Re: [Xen-devel] [Patch] continue_hypercall_on_cpu rework using tasklets
>>>
>>> On 15/04/2010 10:59, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:
>>>
>>>>> Actually that's a good example because it now won't work, but for other
>>>>> reasons! The hypercall continuation can interrupt another vcpu's execution,
>>>>> and then try to synchronously pause that vcpu. Which will deadlock.
>>>>>
>>>>> Luckily I think we can re-jig this code to freeze_domains() before doing the
>>>>> continue_hypercall_on_cpu(). I've cc'ed one of the CPU RAS guys. :-)
>>>> Hmm, I have cc'ed one of the PM guys because it is enter_state :-)
>>>> Can we add check in vcpu_sleep_sync() for current? It is meaningless to
>>>> cpu_relax for current vcpu in that situation, especially if we are not in irq
>>>> context.
>>>> I'm not sure why in freeze_domains it only checkes dom0's vcpu for current,
>>>> instead of all domains.
>>> Well actually pausing any vcpu from within the hypercall continuation is
>>> dangerous. The softirq handler running the hypercall continuation may have
>>> interrupted some running VCPU X. And the VCPU Y that the continuation is
>>> currently trying to pause may itself be trying to pause X. So we can get a
>>> deadlock that way. The freeze_domains() *has* to be pulled outside of the
>>> hypercall continuation.
>>>
>>> It's a little bit similar to the super-subtle stop_machine_run deadlock
>>> possibility I just emailed to you a second ago. :-)
>>
>> Thanks for pointing out the stop_machine_run deadlock issue.
>>
>> After more consideration and internally discussion, seems the key point is, the
>tasklet softirq is something like getting a lock for the current vcpu's state(i.e. no one
>else could change that state unless this softirq is finished). So any block action in
>softirq context, not just vcpu_pause_sync, is dangerous and should be avoided (we
>can't get a lock and do block action per my understanding).
>> This is because vcpu's state can only be changed by schedule softirq (am I right on
>this?), while schedule softirq can't prempt other softirq. So, more generally, anything
>that will be updated by a softirq context, and will be syncrhozed/blocking waitied in
>xen's vcpu context is in fact a implicit lock held by the softirq.
>>
>> To the tricky bug on the stop_machine_run(), I think it is in fact similar to the
>cpu_add_remove_lock. The stop_machine_run() is a block action, so we must make
>sure no one will be blocking to get the lock that is held by stop_machine_run()
>already. At that time, we change all components that try to get the
>cpu_add_remove_lock to be try_lock.
>>
>> The changes caused by the tasklet is, a new implicit lock is added, i.e. the vcpu's
>state.
>> The straightforward method is like the cpu_add_remove_lock: make everything
>that waiting for the vcpu state change to do softirq between the checking. Maybe
>the cleaner way is your previous suggestion, that is, put the stop_machine_run() in
>the idle_vcpu(), another way is, turn back to the original method, i.e. do it in the
>schedule_tail.
>>
>> Also We are not sure why the continue_hypercall_on_cpu is changed to use tasklet.
>What's the benifit for it? At least I think this is quite confusing, because per our
>understanding, usually hypercall is assumed to execut in current context, while this
>change break the assumption. So any hypercall that may use this _c_h_o_c, and any
>function called by that hypercall, should be aware of this, I'm not sure if this is really
>so correct, at least it may cause trouble if someone use this without realize the
>limitation. From Juergen Gross's mail, it seems for cpupool, but I have no idea of the
>cpupool :-(
>
>Cpupools introduce something like "scheduling domains" in xen. Each cpupool
>owns a set of physical cpus and has an own scheduler. Each domain is member
>of a cpupool.
>
>It is possible to move cpus or domains between pools, but a domain is always
>limited to the physical cpus being in the cpupool of the domain.
>
>This limitation makes it impossible to use continue_hypercall_on_cpu with
>cpupools for any physical cpu without changing it. My original solution
>temporarily moved the target cpu into the cpupool of the issuing domain,
>but this was regarded as an ugly hack.
>
>
>Juergen
>
>--
>Juergen Gross                 Principal Developer Operating Systems
>TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
>Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
>Domagkstr. 28                           Internet: ts.fujitsu.com
>D-80807 Muenchen                 Company details:
>ts.fujitsu.com/imprint.html

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-15 11:22                                 ` Keir Fraser
@ 2010-04-16  9:20                                   ` Yu, Ke
  2010-04-16 17:51                                     ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Yu, Ke @ 2010-04-16  9:20 UTC (permalink / raw)
  To: Keir Fraser, Jiang, Yunhong; +Cc: xen-devel

> -----Original Message-----
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: Thursday, April 15, 2010 7:23 PM
> To: Jiang, Yunhong
> Cc: xen-devel@lists.xensource.com; Yu, Ke
> Subject: Re: [Xen-devel] [Patch] continue_hypercall_on_cpu rework using
> tasklets
> 
> On 15/04/2010 12:06, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
> 
> >> Hmm, I have cc'ed one of the PM guys because it is enter_state :-)
> >> Can we add check in vcpu_sleep_sync() for current? It is meaningless to
> >> cpu_relax for current vcpu in that situation, especially if we are not in irq
> >> context.
> >> I'm not sure why in freeze_domains it only checkes dom0's vcpu for
> current,
> >> instead of all domains.
> >
> > Well actually pausing any vcpu from within the hypercall continuation is
> > dangerous. The softirq handler running the hypercall continuation may
> have
> > interrupted some running VCPU X. And the VCPU Y that the continuation
> is
> > currently trying to pause may itself be trying to pause X. So we can get a
> > deadlock that way. The freeze_domains() *has* to be pulled outside of the
> > hypercall continuation.
> 
> Changesets 21180 and 21181 in xen-unstable are my attempts to fix this. Yu
> Ke, please let me know if these look and test okay for you guys.
> 

We have a stress S3 test on Xen 4.1 unstable. Unfortunately, cset 21181 cannot pass the stress test. It hangs after 48 S3 suspend/resume cycles. Another round of testing even shows it hangs after 2 suspend/resume cycles. But it is too early to say cset 21181/21180 is the culprit, because even cset 21172 (the cset ahead of the continue_hypercall_on_cpu improvement patch) cannot pass the S3 test either, although it survives more suspend/resume cycles than cset 21181. So, generally, there must be something wrong with the S3 logic since the Xen 4.0 release. We are still trying to dig it out...

Best Regards
Ke 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-16  9:20                                   ` Yu, Ke
@ 2010-04-16 17:51                                     ` Keir Fraser
  2010-04-19 10:50                                       ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2010-04-16 17:51 UTC (permalink / raw)
  To: Yu, Ke, Jiang, Yunhong; +Cc: xen-devel

On 16/04/2010 10:20, "Yu, Ke" <ke.yu@intel.com> wrote:

>> Changesets 21180 and 21181 in xen-unstable are my attempts to fix this. Yu
>> Ke, please let me know if these look and test okay for you guys.
>> 
> 
> We have a stress S3 test upon the Xen 4.1 unstable. Unfortunately, cset 21181
> cannot pass the stress. It hangs after 48 times S3 suspend/resume.  Another
> round test even shows it hangs after 2 times suspend/resume. But it is too
> early to say cset 21181/21180 is the evil, because even cset 21172 (cset ahead
> of continue_hypercall_on_cpu improvement patch) cannot pass the S3 test
> either, although it has bigger success suspend/resume times than cset 21181 .
> so generally, there must be something wrong with S3 logic since Xen 4.0
> release. We are still trying to dig it out...

21172 was not a good changeset to pick. I suggest trying xen-unstable tip
with just 21181/21180 reverted.

 Thanks,
 Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-16  8:16                                     ` Jiang, Yunhong
@ 2010-04-16 17:57                                       ` Keir Fraser
  2010-04-19  5:53                                         ` Jiang, Yunhong
  0 siblings, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2010-04-16 17:57 UTC (permalink / raw)
  To: Jiang, Yunhong, Juergen Gross; +Cc: xen-devel, Yu, Ke

On 16/04/2010 09:16, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:

> Oops, yes, this should be there already, without c_h_o_c.
> So, if another vcpu is trying to vcpu_pause() this vcpu, it will deadlock
> becaues this vcpu can't be paused, right? (I assume the same reason to
> hvmop_flush_tlb_all).

Yes.

> If this is true, then how about checking !vcpu_runnable(current) in
> stop_machine_run()'s stopmachine_set_state()? If this check failed, it means
> someone try to change our state and potential deadlock may happen, we can
> cancel this stop_machine_run()? This is at least a bit cleaner than the
> timeout mechanism.

Interesting... That could work. But...

>>> Maybe the cleaner way is your previous suggestion, that is, put the
>>> stop_machine_run() in the idle_vcpu(), another way is, turn back to the
>>> original method, i.e. do it in the schedule_tail.
>> 
>> Argh. That would be annoying!
> 
> Seems do it in the schedule_tail will not benifit this deadlock issues.

Yes, but now I think about it, scheduling c_h_o_c() and s_m_r() in idle-vcpu
context instead of softirq context might not be *that* hard to do, and it
avoids all these subtle deadlock possibilities. I think I'm warming to it
compared with the alternatives.

> Yes, c_h_o_c does not introduce any more deadlock, my concern is, it's name
> maybe confusing if one try to use it for other hypercall . After all, if a
> hypercall and function used by the hypercall depends on current, it should not
> use this c_h_o_c.

Well, I think there are two issues: (1) we run the continuation as a
softirq; (2) we don't run the continuation in the context of the original
vcpu. I think (1) is a bigger problem than (2) as it introduces the
possibility of all these nasty subtle deadlocks. (2) is obviously not
desirable, *but* I don't think any callers much care about the context of
the original caller, *and* if they do the resulting bug is generally going
to be pretty obvious. That is, the hypercall just won't work, ever -- it's
much less likely than (1) to result in very rare very subtle bugs.

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-16 17:57                                       ` Keir Fraser
@ 2010-04-19  5:53                                         ` Jiang, Yunhong
  2010-04-19  6:48                                           ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Jiang, Yunhong @ 2010-04-19  5:53 UTC (permalink / raw)
  To: Keir Fraser, Juergen Gross; +Cc: xen-devel, Yu, Ke


>-----Original Message-----
>From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>Sent: Saturday, April 17, 2010 1:58 AM
>To: Jiang, Yunhong; Juergen Gross
>Cc: xen-devel@lists.xensource.com; Yu, Ke
>Subject: Re: [Xen-devel] [Patch] continue_hypercall_on_cpu rework using tasklets
>
>On 16/04/2010 09:16, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:
>
>> Oops, yes, this should be there already, without c_h_o_c.
>> So, if another vcpu is trying to vcpu_pause() this vcpu, it will deadlock
>> becaues this vcpu can't be paused, right? (I assume the same reason to
>> hvmop_flush_tlb_all).
>
>Yes.
>
>> If this is true, then how about checking !vcpu_runnable(current) in
>> stop_machine_run()'s stopmachine_set_state()? If this check failed, it means
>> someone try to change our state and potential deadlock may happen, we can
>> cancel this stop_machine_run()? This is at least a bit cleaner than the
>> timeout mechanism.
>
>Interesting... That could work. But...

I agree that idle vcpu method is more cleaner in the long run.

>
>>>> Maybe the cleaner way is your previous suggestion, that is, put the
>>>> stop_machine_run() in the idle_vcpu(), another way is, turn back to the
>>>> original method, i.e. do it in the schedule_tail.
>>>
>>> Argh. That would be annoying!
>>
>> Seems do it in the schedule_tail will not benifit this deadlock issues.
>
>Yes, but now I think about it, scheduling c_h_o_c() and s_m_r() in idle-vcpu
>context instead of softirq context might not be *that* hard to do, and it
>avoids all these subtle deadlock possibilities. I think I'm warming to it
>compared with the alternatives.
>
>> Yes, c_h_o_c does not introduce any more deadlock, my concern is, it's name
>> maybe confusing if one try to use it for other hypercall . After all, if a
>> hypercall and function used by the hypercall depends on current, it should not
>> use this c_h_o_c.
>
>Well, I think there are two issues: (1) we run the continuation as a
>softirq; (2) we don't run the continuation in the context of the original
>vcpu. I think (1) is a bigger problem than (2) as it introduces the
>possibility of all these nasty subtle deadlocks. (2) is obviously not
>desirable, *but* I don't think any callers much care about the context of
>the original caller, *and* if they do the resulting bug is generally going
>to be pretty obvious. That is, the hypercall just won't work, ever -- it's
>much less likely than (1) to result in very rare very subtle bugs.

For issue 2, in fact this c_h_o_c is something like a xen action, i.e. it is some utility provided by the xen hypervisor that can be utilized by a guest, so maybe we can change the name of c_h_o_c to be something like call_xen_XXX?
To your idle_vcpu context work, I think it is a bit like hvm domain waiting for a IO assist from qemu (i.e., use prepare_wait_on_xen_event_channel()), is it right?

--jyh

>
> -- Keir
>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-19  5:53                                         ` Jiang, Yunhong
@ 2010-04-19  6:48                                           ` Keir Fraser
  0 siblings, 0 replies; 44+ messages in thread
From: Keir Fraser @ 2010-04-19  6:48 UTC (permalink / raw)
  To: Jiang, Yunhong, Juergen Gross; +Cc: xen-devel, Yu, Ke

On 19/04/2010 06:53, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:

>> Well, I think there are two issues: (1) we run the continuation as a
>> softirq; (2) we don't run the continuation in the context of the original
>> vcpu. I think (1) is a bigger problem than (2) as it introduces the
>> possibility of all these nasty subtle deadlocks. (2) is obviously not
>> desirable, *but* I don't think any callers much care about the context of
>> the original caller, *and* if they do the resulting bug is generally going
>> to be pretty obvious. That is, the hypercall just won't work, ever -- it's
>> much less likely than (1) to result in very rare very subtle bugs.
> 
> For issue 2, In fact this c_h_o_c is sometihng like xen action, i.e. it is
> some utility provided by xen hypervisor that can be utilized by guest, so
> maybe we can change the name of c_h_o_c, to be like call_xen_XXX?

Well, the handler does provide the final hypercall return code, so it is
still a hypercall continuation even if it doesn't run in the correct vcpu's
context.

> To your idle_vcpu context work, I think it is a bit like hvm domain waiting
> for a IO assist from qemu (i.e., use prepare_wait_on_xen_event_channel()), is
> it right?

It's easier than that because the work flow does not leave the hypervisor
itself. So we can simply pause/unpause the guest vcpu -- exactly as we are
currently doing in the new tasklet-based c_h_o_c().

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-16 17:51                                     ` Keir Fraser
@ 2010-04-19 10:50                                       ` Keir Fraser
  2010-04-19 12:04                                         ` Yu, Ke
  2010-04-20  5:35                                         ` Wei, Gang
  0 siblings, 2 replies; 44+ messages in thread
From: Keir Fraser @ 2010-04-19 10:50 UTC (permalink / raw)
  To: Yu, Ke, Jiang, Yunhong; +Cc: xen-devel

On 16/04/2010 18:51, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:

>> We have a stress S3 test upon the Xen 4.1 unstable. Unfortunately, cset 21181
>> cannot pass the stress. It hangs after 48 times S3 suspend/resume.  Another
>> round test even shows it hangs after 2 times suspend/resume. But it is too
>> early to say cset 21181/21180 is the evil, because even cset 21172 (cset
>> ahead
>> of continue_hypercall_on_cpu improvement patch) cannot pass the S3 test
>> either, although it has bigger success suspend/resume times than cset 21181 .
>> so generally, there must be something wrong with S3 logic since Xen 4.0
>> release. We are still trying to dig it out...
> 
> 21172 was not a good changeset to pick. I suggest trying xen-unstable tip
> with just 21181/21180 reverted.

Actually, try xen-unstable staging tip (c/s 21202). I've just reimplemented
tasklets, so things are quite different at latest tip.

 Thanks,
 Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-19 10:50                                       ` Keir Fraser
@ 2010-04-19 12:04                                         ` Yu, Ke
  2010-04-20  5:35                                         ` Wei, Gang
  1 sibling, 0 replies; 44+ messages in thread
From: Yu, Ke @ 2010-04-19 12:04 UTC (permalink / raw)
  To: Keir Fraser, Jiang, Yunhong; +Cc: xen-devel

> -----Original Message-----
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: Monday, April 19, 2010 6:51 PM
> To: Yu, Ke; Jiang, Yunhong
> Cc: xen-devel@lists.xensource.com
> Subject: Re: [Xen-devel] [Patch] continue_hypercall_on_cpu rework using
> tasklets
> 
> On 16/04/2010 18:51, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
> 
> >> We have a stress S3 test upon the Xen 4.1 unstable. Unfortunately, cset
> 21181
> >> cannot pass the stress. It hangs after 48 times S3 suspend/resume.
> Another
> >> round test even shows it hangs after 2 times suspend/resume. But it is
> too
> >> early to say cset 21181/21180 is the evil, because even cset 21172 (cset
> >> ahead
> >> of continue_hypercall_on_cpu improvement patch) cannot pass the S3
> test
> >> either, although it has bigger success suspend/resume times than cset
> 21181 .
> >> so generally, there must be something wrong with S3 logic since Xen 4.0
> >> release. We are still trying to dig it out...
> >
> > 21172 was not a good changeset to pick. I suggest trying xen-unstable tip
> > with just 21181/21180 reverted.
> 
> Actually, try xen-unstable staging tip (c/s 21202). I've just reimplemented
> tasklets, so things are quite different at latest tip.
> 
>  Thanks,
>  Keir
> 

Yeah, move those stuff to idle cpu context is cleaner. I will try this.

Best Regards
Ke

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-19 10:50                                       ` Keir Fraser
  2010-04-19 12:04                                         ` Yu, Ke
@ 2010-04-20  5:35                                         ` Wei, Gang
  2010-04-20 12:51                                           ` Keir Fraser
  1 sibling, 1 reply; 44+ messages in thread
From: Wei, Gang @ 2010-04-20  5:35 UTC (permalink / raw)
  To: Keir Fraser, Yu, Ke, Jiang, Yunhong; +Cc: xen-devel

On Monday, 2010-4-19 6:51 PM, Keir Fraser wrote:
> On 16/04/2010 18:51, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
> 
>>> We have a stress S3 test upon the Xen 4.1 unstable. Unfortunately,
>>> cset 21181 cannot pass the stress. It hangs after 48 times S3
>>> suspend/resume.  Another round test even shows it hangs after 2
>>> times suspend/resume. But it is too early to say cset 21181/21180
>>> is the evil, because even cset 21172 (cset ahead of
>>> continue_hypercall_on_cpu improvement patch) cannot pass the S3
>>> test either, although it has bigger success suspend/resume times
>>> than cset 21181 . so generally, there must be something wrong with
>>> S3 logic since Xen 4.0 release. We are still trying to dig it
>>> out...  
>> 
>> 21172 was not a good changeset to pick. I suggest trying
>> xen-unstable tip with just 21181/21180 reverted.
> 
> Actually, try xen-unstable staging tip (c/s 21202). I've just
> reimplemented tasklets, so things are quite different at latest tip.

Just tried c/s 21202; it hung during the 285th S3 resume on a system with only legacy HPET broadcast. And it is easier to hang on an HPET MSI broadcast capable system - it needs only <100 cycles.

Nothing wrong in the serial log, and ctrl-a didn't respond. I will try to find a debugging system with ITP to figure out the hanging point. Meanwhile, I will try a cpu online/offline stress test to see whether it is a general problem in cpu online/offline or just S3 specific.

Jimmy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-20  5:35                                         ` Wei, Gang
@ 2010-04-20 12:51                                           ` Keir Fraser
  2010-04-20 14:10                                             ` Wei, Gang
  2010-04-21  6:47                                             ` Wei, Gang
  0 siblings, 2 replies; 44+ messages in thread
From: Keir Fraser @ 2010-04-20 12:51 UTC (permalink / raw)
  To: Wei, Gang, Yu, Ke, Jiang, Yunhong; +Cc: xen-devel

On 20/04/2010 06:35, "Wei, Gang" <gang.wei@intel.com> wrote:

>>> 21172 was not a good changeset to pick. I suggest trying
>>> xen-unstable tip with just 21181/21180 reverted.
>> 
>> Actually, try xen-unstable staging tip (c/s 21202). I've just
>> reimplemented tasklets, so things are quite different at latest tip.
> 
> Just tried c/s 21202, hung during the 285th S3 resume on a system with only
> legacy HPET broadcast. And it is easier to hang on an HPET MSI broadcast
> capable system - it needs only <100 times.
> 
> Nothing wrong in the serial log, and ctrl-a didn't respond. I will try to find
> a debugging system with ITP to figure out the hanging point. Meanwhile, I will
> try a cpu online/offline stress test to see whether it is a general problem in
> cpu online/offline or just S3 specific.

How reliable is S3 in 4.0, for comparison? Did you get it rock solid?

Also, try c/s 21204 before getting into heavy debugging. It fixes being
able to access VMCS fields from c_h_o_c() context. Worth a try.

 -- Keir

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-20 12:51                                           ` Keir Fraser
@ 2010-04-20 14:10                                             ` Wei, Gang
  2010-04-21  6:47                                             ` Wei, Gang
  1 sibling, 0 replies; 44+ messages in thread
From: Wei, Gang @ 2010-04-20 14:10 UTC (permalink / raw)
  To: Keir Fraser, Yu, Ke, Jiang, Yunhong; +Cc: xen-devel

On Tuesday, 2010-4-20 8:52 PM, Keir Fraser wrote:
> How reliable is S3 in 4.0, for comparison? Did you get it rock solid?

S3 in 4.0 just passed several 100-time S3 stress tests. It is not that rock solid. So I am not sure whether it is a regression or not yet.

> Also, try c/s 21204 before getting into heavy debugging. It fixes
> being able to access VMCS fields from c_h_o_c() context. Worth a try.

I will give 21204 a try.

Jimmy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* RE: [Patch] continue_hypercall_on_cpu rework using tasklets
  2010-04-20 12:51                                           ` Keir Fraser
  2010-04-20 14:10                                             ` Wei, Gang
@ 2010-04-21  6:47                                             ` Wei, Gang
  1 sibling, 0 replies; 44+ messages in thread
From: Wei, Gang @ 2010-04-21  6:47 UTC (permalink / raw)
  To: Keir Fraser, Yu, Ke, Jiang, Yunhong; +Cc: xen-devel

On Tuesday, 2010-4-20 8:52 PM, Keir Fraser wrote:
> Also, try c/s 21204 before getting into heavy debugging. It fixes
> being able to access VMCS fields from c_h_o_c() context. Worth a try.

Tried c/s 21208, still can't pass the 100-time S3 test. Hung after 30~60 times.

Jimmy

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2009-12-30 13:57 ` Keir Fraser
@ 2009-12-30 13:59   ` Juergen Gross
  0 siblings, 0 replies; 44+ messages in thread
From: Juergen Gross @ 2009-12-30 13:59 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

Keir Fraser wrote:
> Looks great. You can rebase cpupools on top of it and both can go in after
> 4.0 is released.

Thanks!


Juergen

> 
>  -- Keir
> 
> On 30/12/2009 13:46, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:
> 
>> Hi,
>>
>> attached is a patch to use tasklets for continue_hypercall_on_cpu instead of
>> temporarily pinning the vcpu to the target physical cpu.
>>
>> This is thought as base for cpupools as Keir requested to get rid of the
>> "borrow cpu" stuff in my original solution.
>>
>> Tested on x86_64 via a little test hypercall calling continue_hypercall_on_cpu
>> with different target cpus.
>>
>> Keir, is this solution the direction you wanted to go to?
>>
>>
>> Juergen
> 
> 
> 
> 


-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [Patch] continue_hypercall_on_cpu rework using tasklets
  2009-12-30 13:46 Juergen Gross
@ 2009-12-30 13:57 ` Keir Fraser
  2009-12-30 13:59   ` Juergen Gross
  0 siblings, 1 reply; 44+ messages in thread
From: Keir Fraser @ 2009-12-30 13:57 UTC (permalink / raw)
  To: Juergen Gross, xen-devel

Looks great. You can rebase cpupools on top of it and both can go in after
4.0 is released.

 -- Keir

On 30/12/2009 13:46, "Juergen Gross" <juergen.gross@ts.fujitsu.com> wrote:

> Hi,
> 
> attached is a patch to use tasklets for continue_hypercall_on_cpu instead of
> temporarily pinning the vcpu to the target physical cpu.
> 
> This is thought as base for cpupools as Keir requested to get rid of the
> "borrow cpu" stuff in my original solution.
> 
> Tested on x86_64 via a little test hypercall calling continue_hypercall_on_cpu
> with different target cpus.
> 
> Keir, is this solution the direction you wanted to go to?
> 
> 
> Juergen

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [Patch] continue_hypercall_on_cpu rework using tasklets
@ 2009-12-30 13:46 Juergen Gross
  2009-12-30 13:57 ` Keir Fraser
  0 siblings, 1 reply; 44+ messages in thread
From: Juergen Gross @ 2009-12-30 13:46 UTC (permalink / raw)
  To: xen-devel

[-- Attachment #1: Type: text/plain, Size: 800 bytes --]

Hi,

attached is a patch to use tasklets for continue_hypercall_on_cpu instead of
temporarily pinning the vcpu to the target physical cpu.

This is thought as base for cpupools as Keir requested to get rid of the
"borrow cpu" stuff in my original solution.

Tested on x86_64 via a little test hypercall calling continue_hypercall_on_cpu
with different target cpus.

Keir, is this solution the direction you wanted to go to?


Juergen

-- 
Juergen Gross                 Principal Developer Operating Systems
TSP ES&S SWE OS6                       Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions              e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28                           Internet: ts.fujitsu.com
D-80807 Muenchen                 Company details: ts.fujitsu.com/imprint.html

[-- Attachment #2: hpc_tasklet.patch --]
[-- Type: text/x-patch, Size: 9632 bytes --]

# HG changeset patch
# User juergen.gross@ts.fujitsu.com
# Date 1262180329 -3600
# Node ID 1aa6f84167e2b4bcfed265d775bc3ac72ce321ed
# Parent  3f654b88e201a1341786a0e8725c25f40c1162b7

Signed-off-by: juergen.gross@ts.fujitsu.com

continue_hypercall_on_cpu rework using tasklets

diff -r 3f654b88e201 -r 1aa6f84167e2 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c	Tue Dec 29 15:11:47 2009 +0000
+++ b/xen/arch/x86/domain.c	Wed Dec 30 14:38:49 2009 +0100
@@ -1506,42 +1506,71 @@
 }
 
 struct migrate_info {
+    struct tasklet tasklet;
     long (*func)(void *data);
     void *data;
     void (*saved_schedule_tail)(struct vcpu *);
-    cpumask_t saved_affinity;
     unsigned int nest;
+    int ret;
+    struct vcpu *v;
+    volatile int ready;
 };
+
+static DEFINE_PER_CPU(struct migrate_info *, mig_info);
 
 static void continue_hypercall_on_cpu_helper(struct vcpu *v)
 {
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct migrate_info *info = v->arch.continue_info;
-    cpumask_t mask = info->saved_affinity;
     void (*saved_schedule_tail)(struct vcpu *) = info->saved_schedule_tail;
 
-    regs->eax = info->func(info->data);
+    while ( !info->ready )
+        cpu_relax();
+
+    regs->eax = info->ret;
 
     if ( info->nest-- == 0 )
     {
+        tasklet_kill(&info->tasklet);
         xfree(info);
         v->arch.schedule_tail = saved_schedule_tail;
         v->arch.continue_info = NULL;
-        vcpu_unlock_affinity(v, &mask);
     }
 
     (*saved_schedule_tail)(v);
+}
+
+static void continue_hypercall_on_cpu_tasklet(struct migrate_info *info)
+{
+    this_cpu(mig_info) = info;
+
+    while ( !vcpu_runnable(info->v) && info->v->is_running )
+        cpu_relax();
+
+    info->ret = info->func(info->data);
+
+    if ( info->nest == 0 )
+    {
+        info->ready = 1;
+        vcpu_wake(info->v);
+    }
+
+    this_cpu(mig_info) = NULL;
+
+    return;
 }
 
 int continue_hypercall_on_cpu(int cpu, long (*func)(void *data), void *data)
 {
     struct vcpu *v = current;
     struct migrate_info *info;
-    cpumask_t mask = cpumask_of_cpu(cpu);
-    int rc;
 
     if ( cpu == smp_processor_id() )
         return func(data);
+
+    info = this_cpu(mig_info);
+    if ( info != NULL )
+        v = info->v;
 
     info = v->arch.continue_info;
     if ( info == NULL )
@@ -1550,16 +1579,12 @@
         if ( info == NULL )
             return -ENOMEM;
 
-        rc = vcpu_lock_affinity(v, &mask);
-        if ( rc )
-        {
-            xfree(info);
-            return rc;
-        }
-
         info->saved_schedule_tail = v->arch.schedule_tail;
-        info->saved_affinity = mask;
         info->nest = 0;
+        info->v = v;
+        tasklet_init(&info->tasklet,
+                     (void(*)(unsigned long))continue_hypercall_on_cpu_tasklet,
+                     (unsigned long)info);
 
         v->arch.schedule_tail = continue_hypercall_on_cpu_helper;
         v->arch.continue_info = info;
@@ -1567,17 +1592,18 @@
     else
     {
         BUG_ON(info->nest != 0);
-        rc = vcpu_locked_change_affinity(v, &mask);
-        if ( rc )
-            return rc;
         info->nest++;
     }
 
     info->func = func;
     info->data = data;
+    info->ready = 0;
+
+    tasklet_schedule_cpu(&info->tasklet, cpu);
+    vcpu_sleep_nosync(v);
+    raise_softirq(SCHEDULE_SOFTIRQ);
 
     /* Dummy return value will be overwritten by new schedule_tail. */
-    BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id())));
     return 0;
 }
 
diff -r 3f654b88e201 -r 1aa6f84167e2 xen/common/schedule.c
--- a/xen/common/schedule.c	Tue Dec 29 15:11:47 2009 +0000
+++ b/xen/common/schedule.c	Wed Dec 30 14:38:49 2009 +0100
@@ -367,26 +367,17 @@
     }
 }
 
-static int __vcpu_set_affinity(
-    struct vcpu *v, cpumask_t *affinity,
-    bool_t old_lock_status, bool_t new_lock_status)
+int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
 {
     cpumask_t online_affinity, old_affinity;
 
+    if ( v->domain->is_pinned )
+        return -EINVAL;
     cpus_and(online_affinity, *affinity, cpu_online_map);
     if ( cpus_empty(online_affinity) )
         return -EINVAL;
 
     vcpu_schedule_lock_irq(v);
-
-    if ( v->affinity_locked != old_lock_status )
-    {
-        BUG_ON(!v->affinity_locked);
-        vcpu_schedule_unlock_irq(v);
-        return -EBUSY;
-    }
-
-    v->affinity_locked = new_lock_status;
 
     old_affinity = v->cpu_affinity;
     v->cpu_affinity = *affinity;
@@ -403,36 +394,6 @@
     }
 
     return 0;
-}
-
-int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    if ( v->domain->is_pinned )
-        return -EINVAL;
-    return __vcpu_set_affinity(v, affinity, 0, 0);
-}
-
-int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    return __vcpu_set_affinity(v, affinity, 0, 1);
-}
-
-int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    return __vcpu_set_affinity(v, affinity, 1, 1);
-}
-
-void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity)
-{
-    cpumask_t online_affinity;
-
-    /* Do not fail if no CPU in old affinity mask is online. */
-    cpus_and(online_affinity, *affinity, cpu_online_map);
-    if ( cpus_empty(online_affinity) )
-        *affinity = cpu_online_map;
-
-    if ( __vcpu_set_affinity(v, affinity, 1, 0) != 0 )
-        BUG();
 }
 
 /* Block the currently-executing domain until a pertinent event occurs. */
diff -r 3f654b88e201 -r 1aa6f84167e2 xen/common/softirq.c
--- a/xen/common/softirq.c	Tue Dec 29 15:11:47 2009 +0000
+++ b/xen/common/softirq.c	Wed Dec 30 14:38:49 2009 +0100
@@ -88,9 +88,11 @@
 }
 
 static LIST_HEAD(tasklet_list);
+static DEFINE_PER_CPU(struct list_head, tasklet_list_pcpu);
 static DEFINE_SPINLOCK(tasklet_lock);
 
-void tasklet_schedule(struct tasklet *t)
+static void tasklet_schedule_list(struct tasklet *t, struct list_head *tlist,
+    int cpu)
 {
     unsigned long flags;
 
@@ -101,28 +103,44 @@
         if ( !t->is_scheduled && !t->is_running )
         {
             BUG_ON(!list_empty(&t->list));
-            list_add_tail(&t->list, &tasklet_list);
+            list_add_tail(&t->list, tlist);
         }
         t->is_scheduled = 1;
-        raise_softirq(TASKLET_SOFTIRQ);
+        if ( cpu == smp_processor_id() )
+            raise_softirq(TASKLET_SOFTIRQ);
+        else
+            cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
     }
 
     spin_unlock_irqrestore(&tasklet_lock, flags);
 }
 
+void tasklet_schedule(struct tasklet *t)
+{
+    tasklet_schedule_list(t, &tasklet_list, smp_processor_id());
+}
+
+void tasklet_schedule_cpu(struct tasklet *t, int cpu)
+{
+    tasklet_schedule_list(t, &per_cpu(tasklet_list_pcpu, cpu), cpu);
+}
+
 static void tasklet_action(void)
 {
+    struct list_head *tlist;
     struct tasklet *t;
 
     spin_lock_irq(&tasklet_lock);
 
-    if ( list_empty(&tasklet_list) )
+    tlist = ( list_empty(&this_cpu(tasklet_list_pcpu)) ) ? &tasklet_list :
+        &this_cpu(tasklet_list_pcpu);
+    if ( list_empty(tlist) )
     {
         spin_unlock_irq(&tasklet_lock);
         return;
     }
 
-    t = list_entry(tasklet_list.next, struct tasklet, list);
+    t = list_entry(tlist->next, struct tasklet, list);
     list_del_init(&t->list);
 
     BUG_ON(t->is_dead || t->is_running || !t->is_scheduled);
@@ -138,14 +156,15 @@
     if ( t->is_scheduled )
     {
         BUG_ON(t->is_dead || !list_empty(&t->list));
-        list_add_tail(&t->list, &tasklet_list);
+        list_add_tail(&t->list, tlist);
     }
 
     /*
      * If there is more work to do then reschedule. We don't grab more work
      * immediately as we want to allow other softirq work to happen first.
      */
-    if ( !list_empty(&tasklet_list) )
+    if ( !list_empty(&tasklet_list) ||
+        !list_empty(&this_cpu(tasklet_list_pcpu)) )
         raise_softirq(TASKLET_SOFTIRQ);
 
     spin_unlock_irq(&tasklet_lock);
@@ -186,6 +205,12 @@
 
 void __init softirq_init(void)
 {
+    int i;
+
+    for_each_possible_cpu ( i )
+    {
+        INIT_LIST_HEAD(&per_cpu(tasklet_list_pcpu, i));
+    }
     open_softirq(TASKLET_SOFTIRQ, tasklet_action);
 }
 
diff -r 3f654b88e201 -r 1aa6f84167e2 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Tue Dec 29 15:11:47 2009 +0000
+++ b/xen/include/xen/sched.h	Wed Dec 30 14:38:49 2009 +0100
@@ -130,8 +130,6 @@
     bool_t           defer_shutdown;
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
     bool_t           paused_for_shutdown;
-    /* VCPU affinity is temporarily locked from controller changes? */
-    bool_t           affinity_locked;
 
     /*
      * > 0: a single port is being polled;
@@ -579,9 +577,6 @@
 void vcpu_force_reschedule(struct vcpu *v);
 void cpu_disable_scheduler(void);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
-int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
-int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity);
-void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
 uint64_t get_cpu_idle_time(unsigned int cpu);
diff -r 3f654b88e201 -r 1aa6f84167e2 xen/include/xen/softirq.h
--- a/xen/include/xen/softirq.h	Tue Dec 29 15:11:47 2009 +0000
+++ b/xen/include/xen/softirq.h	Wed Dec 30 14:38:49 2009 +0100
@@ -58,6 +58,7 @@
     struct tasklet name = { LIST_HEAD_INIT(name.list), 0, 0, 0, func, data }
 
 void tasklet_schedule(struct tasklet *t);
+void tasklet_schedule_cpu(struct tasklet *t, int cpu);
 void tasklet_kill(struct tasklet *t);
 void tasklet_init(
     struct tasklet *t, void (*func)(unsigned long), unsigned long data);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2010-04-21  6:47 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-13 13:19 [Patch] continue_hypercall_on_cpu rework using tasklets Juergen Gross
2010-04-13 13:40 ` Jan Beulich
2010-04-13 13:49   ` Juergen Gross
2010-04-13 15:08     ` Keir Fraser
2010-04-14  6:54       ` Juergen Gross
2010-04-13 14:41 ` Keir Fraser
2010-04-14  4:26   ` Juergen Gross
2010-04-14  6:46     ` Keir Fraser
2010-04-14  6:58       ` Juergen Gross
2010-04-14  7:15         ` Keir Fraser
2010-04-14  7:25           ` Juergen Gross
2010-04-14  7:35             ` Keir Fraser
2010-04-14  8:04               ` Juergen Gross
2010-04-14 10:30                 ` Keir Fraser
2010-04-15  6:31                   ` Juergen Gross
2010-04-15  6:39                     ` Keir Fraser
2010-04-15  7:57                       ` Juergen Gross
2010-04-15  8:13                         ` Keir Fraser
2010-04-15  8:22                           ` Keir Fraser
2010-04-15  9:59                             ` Jiang, Yunhong
2010-04-15 11:06                               ` Keir Fraser
2010-04-15 11:22                                 ` Keir Fraser
2010-04-16  9:20                                   ` Yu, Ke
2010-04-16 17:51                                     ` Keir Fraser
2010-04-19 10:50                                       ` Keir Fraser
2010-04-19 12:04                                         ` Yu, Ke
2010-04-20  5:35                                         ` Wei, Gang
2010-04-20 12:51                                           ` Keir Fraser
2010-04-20 14:10                                             ` Wei, Gang
2010-04-21  6:47                                             ` Wei, Gang
2010-04-16  6:42                                 ` Jiang, Yunhong
2010-04-16  6:55                                   ` Juergen Gross
2010-04-16  8:30                                     ` Jiang, Yunhong
2010-04-16  7:13                                   ` Keir Fraser
2010-04-16  8:16                                     ` Jiang, Yunhong
2010-04-16 17:57                                       ` Keir Fraser
2010-04-19  5:53                                         ` Jiang, Yunhong
2010-04-19  6:48                                           ` Keir Fraser
2010-04-16  6:45                                 ` Jiang, Yunhong
2010-04-15  8:29                           ` Juergen Gross
2010-04-15  9:08                             ` Keir Fraser
  -- strict thread matches above, loose matches on Subject: below --
2009-12-30 13:46 Juergen Gross
2009-12-30 13:57 ` Keir Fraser
2009-12-30 13:59   ` Juergen Gross

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.