From: Volodymyr Babchuk <Volodymyr_Babchuk@epam.com>
To: "Jürgen Groß" <jgross@suse.com>
Cc: George Dunlap <george.dunlap@citrix.com>,
	Dario Faggioli <dfaggioli@suse.com>,
	"xen-devel@lists.xenproject.org" <xen-devel@lists.xenproject.org>
Subject: Re: [RFC PATCH 01/10] sched: core: save IRQ state during locking
Date: Tue, 23 Feb 2021 11:15:00 +0000	[thread overview]
Message-ID: <877dmz6nnv.fsf@epam.com> (raw)
In-Reply-To: <4f7b4788-3b2e-8501-6aec-948b70320af2@suse.com>


Hi Jürgen,

Jürgen Groß writes:

> On 23.02.21 03:34, Volodymyr Babchuk wrote:
>> With XEN preemption enabled, scheduler functions can be called with
>> IRQs disabled (for example, at the end of an IRQ handler), so we should
>> save and restore the IRQ state in the scheduler code.
>
> This breaks core scheduling.

Yes, thank you. I forgot to mention that this PoC is not compatible with
core scheduling. Core scheduling is not used on ARM, so I could not test
against it anyway.

> Waiting for another sibling with interrupts disabled is an absolute
> no go, as deadlocks are the consequence.
>
> You could (in theory) make preemption and core scheduling mutually
> exclusive, but this would break the forward path to mutexes etc.
>

Well, I implemented the most naive way to enable hypervisor
preemption. I'm sure that with a more careful approach I can make it
compatible with core scheduling. There is no strict requirement to run
the scheduler with IRQs disabled.

>
> Juergen
>
>> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
>> ---
>>   xen/common/sched/core.c | 33 ++++++++++++++++++---------------
>>   1 file changed, 18 insertions(+), 15 deletions(-)
>> diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
>> index 9745a77eee..7e075613d5 100644
>> --- a/xen/common/sched/core.c
>> +++ b/xen/common/sched/core.c
>> @@ -2470,7 +2470,8 @@ static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
>>    * sched_res_rculock has been dropped.
>>    */
>>   static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>> -                                                   spinlock_t **lock, int cpu,
>> +                                                   spinlock_t **lock,
>> +                                                   unsigned long *flags, int cpu,
>>                                                      s_time_t now)
>>   {
>>       struct sched_unit *next;
>> @@ -2500,7 +2501,7 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>>                   prev->rendezvous_in_cnt++;
>>                   atomic_set(&prev->rendezvous_out_cnt, 0);
>>   -                pcpu_schedule_unlock_irq(*lock, cpu);
>> +                pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
>>                     sched_context_switch(vprev, v, false, now);
>>   @@ -2530,7 +2531,7 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>>               prev->rendezvous_in_cnt++;
>>               atomic_set(&prev->rendezvous_out_cnt, 0);
>>   -            pcpu_schedule_unlock_irq(*lock, cpu);
>> +            pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
>>                 raise_softirq(SCHED_SLAVE_SOFTIRQ);
>>               sched_context_switch(vprev, vprev, false, now);
>> @@ -2538,11 +2539,11 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>>               return NULL;         /* ARM only. */
>>           }
>>   -        pcpu_schedule_unlock_irq(*lock, cpu);
>> +        pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
>>             cpu_relax();
>>   -        *lock = pcpu_schedule_lock_irq(cpu);
>> +        *lock = pcpu_schedule_lock_irqsave(cpu, flags);
>>             /*
>>            * Check for scheduling resource switched. This happens when we are
>> @@ -2557,7 +2558,7 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>>               ASSERT(is_idle_unit(prev));
>>               atomic_set(&prev->next_task->rendezvous_out_cnt, 0);
>>               prev->rendezvous_in_cnt = 0;
>> -            pcpu_schedule_unlock_irq(*lock, cpu);
>> +            pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
>>               rcu_read_unlock(&sched_res_rculock);
>>               return NULL;
>>           }
>> @@ -2574,12 +2575,13 @@ static void sched_slave(void)
>>       spinlock_t           *lock;
>>       bool                  do_softirq = false;
>>       unsigned int          cpu = smp_processor_id();
>> +    unsigned long         flags;
>>         ASSERT_NOT_IN_ATOMIC();
>>         rcu_read_lock(&sched_res_rculock);
>>   -    lock = pcpu_schedule_lock_irq(cpu);
>> +    lock = pcpu_schedule_lock_irqsave(cpu, &flags);
>>         now = NOW();
>>   @@ -2590,7 +2592,7 @@ static void sched_slave(void)
>>             if ( v )
>>           {
>> -            pcpu_schedule_unlock_irq(lock, cpu);
>> +            pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>>                 sched_context_switch(vprev, v, false, now);
>>   @@ -2602,7 +2604,7 @@ static void sched_slave(void)
>>         if ( !prev->rendezvous_in_cnt )
>>       {
>> -        pcpu_schedule_unlock_irq(lock, cpu);
>> +        pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>>             rcu_read_unlock(&sched_res_rculock);
>>   @@ -2615,11 +2617,11 @@ static void sched_slave(void)
>>         stop_timer(&get_sched_res(cpu)->s_timer);
>>   -    next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
>> +    next = sched_wait_rendezvous_in(prev, &lock, &flags, cpu, now);
>>       if ( !next )
>>           return;
>>   -    pcpu_schedule_unlock_irq(lock, cpu);
>> +    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>>         sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
>>                            is_idle_unit(next) && !is_idle_unit(prev), now);
>> @@ -2637,6 +2639,7 @@ static void schedule(void)
>>       s_time_t              now;
>>       struct sched_resource *sr;
>>       spinlock_t           *lock;
>> +    unsigned long         flags;
>>       int cpu = smp_processor_id();
>>       unsigned int          gran;
>>   @@ -2646,7 +2649,7 @@ static void schedule(void)
>>         rcu_read_lock(&sched_res_rculock);
>>   -    lock = pcpu_schedule_lock_irq(cpu);
>> +    lock = pcpu_schedule_lock_irqsave(cpu, &flags);
>>         sr = get_sched_res(cpu);
>>       gran = sr->granularity;
>> @@ -2657,7 +2660,7 @@ static void schedule(void)
>>            * We have a race: sched_slave() should be called, so raise a softirq
>>            * in order to re-enter schedule() later and call sched_slave() now.
>>            */
>> -        pcpu_schedule_unlock_irq(lock, cpu);
>> +        pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>>             rcu_read_unlock(&sched_res_rculock);
>>   @@ -2676,7 +2679,7 @@ static void schedule(void)
>>           prev->rendezvous_in_cnt = gran;
>>           cpumask_andnot(mask, sr->cpus, cpumask_of(cpu));
>>           cpumask_raise_softirq(mask, SCHED_SLAVE_SOFTIRQ);
>> -        next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
>> +        next = sched_wait_rendezvous_in(prev, &lock, &flags, cpu, now);
>>           if ( !next )
>>               return;
>>       }
>> @@ -2687,7 +2690,7 @@ static void schedule(void)
>>           atomic_set(&next->rendezvous_out_cnt, 0);
>>       }
>>   -    pcpu_schedule_unlock_irq(lock, cpu);
>> +    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>>         vnext = sched_unit2vcpu_cpu(next, cpu);
>>       sched_context_switch(vprev, vnext,
>> 


-- 
Volodymyr Babchuk at EPAM


Thread overview: 26+ messages
2021-02-23  2:34 [RFC PATCH 00/10] Preemption in hypervisor (ARM only) Volodymyr Babchuk
2021-02-23  2:34 ` [RFC PATCH 01/10] sched: core: save IRQ state during locking Volodymyr Babchuk
2021-02-23  8:52   ` Jürgen Groß
2021-02-23 11:15     ` Volodymyr Babchuk [this message]
2021-02-24 18:29   ` Andrew Cooper
2021-02-23  2:34 ` [RFC PATCH 03/10] sched: credit2: " Volodymyr Babchuk
2021-02-23  2:34 ` [RFC PATCH 02/10] sched: rt: " Volodymyr Babchuk
2021-02-23  2:34 ` [RFC PATCH 04/10] preempt: use atomic_t to for preempt_count Volodymyr Babchuk
2021-02-23  2:34 ` [RFC PATCH 05/10] preempt: add try_preempt() function Volodymyr Babchuk
2021-02-23  2:34 ` [RFC PATCH 07/10] sched: core: remove ASSERT_NOT_IN_ATOMIC and disable preemption[!] Volodymyr Babchuk
2021-02-23  2:34 ` [RFC PATCH 06/10] arm: setup: disable preemption during startup Volodymyr Babchuk
2021-02-23  2:34 ` [RFC PATCH 08/10] arm: context_switch: allow to run with IRQs already disabled Volodymyr Babchuk
2021-02-23  2:34 ` [RFC PATCH 10/10] [HACK] alloc pages: enable preemption early Volodymyr Babchuk
2021-02-23  2:34 ` [RFC PATCH 09/10] arm: traps: try to preempt before leaving IRQ handler Volodymyr Babchuk
2021-02-23  9:02 ` [RFC PATCH 00/10] Preemption in hypervisor (ARM only) Julien Grall
2021-02-23 12:06   ` Volodymyr Babchuk
2021-02-24 10:08     ` Julien Grall
2021-02-24 20:57       ` Volodymyr Babchuk
2021-02-24 22:31         ` Julien Grall
2021-02-24 23:58           ` Volodymyr Babchuk
2021-02-25  0:39             ` Andrew Cooper
2021-02-25 12:51               ` Volodymyr Babchuk
2021-03-05  9:31                 ` Volodymyr Babchuk
2021-02-24 18:07 ` Andrew Cooper
2021-02-24 23:37   ` Volodymyr Babchuk
2021-03-01 14:39     ` George Dunlap
