All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [Xen-devel] [PATCH 44/60] xen/sched: add fall back to idle vcpu when scheduling unit
@ 2019-07-02 15:01 Jan Beulich
  2019-07-02 16:22 ` Andrew Cooper
  2019-07-03  7:13 ` Juergen Gross
  0 siblings, 2 replies; 5+ messages in thread
From: Jan Beulich @ 2019-07-02 15:01 UTC (permalink / raw)
  To: Juergen Gross
  Cc: Stefano Stabellini, Wei Liu, Konrad Wilk, George Dunlap,
	Andrew Cooper, Tim Deegan, Dario Faggioli, Julien Grall,
	Ian Jackson, xen-devel, Roger Pau Monné

>>> On 28.05.19 at 12:32, <jgross@suse.com> wrote:
> When running an idle vcpu in a non-idle scheduling unit use a specific
> guest idle loop not performing any tasklets, memory scrubbing and
> livepatching in order to avoid populating the cpu caches with memory
> used by other domains (as far as possible). Softirqs are considered to
> be safe (timers might want to be excluded, but this can be fine-tuned
> later).

How could timers be legitimately excluded? And how are softirqs
(which similarly can't be excluded here) any less risky than e.g.
tasklets?

As to scrubbing - what gets brought into cache is, except for a very
brief moment, the value the scrubbing routine actually stores. There's
no knowledge to be gained from that by a guest.

> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -159,6 +159,23 @@ static void idle_loop(void)
>      }
>  }
>  
> +/*
> + * Idle loop for siblings of active schedule units.
> + * We don't do any standard idle work like tasklets, page scrubbing or
> + * livepatching.
> + */
> +static void guest_idle_loop(void)
> +{
> +    unsigned int cpu = smp_processor_id();
> +
> +    for ( ; ; )
> +    {
> +        if ( !softirq_pending(cpu) )
> +            sched_guest_idle(pm_idle, cpu);
> +        do_softirq();
> +    }
> +}

In the comment I think you mean "siblings of <whatever> in
active schedule units"?

Having had quite some fun with soft-offlining of CPUs recently,
may I ask that you ASSERT(!cpu_is_offline(cpu)) in the loop
body, such that the absence of a call to play_dead() is also
covered?

> @@ -172,6 +189,10 @@ void startup_cpu_idle_loop(void)
>  
>  static void noreturn continue_idle_domain(struct vcpu *v)
>  {
> +    /* Idle vcpus might be attached to non-idle units! */
> +    if ( !is_idle_domain(v->sched_unit->domain) )
> +        reset_stack_and_jump(guest_idle_loop);
> +
>      reset_stack_and_jump(idle_loop);
>  }

You're aware that there's a call to check_for_livepatch_work() hidden
in reset_stack_and_jump(), which you say you don't want to allow in
this context?

> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -82,7 +82,18 @@ static struct scheduler __read_mostly ops;
>  static inline struct vcpu *sched_unit2vcpu_cpu(struct sched_unit *unit,
>                                                 unsigned int cpu)
>  {
> -    return unit->domain->vcpu[unit->unit_id + per_cpu(sched_res_idx, cpu)];
> +    unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu);
> +    const struct domain *d = unit->domain;
> +    struct vcpu *v;
> +
> +    if ( idx < d->max_vcpus && d->vcpu[idx] )
> +    {
> +        v = d->vcpu[idx];
> +        if ( v->new_state == RUNSTATE_running )
> +            return v;

Isn't this enough of the cache fill half of a gadget to warrant use of
array_index_nospec() or alike?

> @@ -209,19 +223,11 @@ static inline void vcpu_runstate_change(
>      v->runstate.state = new_state;
>  }
>  
> -static inline void sched_unit_runstate_change(struct sched_unit *unit,
> -    bool running, s_time_t new_entry_time)
> +void sched_guest_idle(void (*idle) (void), unsigned int cpu)

Stray blank between closing and opening parenthesis.

>  {
> -    struct vcpu *v;
> -
> -    for_each_sched_unit_vcpu ( unit, v )
> -        if ( running )
> -            vcpu_runstate_change(v, v->new_state, new_entry_time);
> -        else
> -            vcpu_runstate_change(v,
> -                ((v->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
> -                 (vcpu_runnable(v) ? RUNSTATE_runnable : RUNSTATE_offline)),
> -                new_entry_time);
> +    atomic_inc(&get_sched_res(cpu)->urgent_count);
> +    idle();
> +    atomic_dec(&get_sched_res(cpu)->urgent_count);
>  }

What is "urgent" about an idle vCPU filling an empty sched unit slot?
That is, why do you need to prevent the thread from sleeping as
power efficiently as possible (potentially allowing the sibling thread
to even use more resources)?

> @@ -1637,33 +1644,67 @@ static void sched_switch_units(struct sched_resource *sd,
>                                 struct sched_unit *next, struct sched_unit *prev,
>                                 s_time_t now)
>  {
> -    sd->curr = next;
> -
> -    TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id, prev->unit_id,
> -             now - prev->state_entry_time);
> -    TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, next->unit_id,
> -             (next->vcpu->runstate.state == RUNSTATE_runnable) ?
> -             (now - next->state_entry_time) : 0, prev->next_time);
> +    int cpu;

unsigned int

> @@ -1719,25 +1760,25 @@ static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now,
>      if ( prev->next_time >= 0 ) /* -ve means no limit */
>          set_timer(&sd->s_timer, now + prev->next_time);
>  
> -    if ( likely(prev != next) )
> -        sched_switch_units(sd, next, prev, now);
> +    sched_switch_units(sd, next, prev, now);
>  
>      return next;
>  }
>  
> -static void context_saved(struct vcpu *prev)
> +static void context_saved(struct sched_unit *unit)
>  {
> -    struct sched_unit *unit = prev->sched_unit;
> -
>      unit->is_running = 0;
>      unit->state_entry_time = NOW();
> +    get_sched_res(smp_processor_id())->prev = NULL;
>  
>      /* Check for migration request /after/ clearing running flag. */
>      smp_mb();
>  
> -    sched_context_saved(vcpu_scheduler(prev), unit);
> +    sched_context_saved(vcpu_scheduler(unit->vcpu), unit);

An example of it unlikely being just one of the vCPU-s in a unit that
you actually want to deal with.

> @@ -1870,7 +1921,8 @@ static void sched_slave(void)
>  
>      pcpu_schedule_unlock_irq(lock, cpu);
>  
> -    sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu), now);
> +    sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
> +                         is_idle_unit(next) && !is_idle_unit(prev), now);
>  }
>  
>  /*
> @@ -1930,7 +1982,8 @@ static void schedule(void)
>      pcpu_schedule_unlock_irq(lock, cpu);
>  
>      vnext = sched_unit2vcpu_cpu(next, cpu);
> -    sched_context_switch(vprev, vnext, now);
> +    sched_context_switch(vprev, vnext,
> +                         !is_idle_unit(prev) && is_idle_unit(next), now);
>  }

Seeing these two calls I'm not only slightly puzzled by the expressions
having operands in opposite order wrt one another, but also why the
callee can't work out the condition without the new parameter/argument.

> --- a/xen/include/asm-arm/current.h
> +++ b/xen/include/asm-arm/current.h
> @@ -18,6 +18,7 @@ DECLARE_PER_CPU(struct vcpu *, curr_vcpu);
>  
>  #define current            (this_cpu(curr_vcpu))
>  #define set_current(vcpu)  do { current = (vcpu); } while (0)
> +#define get_cpu_current(cpu)  (per_cpu(curr_vcpu, cpu))

Pointless outer pair of parentheses.

> --- a/xen/include/asm-x86/current.h
> +++ b/xen/include/asm-x86/current.h
> @@ -77,6 +77,11 @@ struct cpu_info {
>      /* get_stack_bottom() must be 16-byte aligned */
>  };
>  
> +static inline struct cpu_info *get_cpu_info_from_stack(unsigned long sp)
> +{
> +    return (struct cpu_info *)((sp | (STACK_SIZE - 1)) + 1) - 1;
> +}
> +
>  static inline struct cpu_info *get_cpu_info(void)
>  {
>  #ifdef __clang__
> @@ -87,7 +92,7 @@ static inline struct cpu_info *get_cpu_info(void)
>      register unsigned long sp asm("rsp");
>  #endif
>  
> -    return (struct cpu_info *)((sp | (STACK_SIZE - 1)) + 1) - 1;
> +    return get_cpu_info_from_stack(sp);
>  }

With these, why does ...

> --- a/xen/include/asm-x86/smp.h
> +++ b/xen/include/asm-x86/smp.h
> @@ -76,6 +76,9 @@ void set_nr_sockets(void);
>  /* Representing HT and core siblings in each socket. */
>  extern cpumask_t **socket_cpumask;
>  
> +#define get_cpu_current(cpu) \
> +    (get_cpu_info_from_stack((unsigned long)stack_base[cpu])->current_vcpu)

... this end up in a different header? (The outermost pair of parentheses
isn't strictly needed here.)

Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Xen-devel] [PATCH 44/60] xen/sched: add fall back to idle vcpu when scheduling unit
  2019-07-02 15:01 [Xen-devel] [PATCH 44/60] xen/sched: add fall back to idle vcpu when scheduling unit Jan Beulich
@ 2019-07-02 16:22 ` Andrew Cooper
  2019-07-03  7:13 ` Juergen Gross
  1 sibling, 0 replies; 5+ messages in thread
From: Andrew Cooper @ 2019-07-02 16:22 UTC (permalink / raw)
  To: Jan Beulich, Juergen Gross
  Cc: Stefano Stabellini, Wei Liu, Konrad Wilk, George Dunlap,
	Tim Deegan, Dario Faggioli, Julien Grall, xen-devel, Ian Jackson,
	Roger Pau Monné

On 02/07/2019 16:01, Jan Beulich wrote:
> As to scrubbing - what gets brought into cache is, except for a very
> brief moment, the value the scrubbing routine actually stores. There's
> no knowledge to be gained from that by a guest.

Unless we scrub with instructions which have Direct Write semantics (we
don't), the cacheline gets pulled into L1.

That means its leakable via L1TF for the entire duration between the
first speculative touch of the page (especially as prefetching is liable
to bring the content in in short order) to the retirement of the
instruction.

That said, there is zero difference (from an attackers point of view)
between scrubbing a single page as part of alloc_{dom,xen}heap_page(),
and scrubbing in the idle loop. 

The former is strictly required to stay, therefore, don't see
restricting the idle scrubbing as having any impact on security
(although it might very well have an impact on performance).

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Xen-devel] [PATCH 44/60] xen/sched: add fall back to idle vcpu when scheduling unit
  2019-07-02 15:01 [Xen-devel] [PATCH 44/60] xen/sched: add fall back to idle vcpu when scheduling unit Jan Beulich
  2019-07-02 16:22 ` Andrew Cooper
@ 2019-07-03  7:13 ` Juergen Gross
  2019-07-03  7:54   ` Jan Beulich
  1 sibling, 1 reply; 5+ messages in thread
From: Juergen Gross @ 2019-07-03  7:13 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Stefano Stabellini, Wei Liu, Konrad Wilk, George Dunlap,
	Andrew Cooper, Tim Deegan, Dario Faggioli, Julien Grall,
	Ian Jackson, xen-devel, Roger Pau Monné

On 02.07.19 17:01, Jan Beulich wrote:
>>>> On 28.05.19 at 12:32, <jgross@suse.com> wrote:
>> When running an idle vcpu in a non-idle scheduling unit use a specific
>> guest idle loop not performing any tasklets, memory scrubbing and
>> livepatching in order to avoid populating the cpu caches with memory
>> used by other domains (as far as possible). Softirqs are considered to
>> be safe (timers might want to be excluded, but this can be fine-tuned
>> later).
> 
> How could timers be legitimately excluded? And how are softirqs
> (which similarly can't be excluded here) any less risky than e.g.
> tasklets?

At least some timers are for other guests. I can drop mentioning
timers.

Tasklets are sometimes used for deferred processing of guest specific
actions, like continue_hypercall_on_cpu(). This is something we really
don't want here.

> As to scrubbing - what gets brought into cache is, except for a very
> brief moment, the value the scrubbing routine actually stores. There's
> no knowledge to be gained from that by a guest.

With Andrew's answer going in a similar direction I can add scrubbing
again.

> 
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -159,6 +159,23 @@ static void idle_loop(void)
>>       }
>>   }
>>   
>> +/*
>> + * Idle loop for siblings of active schedule units.
>> + * We don't do any standard idle work like tasklets, page scrubbing or
>> + * livepatching.
>> + */
>> +static void guest_idle_loop(void)
>> +{
>> +    unsigned int cpu = smp_processor_id();
>> +
>> +    for ( ; ; )
>> +    {
>> +        if ( !softirq_pending(cpu) )
>> +            sched_guest_idle(pm_idle, cpu);
>> +        do_softirq();
>> +    }
>> +}
> 
> In the comment I think you mean "siblings of <whatever> in
> active schedule units"?

Is "siblings of cpus in guest mode" fine?

> 
> Having had quite some fun with soft-offlining of CPUs recently,
> may I ask that you ASSERT(!cpu_is_offline(cpu)) in the loop
> body, such that the absence of a call to play_dead() is also
> covered?

Sure.

> 
>> @@ -172,6 +189,10 @@ void startup_cpu_idle_loop(void)
>>   
>>   static void noreturn continue_idle_domain(struct vcpu *v)
>>   {
>> +    /* Idle vcpus might be attached to non-idle units! */
>> +    if ( !is_idle_domain(v->sched_unit->domain) )
>> +        reset_stack_and_jump(guest_idle_loop);
>> +
>>       reset_stack_and_jump(idle_loop);
>>   }
> 
> You're aware that there's a call to check_for_livepatch_work() hidden
> in reset_stack_and_jump(), which you say you don't want to allow in
> this context?

Good point.

IMO it would be best to have a "no-livepatch" variant of
reset_stack_and_jump().

> 
>> --- a/xen/common/schedule.c
>> +++ b/xen/common/schedule.c
>> @@ -82,7 +82,18 @@ static struct scheduler __read_mostly ops;
>>   static inline struct vcpu *sched_unit2vcpu_cpu(struct sched_unit *unit,
>>                                                  unsigned int cpu)
>>   {
>> -    return unit->domain->vcpu[unit->unit_id + per_cpu(sched_res_idx, cpu)];
>> +    unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu);
>> +    const struct domain *d = unit->domain;
>> +    struct vcpu *v;
>> +
>> +    if ( idx < d->max_vcpus && d->vcpu[idx] )
>> +    {
>> +        v = d->vcpu[idx];
>> +        if ( v->new_state == RUNSTATE_running )
>> +            return v;
> 
> Isn't this enough of the cache fill half of a gadget to warrant use of
> array_index_nospec() or alike?

The input data is in no way user controlled. Do we really want to add
barriers before each array access?

> 
>> @@ -209,19 +223,11 @@ static inline void vcpu_runstate_change(
>>       v->runstate.state = new_state;
>>   }
>>   
>> -static inline void sched_unit_runstate_change(struct sched_unit *unit,
>> -    bool running, s_time_t new_entry_time)
>> +void sched_guest_idle(void (*idle) (void), unsigned int cpu)
> 
> Stray blank between closing and opening parenthesis.

Oh, indeed.

> 
>>   {
>> -    struct vcpu *v;
>> -
>> -    for_each_sched_unit_vcpu ( unit, v )
>> -        if ( running )
>> -            vcpu_runstate_change(v, v->new_state, new_entry_time);
>> -        else
>> -            vcpu_runstate_change(v,
>> -                ((v->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
>> -                 (vcpu_runnable(v) ? RUNSTATE_runnable : RUNSTATE_offline)),
>> -                new_entry_time);
>> +    atomic_inc(&get_sched_res(cpu)->urgent_count);
>> +    idle();
>> +    atomic_dec(&get_sched_res(cpu)->urgent_count);
>>   }
> 
> What is "urgent" about an idle vCPU filling an empty sched unit slot?
> That is, why do you need to prevent the thread from sleeping as
> power efficiently as possible (potentially allowing the sibling thread
> to even use more resources)?

The deeper the thread is sleeping the longer it will take to wake it up
for synchronized context switching. I'd like to avoid additional
latencies.

> 
>> @@ -1637,33 +1644,67 @@ static void sched_switch_units(struct sched_resource *sd,
>>                                  struct sched_unit *next, struct sched_unit *prev,
>>                                  s_time_t now)
>>   {
>> -    sd->curr = next;
>> -
>> -    TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id, prev->unit_id,
>> -             now - prev->state_entry_time);
>> -    TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, next->unit_id,
>> -             (next->vcpu->runstate.state == RUNSTATE_runnable) ?
>> -             (now - next->state_entry_time) : 0, prev->next_time);
>> +    int cpu;
> 
> unsigned int

Okay.

> 
>> @@ -1719,25 +1760,25 @@ static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now,
>>       if ( prev->next_time >= 0 ) /* -ve means no limit */
>>           set_timer(&sd->s_timer, now + prev->next_time);
>>   
>> -    if ( likely(prev != next) )
>> -        sched_switch_units(sd, next, prev, now);
>> +    sched_switch_units(sd, next, prev, now);
>>   
>>       return next;
>>   }
>>   
>> -static void context_saved(struct vcpu *prev)
>> +static void context_saved(struct sched_unit *unit)
>>   {
>> -    struct sched_unit *unit = prev->sched_unit;
>> -
>>       unit->is_running = 0;
>>       unit->state_entry_time = NOW();
>> +    get_sched_res(smp_processor_id())->prev = NULL;
>>   
>>       /* Check for migration request /after/ clearing running flag. */
>>       smp_mb();
>>   
>> -    sched_context_saved(vcpu_scheduler(prev), unit);
>> +    sched_context_saved(vcpu_scheduler(unit->vcpu), unit);
> 
> An example of it unlikely being just one of the vCPU-s in a unit that
> you actually want to deal with.

No. All vcpus of a unit share the same scheduler.

OTOH I think there is some room for tuning here: vcpu_scheduler is
doing quite some work to find the correct struct scheduler. Replacing
it by unit_scheduler() might be a good idea.

> 
>> @@ -1870,7 +1921,8 @@ static void sched_slave(void)
>>   
>>       pcpu_schedule_unlock_irq(lock, cpu);
>>   
>> -    sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu), now);
>> +    sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
>> +                         is_idle_unit(next) && !is_idle_unit(prev), now);
>>   }
>>   
>>   /*
>> @@ -1930,7 +1982,8 @@ static void schedule(void)
>>       pcpu_schedule_unlock_irq(lock, cpu);
>>   
>>       vnext = sched_unit2vcpu_cpu(next, cpu);
>> -    sched_context_switch(vprev, vnext, now);
>> +    sched_context_switch(vprev, vnext,
>> +                         !is_idle_unit(prev) && is_idle_unit(next), now);
>>   }
> 
> Seeing these two calls I'm not only slightly puzzled by the expressions
> having operands in opposite order wrt one another, but also why the

Ah, yes, will change one of the calls.

> callee can't work out the condition without the new parameter/argument.

The next patch adds other users of sched_context_switch().

> 
>> --- a/xen/include/asm-arm/current.h
>> +++ b/xen/include/asm-arm/current.h
>> @@ -18,6 +18,7 @@ DECLARE_PER_CPU(struct vcpu *, curr_vcpu);
>>   
>>   #define current            (this_cpu(curr_vcpu))
>>   #define set_current(vcpu)  do { current = (vcpu); } while (0)
>> +#define get_cpu_current(cpu)  (per_cpu(curr_vcpu, cpu))
> 
> Pointless outer pair of parentheses.

Okay.

> 
>> --- a/xen/include/asm-x86/current.h
>> +++ b/xen/include/asm-x86/current.h
>> @@ -77,6 +77,11 @@ struct cpu_info {
>>       /* get_stack_bottom() must be 16-byte aligned */
>>   };
>>   
>> +static inline struct cpu_info *get_cpu_info_from_stack(unsigned long sp)
>> +{
>> +    return (struct cpu_info *)((sp | (STACK_SIZE - 1)) + 1) - 1;
>> +}
>> +
>>   static inline struct cpu_info *get_cpu_info(void)
>>   {
>>   #ifdef __clang__
>> @@ -87,7 +92,7 @@ static inline struct cpu_info *get_cpu_info(void)
>>       register unsigned long sp asm("rsp");
>>   #endif
>>   
>> -    return (struct cpu_info *)((sp | (STACK_SIZE - 1)) + 1) - 1;
>> +    return get_cpu_info_from_stack(sp);
>>   }
> 
> With these, why does ...
> 
>> --- a/xen/include/asm-x86/smp.h
>> +++ b/xen/include/asm-x86/smp.h
>> @@ -76,6 +76,9 @@ void set_nr_sockets(void);
>>   /* Representing HT and core siblings in each socket. */
>>   extern cpumask_t **socket_cpumask;
>>   
>> +#define get_cpu_current(cpu) \
>> +    (get_cpu_info_from_stack((unsigned long)stack_base[cpu])->current_vcpu)
> 
> ... this end up in a different header? (The outermost pair of parentheses
> isn't strictly needed here.)

Hmm, must be a leftover from a previous version. Will move it.


Juergen

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Xen-devel] [PATCH 44/60] xen/sched: add fall back to idle vcpu when scheduling unit
  2019-07-03  7:13 ` Juergen Gross
@ 2019-07-03  7:54   ` Jan Beulich
  0 siblings, 0 replies; 5+ messages in thread
From: Jan Beulich @ 2019-07-03  7:54 UTC (permalink / raw)
  To: Juergen Gross
  Cc: Stefano Stabellini, Wei Liu, Konrad Wilk, George Dunlap,
	Andrew Cooper, Tim Deegan, Dario Faggioli, Julien Grall,
	Ian Jackson, xen-devel, Roger Pau Monné

On 03.07.2019 09:13, Juergen Gross wrote:
> On 02.07.19 17:01, Jan Beulich wrote:
>>>>> On 28.05.19 at 12:32, <jgross@suse.com> wrote:
>>> When running an idle vcpu in a non-idle scheduling unit use a specific
>>> guest idle loop not performing any tasklets, memory scrubbing and
>>> livepatching in order to avoid populating the cpu caches with memory
>>> used by other domains (as far as possible). Softirqs are considered to
>>> be safe (timers might want to be excluded, but this can be fine-tuned
>>> later).
>>
>> How could timers be legitimately excluded? And how are softirqs
>> (which similarly can't be excluded here) any less risky than e.g.
>> tasklets?
> 
> At least some timers are for other guests. I can drop mentioning
> timers.
> 
> Tasklets are sometimes used for deferred processing of guest specific
> actions, like continue_hypercall_on_cpu(). This is something we really
> don't want here.

Yet what softirqs act on you won't know either.

>>> --- a/xen/arch/x86/domain.c
>>> +++ b/xen/arch/x86/domain.c
>>> @@ -159,6 +159,23 @@ static void idle_loop(void)
>>>       }
>>>   }
>>> +/*
>>> + * Idle loop for siblings of active schedule units.
>>> + * We don't do any standard idle work like tasklets, page scrubbing or
>>> + * livepatching.
>>> + */
>>> +static void guest_idle_loop(void)
>>> +{
>>> +    unsigned int cpu = smp_processor_id();
>>> +
>>> +    for ( ; ; )
>>> +    {
>>> +        if ( !softirq_pending(cpu) )
>>> +            sched_guest_idle(pm_idle, cpu);
>>> +        do_softirq();
>>> +    }
>>> +}
>>
>> In the comment I think you mean "siblings of <whatever> in
>> active schedule units"?
> 
> Is "siblings of cpus in guest mode" fine?

Sure. As would perhaps be "siblings in active schedule units".

>>> @@ -172,6 +189,10 @@ void startup_cpu_idle_loop(void)
>>>   static void noreturn continue_idle_domain(struct vcpu *v)
>>>   {
>>> +    /* Idle vcpus might be attached to non-idle units! */
>>> +    if ( !is_idle_domain(v->sched_unit->domain) )
>>> +        reset_stack_and_jump(guest_idle_loop);
>>> +
>>>       reset_stack_and_jump(idle_loop);
>>>   }
>>
>> You're aware that there's a call to check_for_livepatch_work() hidden
>> in reset_stack_and_jump(), which you say you don't want to allow in
>> this context?
> 
> Good point.
> 
> IMO it would be best to have a "no-livepatch" variant of
> reset_stack_and_jump().

That's what I too was thinking.

>>> --- a/xen/common/schedule.c
>>> +++ b/xen/common/schedule.c
>>> @@ -82,7 +82,18 @@ static struct scheduler __read_mostly ops;
>>>   static inline struct vcpu *sched_unit2vcpu_cpu(struct sched_unit *unit,
>>>                                                  unsigned int cpu)
>>>   {
>>> -    return unit->domain->vcpu[unit->unit_id + per_cpu(sched_res_idx, cpu)];
>>> +    unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu);
>>> +    const struct domain *d = unit->domain;
>>> +    struct vcpu *v;
>>> +
>>> +    if ( idx < d->max_vcpus && d->vcpu[idx] )
>>> +    {
>>> +        v = d->vcpu[idx];
>>> +        if ( v->new_state == RUNSTATE_running )
>>> +            return v;
>>
>> Isn't this enough of the cache fill half of a gadget to warrant use of
>> array_index_nospec() or alike?
> 
> The input data is in no way user controlled. Do we really want to add
> barriers before each array access?

array_index_nospec() does not involve any barriers. Indeed I'd have
been more hesitant to suggest a change here if that would involve
adding a barrier.

As to "in no way guest controlled" - did you perhaps see the
discussion regarding speculative NULL derefs that Norbert and I had
on his grant table changes? If NULL can plausibly be found in
d->vcpu[idx] (after all you check for it), then a PV guest could
control what a speculative deref would produce as data, and hence
what might be used for further indirection.

>>> @@ -209,19 +223,11 @@ static inline void vcpu_runstate_change(
>>>       v->runstate.state = new_state;
>>>   }
>>> -static inline void sched_unit_runstate_change(struct sched_unit *unit,
>>> -    bool running, s_time_t new_entry_time)
>>> +void sched_guest_idle(void (*idle) (void), unsigned int cpu)
>>>   {
>>> -    struct vcpu *v;
>>> -
>>> -    for_each_sched_unit_vcpu ( unit, v )
>>> -        if ( running )
>>> -            vcpu_runstate_change(v, v->new_state, new_entry_time);
>>> -        else
>>> -            vcpu_runstate_change(v,
>>> -                ((v->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
>>> -                 (vcpu_runnable(v) ? RUNSTATE_runnable : RUNSTATE_offline)),
>>> -                new_entry_time);
>>> +    atomic_inc(&get_sched_res(cpu)->urgent_count);
>>> +    idle();
>>> +    atomic_dec(&get_sched_res(cpu)->urgent_count);
>>>   }
>>
>> What is "urgent" about an idle vCPU filling an empty sched unit slot?
>> That is, why do you need to prevent the thread from sleeping as
>> power efficiently as possible (potentially allowing the sibling thread
>> to even use more resources)?
> 
> The deeper the thread is sleeping the longer it will take to wake it up
> for synchronized context switching. I'd like to avoid additional
> latencies.

Whether to trade latencies for power savings imo is a decision to be made
by an admin, not us. Use of C-states incurs higher latencies elsewhere as
well, so if they're deemed a problem, use of deeper C-states could be
disallowed via already available command line options.

>>> @@ -1719,25 +1760,25 @@ static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now,
>>>       if ( prev->next_time >= 0 ) /* -ve means no limit */
>>>           set_timer(&sd->s_timer, now + prev->next_time);
>>> -    if ( likely(prev != next) )
>>> -        sched_switch_units(sd, next, prev, now);
>>> +    sched_switch_units(sd, next, prev, now);
>>>       return next;
>>>   }
>>> -static void context_saved(struct vcpu *prev)
>>> +static void context_saved(struct sched_unit *unit)
>>>   {
>>> -    struct sched_unit *unit = prev->sched_unit;
>>> -
>>>       unit->is_running = 0;
>>>       unit->state_entry_time = NOW();
>>> +    get_sched_res(smp_processor_id())->prev = NULL;
>>>       /* Check for migration request /after/ clearing running flag. */
>>>       smp_mb();
>>> -    sched_context_saved(vcpu_scheduler(prev), unit);
>>> +    sched_context_saved(vcpu_scheduler(unit->vcpu), unit);
>>
>> An example of it unlikely being just one of the vCPU-s in a unit that
>> you actually want to deal with.
> 
> No. All vcpus of a unit share the same scheduler.

Oh, right.

> OTOH I think there is some room for tuning here: vcpu_scheduler is
> doing quite some work to find the correct struct scheduler. Replacing
> it by unit_scheduler() might be a good idea.

Yes please, if such is available; otherwise I'd have suggested to have
it available. If all vCPU-s of a unit share the same scheduler (which is
to be expected, i.e. I should have paid more attention to the further
context above), why would vcpu_scheduler() be any more expensive than
unit_scheduler() anyway? It ought to simply be unit_scheduler(v->unit).

Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Xen-devel] [PATCH 44/60] xen/sched: add fall back to idle vcpu when scheduling unit
@ 2019-05-28 10:32   ` Juergen Gross
  0 siblings, 0 replies; 5+ messages in thread
From: Juergen Gross @ 2019-05-28 10:32 UTC (permalink / raw)
  To: xen-devel
  Cc: Juergen Gross, Stefano Stabellini, Wei Liu,
	Konrad Rzeszutek Wilk, George Dunlap, Andrew Cooper, Ian Jackson,
	Tim Deegan, Julien Grall, Jan Beulich, Dario Faggioli,
	Roger Pau Monné

When scheduling an unit with multiple vcpus there is no guarantee all
vcpus are available (e.g. above maxvcpus or vcpu offline). Fall back to
idle vcpu of the current cpu in that case. This requires storing the
correct schedule_unit pointer in the idle vcpu as long as it is used as
a fallback vcpu.

In order to modify the runstates of the correct vcpus when switching
schedule units merge sched_unit_runstate_change() into
sched_switch_units() and loop over the affected physical cpus instead
of the unit's vcpus. This in turn requires an access function to the
current variable of other cpus.

Today context_saved() is called in case previous and next vcpus differ
when doing a context switch. With an idle vcpu being capable to be a
substitute for an offline vcpu this is problematic when switching to
an idle scheduling unit. An idle previous vcpu leaves us in doubt which
schedule unit was active previously, so save the previous unit pointer
in the per-schedule resource area and use its value being non-NULL as
a hint whether context_saved() should be called.

When running an idle vcpu in a non-idle scheduling unit use a specific
guest idle loop not performing any tasklets, memory scrubbing and
livepatching in order to avoid populating the cpu caches with memory
used by other domains (as far as possible). Softirqs are considered to
be safe (timers might want to be excluded, but this can be fine-tuned
later).

Signed-off-by: Juergen Gross <jgross@suse.com>
---
RFC V2: new patch (Andrew Cooper)
V1: use urgent_count to select correct idle routine (Jan Beulich)
---
 xen/arch/x86/domain.c         |  21 ++++++
 xen/common/schedule.c         | 155 ++++++++++++++++++++++++++++--------------
 xen/include/asm-arm/current.h |   1 +
 xen/include/asm-x86/current.h |   7 +-
 xen/include/asm-x86/smp.h     |   3 +
 xen/include/xen/sched-if.h    |   4 +-
 xen/include/xen/sched.h       |   1 +
 7 files changed, 140 insertions(+), 52 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 98d2939daf..39a2c1a047 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -159,6 +159,23 @@ static void idle_loop(void)
     }
 }
 
+/*
+ * Idle loop for siblings of active schedule units.
+ * We don't do any standard idle work like tasklets, page scrubbing or
+ * livepatching.
+ */
+static void guest_idle_loop(void)
+{
+    unsigned int cpu = smp_processor_id();
+
+    for ( ; ; )
+    {
+        if ( !softirq_pending(cpu) )
+            sched_guest_idle(pm_idle, cpu);
+        do_softirq();
+    }
+}
+
 void startup_cpu_idle_loop(void)
 {
     struct vcpu *v = current;
@@ -172,6 +189,10 @@ void startup_cpu_idle_loop(void)
 
 static void noreturn continue_idle_domain(struct vcpu *v)
 {
+    /* Idle vcpus might be attached to non-idle units! */
+    if ( !is_idle_domain(v->sched_unit->domain) )
+        reset_stack_and_jump(guest_idle_loop);
+
     reset_stack_and_jump(idle_loop);
 }
 
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 58d3de340e..e0103fdb1d 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -82,7 +82,18 @@ static struct scheduler __read_mostly ops;
 static inline struct vcpu *sched_unit2vcpu_cpu(struct sched_unit *unit,
                                                unsigned int cpu)
 {
-    return unit->domain->vcpu[unit->unit_id + per_cpu(sched_res_idx, cpu)];
+    unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu);
+    const struct domain *d = unit->domain;
+    struct vcpu *v;
+
+    if ( idx < d->max_vcpus && d->vcpu[idx] )
+    {
+        v = d->vcpu[idx];
+        if ( v->new_state == RUNSTATE_running )
+            return v;
+    }
+
+    return idle_vcpu[cpu];
 }
 
 static inline struct scheduler *dom_scheduler(const struct domain *d)
@@ -196,8 +207,11 @@ static inline void vcpu_runstate_change(
 
     trace_runstate_change(v, new_state);
 
-    unit->runstate_cnt[v->runstate.state]--;
-    unit->runstate_cnt[new_state]++;
+    if ( !is_idle_vcpu(v) )
+    {
+        unit->runstate_cnt[v->runstate.state]--;
+        unit->runstate_cnt[new_state]++;
+    }
 
     delta = new_entry_time - v->runstate.state_entry_time;
     if ( delta > 0 )
@@ -209,19 +223,11 @@ static inline void vcpu_runstate_change(
     v->runstate.state = new_state;
 }
 
-static inline void sched_unit_runstate_change(struct sched_unit *unit,
-    bool running, s_time_t new_entry_time)
+void sched_guest_idle(void (*idle) (void), unsigned int cpu)
 {
-    struct vcpu *v;
-
-    for_each_sched_unit_vcpu ( unit, v )
-        if ( running )
-            vcpu_runstate_change(v, v->new_state, new_entry_time);
-        else
-            vcpu_runstate_change(v,
-                ((v->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
-                 (vcpu_runnable(v) ? RUNSTATE_runnable : RUNSTATE_offline)),
-                new_entry_time);
+    atomic_inc(&get_sched_res(cpu)->urgent_count);
+    idle();
+    atomic_dec(&get_sched_res(cpu)->urgent_count);
 }
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
@@ -461,6 +467,7 @@ int sched_init_vcpu(struct vcpu *v)
     if ( is_idle_domain(d) )
     {
         get_sched_res(v->processor)->curr = unit;
+        get_sched_res(v->processor)->sched_unit_idle = unit;
         v->is_running = 1;
         unit->is_running = 1;
         unit->state_entry_time = NOW();
@@ -1637,33 +1644,67 @@ static void sched_switch_units(struct sched_resource *sd,
                                struct sched_unit *next, struct sched_unit *prev,
                                s_time_t now)
 {
-    sd->curr = next;
-
-    TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id, prev->unit_id,
-             now - prev->state_entry_time);
-    TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, next->unit_id,
-             (next->vcpu->runstate.state == RUNSTATE_runnable) ?
-             (now - next->state_entry_time) : 0, prev->next_time);
+    int cpu;
 
     ASSERT(unit_running(prev));
 
-    TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id,
-             next->domain->domain_id, next->unit_id);
+    if ( prev != next )
+    {
+        sd->curr = next;
+        sd->prev = prev;
 
-    sched_unit_runstate_change(prev, false, now);
-    prev->last_run_time = now;
+        TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id,
+                 prev->unit_id, now - prev->state_entry_time);
+        TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id,
+                 next->unit_id,
+                 (next->vcpu->runstate.state == RUNSTATE_runnable) ?
+                 (now - next->state_entry_time) : 0, prev->next_time);
+        TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id,
+                 next->domain->domain_id, next->unit_id);
 
-    ASSERT(!unit_running(next));
-    sched_unit_runstate_change(next, true, now);
+        prev->last_run_time = now;
 
-    /*
-     * NB. Don't add any trace records from here until the actual context
-     * switch, else lost_records resume will not work properly.
-     */
+        ASSERT(!unit_running(next));
+
+        /*
+         * NB. Don't add any trace records from here until the actual context
+         * switch, else lost_records resume will not work properly.
+         */
 
-    ASSERT(!next->is_running);
-    next->vcpu->is_running = 1;
-    next->is_running = 1;
+        ASSERT(!next->is_running);
+        next->is_running = 1;
+
+        if ( is_idle_unit(prev) )
+        {
+            prev->runstate_cnt[RUNSTATE_running] = 0;
+            prev->runstate_cnt[RUNSTATE_runnable] = sched_granularity;
+        }
+        if ( is_idle_unit(next) )
+        {
+            next->runstate_cnt[RUNSTATE_running] = sched_granularity;
+            next->runstate_cnt[RUNSTATE_runnable] = 0;
+        }
+    }
+
+    for_each_cpu ( cpu, sd->cpus )
+    {
+        struct vcpu *vprev = get_cpu_current(cpu);
+        struct vcpu *vnext = sched_unit2vcpu_cpu(next, cpu);
+
+        if ( vprev != vnext || vprev->runstate.state != vnext->new_state )
+        {
+            vcpu_runstate_change(vprev,
+                ((vprev->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
+                 (vcpu_runnable(vprev) ? RUNSTATE_runnable : RUNSTATE_offline)),
+                now);
+            vcpu_runstate_change(vnext, vnext->new_state, now);
+        }
+
+        vnext->is_running = 1;
+
+        if ( is_idle_vcpu(vnext) )
+            vnext->sched_unit = next;
+    }
 }
 
 static bool sched_tasklet_check_cpu(unsigned int cpu)
@@ -1719,25 +1760,25 @@ static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now,
     if ( prev->next_time >= 0 ) /* -ve means no limit */
         set_timer(&sd->s_timer, now + prev->next_time);
 
-    if ( likely(prev != next) )
-        sched_switch_units(sd, next, prev, now);
+    sched_switch_units(sd, next, prev, now);
 
     return next;
 }
 
-static void context_saved(struct vcpu *prev)
+static void context_saved(struct sched_unit *unit)
 {
-    struct sched_unit *unit = prev->sched_unit;
-
     unit->is_running = 0;
     unit->state_entry_time = NOW();
+    get_sched_res(smp_processor_id())->prev = NULL;
 
     /* Check for migration request /after/ clearing running flag. */
     smp_mb();
 
-    sched_context_saved(vcpu_scheduler(prev), unit);
+    sched_context_saved(vcpu_scheduler(unit->vcpu), unit);
 
-    sched_unit_migrate_finish(unit);
+    /* Idle never migrates and idle vcpus might belong to other units. */
+    if ( !is_idle_unit(unit) )
+        sched_unit_migrate_finish(unit);
 }
 
 /*
@@ -1754,11 +1795,13 @@ static void context_saved(struct vcpu *prev)
 void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
 {
     struct sched_unit *next = vnext->sched_unit;
+    struct sched_resource *sd = get_sched_res(smp_processor_id());
 
     /* Clear running flag /after/ writing context to memory. */
     smp_wmb();
 
-    vprev->is_running = 0;
+    if ( vprev != vnext )
+        vprev->is_running = 0;
 
     if ( atomic_read(&next->rendezvous_out_cnt) )
     {
@@ -1767,20 +1810,23 @@ void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
         /* Call context_saved() before releasing other waiters. */
         if ( cnt == 1 )
         {
-            if ( vprev != vnext )
-                context_saved(vprev);
+            if ( sd->prev )
+                context_saved(sd->prev);
             atomic_set(&next->rendezvous_out_cnt, 0);
         }
         else
             while ( atomic_read(&next->rendezvous_out_cnt) )
                 cpu_relax();
     }
-    else if ( vprev != vnext && sched_granularity == 1 )
-        context_saved(vprev);
+    else if ( sd->prev )
+        context_saved(sd->prev);
+
+    if ( is_idle_vcpu(vprev) && vprev != vnext )
+        vprev->sched_unit = sd->sched_unit_idle;
 }
 
 static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
-                                 s_time_t now)
+                                 bool reset_idle_unit, s_time_t now)
 {
     if ( unlikely(vprev == vnext) )
     {
@@ -1789,6 +1835,11 @@ static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
                  now - vprev->runstate.state_entry_time,
                  vprev->sched_unit->next_time);
         sched_context_switched(vprev, vnext);
+
+        if ( reset_idle_unit )
+            vnext->sched_unit =
+                get_sched_res(smp_processor_id())->sched_unit_idle;
+
         trace_continue_running(vnext);
         return continue_running(vprev);
     }
@@ -1870,7 +1921,8 @@ static void sched_slave(void)
 
     pcpu_schedule_unlock_irq(lock, cpu);
 
-    sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu), now);
+    sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
+                         is_idle_unit(next) && !is_idle_unit(prev), now);
 }
 
 /*
@@ -1930,7 +1982,8 @@ static void schedule(void)
     pcpu_schedule_unlock_irq(lock, cpu);
 
     vnext = sched_unit2vcpu_cpu(next, cpu);
-    sched_context_switch(vprev, vnext, now);
+    sched_context_switch(vprev, vnext,
+                         !is_idle_unit(prev) && is_idle_unit(next), now);
 }
 
 /* The scheduler timer: force a run through the scheduler */
@@ -2015,6 +2068,7 @@ static int cpu_schedule_up(unsigned int cpu)
         return -ENOMEM;
 
     sd->curr = idle_vcpu[cpu]->sched_unit;
+    sd->sched_unit_idle = idle_vcpu[cpu]->sched_unit;
 
     /*
      * We don't want to risk calling xfree() on an sd->sched_priv
@@ -2216,6 +2270,7 @@ void __init scheduler_init(void)
     if ( vcpu_create(idle_domain, 0) == NULL )
         BUG();
     get_sched_res(0)->curr = idle_vcpu[0]->sched_unit;
+    get_sched_res(0)->sched_unit_idle = idle_vcpu[0]->sched_unit;
     get_sched_res(0)->sched_priv = sched_alloc_pdata(&ops, 0);
     BUG_ON(IS_ERR(get_sched_res(0)->sched_priv));
     scheduler_percpu_init(0);
diff --git a/xen/include/asm-arm/current.h b/xen/include/asm-arm/current.h
index c4af66fbb9..a7602eef8c 100644
--- a/xen/include/asm-arm/current.h
+++ b/xen/include/asm-arm/current.h
@@ -18,6 +18,7 @@ DECLARE_PER_CPU(struct vcpu *, curr_vcpu);
 
 #define current            (this_cpu(curr_vcpu))
 #define set_current(vcpu)  do { current = (vcpu); } while (0)
+#define get_cpu_current(cpu)  (per_cpu(curr_vcpu, cpu))
 
 /* Per-VCPU state that lives at the top of the stack */
 struct cpu_info {
diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h
index f3508c3c08..1e807d63cf 100644
--- a/xen/include/asm-x86/current.h
+++ b/xen/include/asm-x86/current.h
@@ -77,6 +77,11 @@ struct cpu_info {
     /* get_stack_bottom() must be 16-byte aligned */
 };
 
+static inline struct cpu_info *get_cpu_info_from_stack(unsigned long sp)
+{
+    return (struct cpu_info *)((sp | (STACK_SIZE - 1)) + 1) - 1;
+}
+
 static inline struct cpu_info *get_cpu_info(void)
 {
 #ifdef __clang__
@@ -87,7 +92,7 @@ static inline struct cpu_info *get_cpu_info(void)
     register unsigned long sp asm("rsp");
 #endif
 
-    return (struct cpu_info *)((sp | (STACK_SIZE - 1)) + 1) - 1;
+    return get_cpu_info_from_stack(sp);
 }
 
 #define get_current()         (get_cpu_info()->current_vcpu)
diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h
index 9f533f9072..51a31ab00a 100644
--- a/xen/include/asm-x86/smp.h
+++ b/xen/include/asm-x86/smp.h
@@ -76,6 +76,9 @@ void set_nr_sockets(void);
 /* Representing HT and core siblings in each socket. */
 extern cpumask_t **socket_cpumask;
 
+#define get_cpu_current(cpu) \
+    (get_cpu_info_from_stack((unsigned long)stack_base[cpu])->current_vcpu)
+
 #endif /* !__ASSEMBLY__ */
 
 #endif
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
index e93fe9f3be..a1aefa2a25 100644
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -39,6 +39,8 @@ struct sched_resource {
     spinlock_t         *schedule_lock,
                        _lock;
     struct sched_unit  *curr;           /* current task                    */
+    struct sched_unit  *sched_unit_idle;
+    struct sched_unit  *prev;           /* previous task                   */
     void               *sched_priv;
     struct timer        s_timer;        /* scheduling timer                */
     atomic_t            urgent_count;   /* how many urgent vcpus           */
@@ -168,7 +170,7 @@ static inline void sched_clear_pause_flags_atomic(struct sched_unit *unit,
 
 static inline struct sched_unit *sched_idle_unit(unsigned int cpu)
 {
-    return idle_vcpu[cpu]->sched_unit;
+    return get_sched_res(cpu)->sched_unit_idle;
 }
 
 static inline unsigned int sched_get_resource_cpu(unsigned int cpu)
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index af4c934e0b..0581c7d44f 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -921,6 +921,7 @@ int vcpu_pin_override(struct vcpu *v, int cpu);
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
 uint64_t get_cpu_idle_time(unsigned int cpu);
 int sched_has_urgent_vcpu(void);
+void sched_guest_idle(void (*idle) (void), unsigned int cpu);
 
 /*
  * Used by idle loop to decide whether there is work to do:
-- 
2.16.4


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-07-03  7:57 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-02 15:01 [Xen-devel] [PATCH 44/60] xen/sched: add fall back to idle vcpu when scheduling unit Jan Beulich
2019-07-02 16:22 ` Andrew Cooper
2019-07-03  7:13 ` Juergen Gross
2019-07-03  7:54   ` Jan Beulich
  -- strict thread matches above, loose matches on Subject: below --
2019-05-28 10:32 [PATCH 00/60] xen: add core scheduling support Juergen Gross
2019-05-28 10:32 ` [Xen-devel] [PATCH 44/60] xen/sched: add fall back to idle vcpu when scheduling unit Juergen Gross
2019-05-28 10:32   ` Juergen Gross

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.