Re: [PATCH v4 08/12] x86/vpt: switch interrupt injection model

From: Jan Beulich <jbeulich@suse.com>
To: Roger Pau Monne <roger.pau@citrix.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>, Wei Liu <wl@xen.org>,
	Jun Nakajima <jun.nakajima@intel.com>,
	Kevin Tian <kevin.tian@intel.com>,
	xen-devel@lists.xenproject.org
Subject: Re: [PATCH v4 08/12] x86/vpt: switch interrupt injection model
Date: Tue, 4 May 2021 13:00:39 +0200	[thread overview]
Message-ID: <b2e83796-ea71-ce71-4fc4-2bf1fc3bc3dc@suse.com> (raw)
In-Reply-To: <20210420140723.65321-9-roger.pau@citrix.com>

On 20.04.2021 16:07, Roger Pau Monne wrote:
> @@ -295,188 +248,153 @@ static void pt_irq_fired(struct vcpu *v, struct periodic_time *pt)
>              list_del(&pt->list);
>          pt->on_list = false;
>          pt->pending_intr_nr = 0;
> +
> +        return;
>      }
> -    else if ( mode_is(v->domain, one_missed_tick_pending) ||
> -              mode_is(v->domain, no_missed_ticks_pending) )
> -    {
> -        pt->last_plt_gtime = hvm_get_guest_time(v);
> -        pt_process_missed_ticks(pt);
> -        pt->pending_intr_nr = 0; /* 'collapse' all missed ticks */
> -        set_timer(&pt->timer, pt->scheduled);
> -    }
> -    else
> +
> +    pt_process_missed_ticks(pt);
> +    /* 'collapse' missed ticks according to the selected mode. */
> +    switch ( pt->vcpu->domain->arch.hvm.params[HVM_PARAM_TIMER_MODE] )
>      {
> -        pt->last_plt_gtime += pt->period;
> -        if ( --pt->pending_intr_nr == 0 )
> -        {
> -            pt_process_missed_ticks(pt);
> -            if ( pt->pending_intr_nr == 0 )
> -                set_timer(&pt->timer, pt->scheduled);
> -        }
> +    case HVMPTM_one_missed_tick_pending:
> +        pt->pending_intr_nr = min(pt->pending_intr_nr, 1u);
> +        break;
> +
> +    case HVMPTM_no_missed_ticks_pending:
> +        pt->pending_intr_nr = 0;
> +        break;
>      }
>  
> -    if ( mode_is(v->domain, delay_for_missed_ticks) &&
> -         (hvm_get_guest_time(v) < pt->last_plt_gtime) )
> -        hvm_set_guest_time(v, pt->last_plt_gtime);
> +    if ( !pt->pending_intr_nr )
> +        set_timer(&pt->timer, pt->scheduled);
>  }
>  
> -int pt_update_irq(struct vcpu *v)
> +static void pt_timer_fn(void *data)
>  {
> -    struct list_head *head = &v->arch.hvm.tm_list;
> -    struct periodic_time *pt, *temp, *earliest_pt;
> -    uint64_t max_lag;
> -    int irq, pt_vector = -1;
> -    bool level;
> +    struct periodic_time *pt = data;
> +    struct vcpu *v;
> +    time_cb *cb = NULL;
> +    void *cb_priv;
> +    unsigned int irq;
>  
> -    pt_vcpu_lock(v);
> +    pt_lock(pt);
>  
> -    earliest_pt = NULL;
> -    max_lag = -1ULL;
> -    list_for_each_entry_safe ( pt, temp, head, list )
> +    v = pt->vcpu;
> +    irq = pt->irq;
> +
> +    if ( inject_interrupt(pt) )
>      {
> -        if ( pt->pending_intr_nr )
> -        {
> -            if ( pt_irq_masked(pt) &&
> -                 /* Level interrupts should be asserted even if masked. */
> -                 !pt->level )
> -            {
> -                /* suspend timer emulation */
> -                list_del(&pt->list);
> -                pt->on_list = 0;
> -            }
> -            else
> -            {
> -                if ( (pt->last_plt_gtime + pt->period) < max_lag )
> -                {
> -                    max_lag = pt->last_plt_gtime + pt->period;
> -                    earliest_pt = pt;
> -                }
> -            }
> -        }
> +        pt->scheduled += pt->period;
> +        pt->do_not_freeze = 0;

Nit: "false" please.

> +        cb = pt->cb;
> +        cb_priv = pt->priv;
>      }
> -
> -    if ( earliest_pt == NULL )
> +    else
>      {
> -        pt_vcpu_unlock(v);
> -        return -1;
> +        /* Masked. */
> +        if ( pt->on_list )
> +            list_del(&pt->list);
> +        pt->on_list = false;
> +        pt->pending_intr_nr++;
>      }

inject_interrupt() returns whether it was able to deliver the interrupt.
This in particular fails if the interrupt was masked and is edge
triggered. This, unexpectedly to me, reports success if a level triggered
interrupt was already pending. But in either event, the missed ticks
accounting is, as per my understanding of the comment in hvm/params.h,
supposed to be dealing with missing delivery due to preemption only. An
interrupt being masked / already pending may not be in this state due to
the guest having got preempted, though. A guest keeping a timer interrupt
masked for an extended period of time should not get a flood of
interrupts later on, no matter what HVM_PARAM_TIMER_MODE is set to.

However, I'm not going to exclude that little bit of doc is wrong, or
implementation and doc aren't in agreement already before your change.

> -    earliest_pt->irq_issued = 1;

This looks to be the only place where the field gets set to non-zero.
If the field is unused after this change, it wants deleting. I notice
patch 11 does so, but it may be worthwhile pointing out
- in the description here, that field removal will happen later,
- in the later patch, that this field was already unused (and doesn't
  become dead by the other removal done there).

> -    irq = earliest_pt->irq;
> -    level = earliest_pt->level;
> +    pt_unlock(pt);
>  
> -    pt_vcpu_unlock(v);
> +    if ( cb )
> +        cb(v, cb_priv);
> +}
>  
> -    switch ( earliest_pt->source )
> -    {
> -    case PTSRC_lapic:
> -        /*
> -         * If periodic timer interrupt is handled by lapic, its vector in
> -         * IRR is returned and used to set eoi_exit_bitmap for virtual
> -         * interrupt delivery case. Otherwise return -1 to do nothing.
> -         */
> -        vlapic_set_irq(vcpu_vlapic(v), irq, 0);
> -        pt_vector = irq;
> -        break;
> +static void eoi_callback(struct periodic_time *pt)
> +{
> +    struct vcpu *v = NULL;
> +    time_cb *cb = NULL;
> +    void *cb_priv = NULL;
>  
> -    case PTSRC_isa:
> -        hvm_isa_irq_deassert(v->domain, irq);
> -        if ( platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) &&
> -             v->domain->arch.hvm.vpic[irq >> 3].int_output )
> -            hvm_isa_irq_assert(v->domain, irq, NULL);
> -        else
> +    pt_lock(pt);
> +
> +    irq_eoi(pt);
> +    if ( pt->pending_intr_nr )
> +    {
> +        if ( inject_interrupt(pt) )
>          {
> -            pt_vector = hvm_isa_irq_assert(v->domain, irq, vioapic_get_vector);
> -            /*
> -             * hvm_isa_irq_assert may not set the corresponding bit in vIRR
> -             * when mask field of IOAPIC RTE is set. Check it again.
> -             */
> -            if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
> -                pt_vector = -1;
> +            pt->pending_intr_nr--;
> +            cb = pt->cb;
> +            cb_priv = pt->priv;
> +            v = pt->vcpu;
>          }
> -        break;
> -
> -    case PTSRC_ioapic:
> -        pt_vector = hvm_ioapic_assert(v->domain, irq, level);
> -        if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
> +        else
>          {
> -            pt_vector = -1;
> -            if ( level )
> -            {
> -                /*
> -                 * Level interrupts are always asserted because the pin assert
> -                 * count is incremented regardless of whether the pin is masked
> -                 * or the vector latched in IRR, so also execute the callback
> -                 * associated with the timer.
> -                 */
> -                time_cb *cb = NULL;
> -                void *cb_priv = NULL;
> -
> -                pt_vcpu_lock(v);
> -                /* Make sure the timer is still on the list. */
> -                list_for_each_entry ( pt, &v->arch.hvm.tm_list, list )
> -                    if ( pt == earliest_pt )
> -                    {
> -                        pt_irq_fired(v, pt);
> -                        cb = pt->cb;
> -                        cb_priv = pt->priv;
> -                        break;
> -                    }
> -                pt_vcpu_unlock(v);
> -
> -                if ( cb != NULL )
> -                    cb(v, cb_priv);
> -            }
> +            /* Masked. */
> +            if ( pt->on_list )
> +                list_del(&pt->list);
> +            pt->on_list = false;
>          }
> -        break;
>      }
>  
> -    return pt_vector;
> +    pt_unlock(pt);
> +
> +    if ( cb )
> +        cb(v, cb_priv);
>  }
>  
> -static struct periodic_time *is_pt_irq(
> -    struct vcpu *v, struct hvm_intack intack)
> +static void vlapic_eoi_callback(struct vcpu *unused, unsigned int unused2,
> +                                void *data)
>  {
> -    struct list_head *head = &v->arch.hvm.tm_list;
> -    struct periodic_time *pt;
> -
> -    list_for_each_entry ( pt, head, list )
> -    {
> -        if ( pt->pending_intr_nr && pt->irq_issued &&
> -             (intack.vector == pt_irq_vector(pt, intack.source)) )
> -            return pt;
> -    }
> +    eoi_callback(data);
> +}
>  
> -    return NULL;
> +static void vioapic_eoi_callback(struct domain *unused, unsigned int unused2,
> +                                 void *data)
> +{
> +    eoi_callback(data);
>  }
>  
> -void pt_intr_post(struct vcpu *v, struct hvm_intack intack)
> +static bool inject_interrupt(struct periodic_time *pt)
>  {
> -    struct periodic_time *pt;
> -    time_cb *cb;
> -    void *cb_priv;
> +    struct vcpu *v = pt->vcpu;
> +    struct domain *d = v->domain;
> +    unsigned int irq = pt->irq;
>  
> -    if ( intack.source == hvm_intsrc_vector )
> -        return;
> +    /* Level interrupts should be asserted even if masked. */
> +    if ( pt_irq_masked(pt) && !pt->level )
> +        return false;
>  
> -    pt_vcpu_lock(v);
> -
> -    pt = is_pt_irq(v, intack);
> -    if ( pt == NULL )
> +    switch ( pt->source )
>      {
> -        pt_vcpu_unlock(v);
> -        return;
> +    case PTSRC_lapic:
> +        vlapic_set_irq_callback(vcpu_vlapic(v), pt->irq, 0, vlapic_eoi_callback,
> +                                pt);
> +        break;
> +
> +    case PTSRC_isa:
> +        hvm_isa_irq_deassert(d, irq);
> +        hvm_isa_irq_assert(d, irq, NULL);
> +        break;
> +
> +    case PTSRC_ioapic:
> +        hvm_ioapic_assert(d, irq, pt->level);
> +        break;
>      }

Why do ISA IRQs get de-asserted first, but IO-APIC ones don't? I
notice e.g. hvm_set_callback_irq_level() and hvm_set_pci_link_route()
have similar apparent asymmetries, so I guess I'm missing something.
In particular I can't spot - even prior to this change - where
hvm_irq->gsi_assert_count[gsi] would get decremented for a level
triggered IRQ, when hvm_ioapic_deassert() gets called only from
hvm/hpet.c:hpet_write(). I guess the main point is that that's the
only case of a level triggered timer interrupt for us?

> @@ -641,20 +590,29 @@ void pt_adjust_global_vcpu_target(struct vcpu *v)
>      write_unlock(&pl_time->vhpet.lock);
>  }
>  
> -
>  static void pt_resume(struct periodic_time *pt)
>  {
> +    struct vcpu *v;
> +    time_cb *cb = NULL;
> +    void *cb_priv;
> +
>      if ( pt->vcpu == NULL )
>          return;
>  
>      pt_lock(pt);
> -    if ( pt->pending_intr_nr && !pt->on_list )
> +    if ( pt->pending_intr_nr && !pt->on_list && inject_interrupt(pt) )
>      {
> +        pt->pending_intr_nr--;
> +        cb = pt->cb;
> +        cb_priv = pt->priv;
> +        v = pt->vcpu;
>          pt->on_list = 1;
>          list_add(&pt->list, &pt->vcpu->arch.hvm.tm_list);
> -        vcpu_kick(pt->vcpu);

Just for my own understanding: The replacement of this is what happens
down the call tree from inject_interrupt()?

Jan