From: Jan Beulich <jbeulich@suse.com>
To: Roger Pau Monne <roger.pau@citrix.com>
Cc: xen-devel@lists.xenproject.org
Subject: Re: [PATCH v2] vpci/msix: handle accesses adjacent to the MSI-X table
Date: Mon, 20 Mar 2023 13:08:48 +0100
Message-ID: <aa9bf8f2-5ff1-27b0-bc41-b7b639648e85@suse.com>
In-Reply-To: <20230316120710.38817-1-roger.pau@citrix.com>

On 16.03.2023 13:07, Roger Pau Monne wrote:
> --- a/xen/drivers/vpci/msix.c
> +++ b/xen/drivers/vpci/msix.c
> @@ -27,6 +27,11 @@
>      ((addr) >= vmsix_table_addr(vpci, nr) &&                              \
>       (addr) < vmsix_table_addr(vpci, nr) + vmsix_table_size(vpci, nr))
>  
> +#define VMSIX_ADDR_SAME_PAGE(addr, vpci, nr)                              \
> +    (PFN_DOWN(addr) >= PFN_DOWN(vmsix_table_addr(vpci, nr)) &&            \
> +     PFN_DOWN((addr)) < PFN_UP(vmsix_table_addr(vpci, nr) +               \
> +                               vmsix_table_size(vpci, nr) - 1))

Looks like this would be more in line with get_slot() (and slightly
cheaper) if the 2nd comparison was ... <= PFN_DOWN().
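
I.e. something along the lines of (untested):

    #define VMSIX_ADDR_SAME_PAGE(addr, vpci, nr)                              \
        (PFN_DOWN(addr) >= PFN_DOWN(vmsix_table_addr(vpci, nr)) &&            \
         PFN_DOWN(addr) <= PFN_DOWN(vmsix_table_addr(vpci, nr) +              \
                                    vmsix_table_size(vpci, nr) - 1))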

> @@ -149,7 +154,7 @@ static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr)
>  
>          for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
>              if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled &&
> -                 VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) )
> +                 VMSIX_ADDR_SAME_PAGE(addr, msix->pdev->vpci, i) )
>                  return msix;
>      }
>  
> @@ -182,93 +187,201 @@ static struct vpci_msix_entry *get_entry(struct vpci_msix *msix,
>      return &msix->entries[(addr - start) / PCI_MSIX_ENTRY_SIZE];
>  }
>  
> -static void __iomem *get_pba(struct vpci *vpci)
> +static void __iomem *get_table(struct vpci *vpci, unsigned int slot)
>  {
>      struct vpci_msix *msix = vpci->msix;
>      /*
> -     * PBA will only be unmapped when the device is deassigned, so access it
> -     * without holding the vpci lock.
> +     * Regions will only be unmapped when the device is deassigned, so access
> +     * them without holding the vpci lock.
>       */
> -    void __iomem *pba = read_atomic(&msix->pba);
> +    void __iomem *table = read_atomic(&msix->table[slot]);
> +    paddr_t addr = 0;
>  
> -    if ( likely(pba) )
> -        return pba;
> +    if ( likely(table) )
> +        return table;
>  
> -    pba = ioremap(vmsix_table_addr(vpci, VPCI_MSIX_PBA),
> -                  vmsix_table_size(vpci, VPCI_MSIX_PBA));
> -    if ( !pba )
> -        return read_atomic(&msix->pba);
> +    switch ( slot )
> +    {
> +    case VPCI_MSIX_TBL_TAIL:
> +        addr = vmsix_table_size(vpci, VPCI_MSIX_TABLE);
> +        fallthrough;
> +    case VPCI_MSIX_TBL_HEAD:
> +        addr += vmsix_table_addr(vpci, VPCI_MSIX_TABLE);
> +        break;
> +
> +    case VPCI_MSIX_PBA_TAIL:
> +        addr = vmsix_table_size(vpci, VPCI_MSIX_PBA);
> +        fallthrough;
> +    case VPCI_MSIX_PBA_HEAD:
> +        addr += vmsix_table_addr(vpci, VPCI_MSIX_PBA);
> +        break;

Hmm, wasn't the plan to stop special-casing the PBA, including its
special treatment wrt the p2m? Reading on I realize you need this for
the (future) DomU case (to enforce r/o-ness), albeit having looked at
the spec again the other day I'm no longer really convinced we need to
squash writes. But shouldn't we be able to avoid the extra overhead
for Dom0? (Granted it may make sense to leave that to a separate
patch, if we want to keep the DomU handling despite not presently
needing it.)

> +    }
> +
> +    table = ioremap(round_pgdown(addr), PAGE_SIZE);
> +    if ( !table )
> +        return read_atomic(&msix->table[slot]);
>  
>      spin_lock(&vpci->lock);
> -    if ( !msix->pba )
> +    if ( !msix->table[slot] )
>      {
> -        write_atomic(&msix->pba, pba);
> +        write_atomic(&msix->table[slot], table);
>          spin_unlock(&vpci->lock);
>      }
>      else
>      {
>          spin_unlock(&vpci->lock);
> -        iounmap(pba);
> +        iounmap(table);
>      }
>  
> -    return read_atomic(&msix->pba);
> +    return read_atomic(&msix->table[slot]);
>  }
>  
> -static int cf_check msix_read(
> -    struct vcpu *v, unsigned long addr, unsigned int len, unsigned long *data)
> +unsigned int get_slot(const struct vpci *vpci, unsigned long addr)
>  {
> -    const struct domain *d = v->domain;
> -    struct vpci_msix *msix = msix_find(d, addr);
> -    const struct vpci_msix_entry *entry;
> -    unsigned int offset;
> +    unsigned long pfn = PFN_DOWN(addr);
>  
> -    *data = ~0ul;
> +    /*
> +     * The logic below relies on the tables being identity mapped to the
> +     * guest address space, or on the `addr` parameter having been
> +     * translated into its host physical address equivalent.
> +     */
>  
> -    if ( !msix )
> -        return X86EMUL_RETRY;
> +    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_TABLE)) )
> +        return VPCI_MSIX_TBL_HEAD;
> +    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_TABLE) +
> +                         vmsix_table_size(vpci, VPCI_MSIX_TABLE) - 1) )
> +        return VPCI_MSIX_TBL_TAIL;
> +    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_PBA)) )
> +        return VPCI_MSIX_PBA_HEAD;
> +    if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_PBA) +
> +                         vmsix_table_size(vpci, VPCI_MSIX_PBA) - 1) )
> +        return VPCI_MSIX_PBA_TAIL;
> +
> +    ASSERT_UNREACHABLE();
> +    return -1;
> +}
>  
> -    if ( !access_allowed(msix->pdev, addr, len) )
> -        return X86EMUL_OKAY;
> +static bool adjacent_handle(const struct vpci_msix *msix, unsigned long addr)
> +{
> +    unsigned int i;
>  
>      if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) )
> +        return true;
> +
> +    if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_TABLE) )
> +        return false;
> +
> +    for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
> +        if ( VMSIX_ADDR_SAME_PAGE(addr, msix->pdev->vpci, i) )
> +            return true;
> +
> +    return false;
> +}
> +
> +static int adjacent_read(const struct domain *d, const struct vpci_msix *msix,
> +                         unsigned long addr, unsigned int len,
> +                         unsigned long *data)
> +{
> +    const void __iomem *mem;
> +    struct vpci *vpci = msix->pdev->vpci;
> +    unsigned int slot;
> +
> +    *data = ~0ul;
> +
> +    if ( !adjacent_handle(msix, addr + len - 1) )
> +        return X86EMUL_OKAY;
> +
> +    if ( addr & (len - 1) )

unlikely()?
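
I.e.

    if ( unlikely(addr & (len - 1)) )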

>      {
> -        struct vpci *vpci = msix->pdev->vpci;
> -        unsigned int idx = addr - vmsix_table_addr(vpci, VPCI_MSIX_PBA);
> -        const void __iomem *pba = get_pba(vpci);
> +        unsigned int i;
> +
> +        gprintk(XENLOG_DEBUG, "%pp: unaligned read to MSI-X related page\n",
> +                &msix->pdev->sbdf);
>  
>          /*
> -         * Access to PBA.
> +         * Split unaligned accesses into byte-sized ones. This shouldn't
> +         * happen in the first place, but then devices also shouldn't have
> +         * registers in the same 4K page as the MSI-X tables.
>           *
> -         * TODO: note that this relies on having the PBA identity mapped to the
> -         * guest address space. If this changes the address will need to be
> -         * translated.
> +         * It's unclear whether this could cause issues for a guest that
> +         * expects registers to be accessed atomically; such a guest had
> +         * better use aligned accesses.
>           */
> -        if ( !pba )
> -        {
> -            gprintk(XENLOG_WARNING,
> -                    "%pp: unable to map MSI-X PBA, report all pending\n",
> -                    &msix->pdev->sbdf);
> -            return X86EMUL_OKAY;
> -        }
>  
> -        switch ( len )
> +        for ( i = 0; i < len; i++ )
>          {
> -        case 4:
> -            *data = readl(pba + idx);
> -            break;
> +            unsigned long partial = ~0ul;

Pointless initializer: ~0ul is written first thing in the function,
i.e. also in the recursive invocation. Then again, that setting is
itself redundant with msix_read()'s. So I guess the initializer wants
to stay, while the setting at the top of the function can be dropped.
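
That is, keep

    unsigned long partial = ~0ul;

but drop the

    *data = ~0ul;

near the top of adjacent_read().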

> +            int rc = adjacent_read(d, msix, addr + i, 1, &partial);
>  
> -        case 8:
> -            *data = readq(pba + idx);
> -            break;
> +            if ( rc != X86EMUL_OKAY )
> +                return rc;
>  
> -        default:
> -            ASSERT_UNREACHABLE();
> -            break;
> +            *data &= ~(0xfful << (i * 8));
> +            *data |= ((uint8_t)partial) << (i * 8);

This is UB for i >= 4; you'd need to cast back to unsigned long, at which
point I think the cast-free variant of masking by 0xff is to be preferred.
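
I.e. perhaps simply (untested)

    *data |= (partial & 0xfful) << (i * 8);

which keeps the whole expression in unsigned long.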

>          }
>  
>          return X86EMUL_OKAY;
>      }
>  
> +    slot = get_slot(vpci, addr);
> +    if ( slot >= ARRAY_SIZE(msix->table) )
> +        return X86EMUL_OKAY;
> +
> +    mem = get_table(vpci, slot);
> +    if ( !mem )
> +    {
> +        gprintk(XENLOG_WARNING,
> +                "%pp: unable to map MSI-X page, returning all bits set\n",
> +                &msix->pdev->sbdf);
> +        return X86EMUL_OKAY;
> +    }

Could this be moved ahead of the unaligned handling, so there would be
(commonly) only one such log message even for accesses we split?
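
E.g. (roughly, and untested):

    slot = get_slot(vpci, addr);
    if ( slot >= ARRAY_SIZE(msix->table) )
        return X86EMUL_OKAY;

    mem = get_table(vpci, slot);
    if ( !mem )
    {
        gprintk(XENLOG_WARNING,
                "%pp: unable to map MSI-X page, returning all bits set\n",
                &msix->pdev->sbdf);
        return X86EMUL_OKAY;
    }

    if ( unlikely(addr & (len - 1)) )
    {
        /* Byte-wise splitting as before. */
    }

That way a mapping failure would be logged just once, ahead of any
splitting, while the recursive byte-wise invocations would find the
page already mapped (and hence remain silent).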

Jan

