All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v8 07/11] vpci/bars: add handlers to map the BARs
       [not found] ` <5A905C0902000078001AB15D@prv-mh.provo.novell.com>
@ 2018-02-26 11:20   ` Jan Beulich
  2018-02-26 18:00     ` Roger Pau Monné
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2018-02-26 11:20 UTC (permalink / raw)
  To: Roger Pau Monne
  Cc: Stefano Stabellini, Wei Liu, George Dunlap, AndrewCooper,
	Ian Jackson, TimDeegan, Paul Durrant, xen-devel, Boris Ostrovsky

(re-sending with xen-devel re-added; not sure how it got lost)

>>> On 23.01.18 at 16:07, <roger.pau@citrix.com> wrote:
> ---
>  tools/tests/vpci/emul.h   |   1 +
>  xen/arch/x86/hvm/ioreq.c  |   4 +

Again the Cc to Paul is missing (no matter that it's just a tiny change).

> +static int map_range(unsigned long s, unsigned long e, void *data,
> +                     unsigned long *c)
> +{
> +    const struct map_data *map = data;
> +    int rc;
> +
> +    for ( ; ; )
> +    {
> +        unsigned long size = e - s + 1;

Considering the lack of a condition in the for() and the inclusiveness
of the range (which means you can't express an empty range) I don't
understand how ...

> +        /*
> +         * ARM TODOs:
> +         * - On ARM whether the memory is prefetchable or not should be passed
> +         *   to map_mmio_regions in order to decide which memory attributes
> +         *   should be used.
> +         *
> +         * - {un}map_mmio_regions doesn't support preemption, hence the bodge
> +         *   below in order to limit the amount of mappings to 64 pages for
> +         *   each function call.
> +         */
> +
> +#ifdef CONFIG_ARM
> +        size = min(64ul, size);
> +#endif
> +
> +        rc = (map->map ? map_mmio_regions : unmap_mmio_regions)
> +             (map->d, _gfn(s), size, _mfn(s));
> +        if ( rc == 0 )
> +        {
> +            *c += size;
> +#ifdef CONFIG_ARM
> +            rc = -ERESTART;
> +#endif
> +            break;

... this works in the ARM case. If the whole thing doesn't work (and
doesn't get built) for ARM right now, make this obvious by one or more
#error directives.

> +static void modify_decoding(const struct pci_dev *pdev, bool map, bool rom)
> +{
> +    struct vpci_header *header = &pdev->vpci->header;
> +    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> +    unsigned int i;
> +
> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    {
> +        if ( rom && header->bars[i].type == VPCI_BAR_ROM )
> +        {
> +            unsigned int rom_pos = (i == 6) ? PCI_ROM_ADDRESS

I probably should have mentioned this earlier, but I'm really unhappy
about such literal "magic" numbers. Please use a suitable expression
or #define.

> +bool vpci_process_pending(struct vcpu *v)
> +{
> +    while ( v->vpci.mem )
> +    {
> +        struct map_data data = {
> +            .d = v->domain,
> +            .map = v->vpci.map,
> +        };
> +
> +        switch ( rangeset_consume_ranges(v->vpci.mem, map_range, &data) )
> +        {
> +        case -ERESTART:
> +            return true;
> +
> +        default:
> +            if ( v->vpci.map )
> +            {
> +                spin_lock(&v->vpci.pdev->vpci->lock);
> +                modify_decoding(v->vpci.pdev, v->vpci.map, v->vpci.rom);
> +                spin_unlock(&v->vpci.pdev->vpci->lock);
> +            }
> +            /* fallthrough. */
> +        case -ENOMEM:

You carefully handle this error here.

> +static void maybe_defer_map(struct domain *d, const struct pci_dev *pdev,
> +                            struct rangeset *mem, bool map, bool rom)
> +{
> +    struct vcpu *curr = current;
> +
> +    if ( is_idle_vcpu(curr) )
> +    {
> +        struct map_data data = { .d = d, .map = true };
> +
> +        /*
> +         * Only used for domain construction in order to map the BARs
> +         * of devices with memory decoding enabled.
> +         */
> +        ASSERT(map && !rom);
> +        rangeset_consume_ranges(mem, map_range, &data);

What if this produces -ENOMEM? And despite having looked at
several revisions of this, I can't make the connection to why this
is in an is_idle_vcpu() conditional (neither the direct caller nor the
next level up make this obvious to me). There's clearly a need
for extending the comment.

> +static void modify_bars(const struct pci_dev *pdev, bool map, bool rom)
> +{
> +    struct vpci_header *header = &pdev->vpci->header;
> +    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
> +    const struct pci_dev *tmp;
> +    unsigned int i;
> +    int rc;
> +
> +    if ( !map )
> +        modify_decoding(pdev, false, rom);
> +
> +    if ( !mem )
> +        return;

Similarly here - why is it okay (or what effect will it have) to return
here when we're out of memory, but the caller won't know?

> +    /*
> +     * Create a rangeset that represents the current device BARs memory region
> +     * and compare it against all the currently active BAR memory regions. If
> +     * an overlap is found, subtract it from the region to be
> +     * mapped/unmapped.
> +     *
> +     * NB: the rangeset uses inclusive frame numbers.
> +     */
> +
> +    /*
> +     * First fill the rangeset with all the BARs of this device or with the ROM
> +     * BAR only, depending on whether the guest is toggling the memory decode
> +     * bit of the command register, or the enable bit of the ROM BAR register.
> +     */
> +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> +    {
> +        const struct vpci_bar *bar = &header->bars[i];
> +
> +        if ( !MAPPABLE_BAR(bar) ||
> +             (rom ? bar->type != VPCI_BAR_ROM
> +                  : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
> +            continue;

Why does rom_enabled matter for the !rom case rather than for
the rom one? I.e.

        if ( !MAPPABLE_BAR(bar) ||
             (rom ? bar->type != VPCI_BAR_ROM || !header->rom_enabled
                  : bar->type == VPCI_BAR_ROM) )
            continue;

?

> +        rc = rangeset_add_range(mem, PFN_DOWN(bar->addr),
> +                                PFN_UP(bar->addr + bar->size - 1));
> +        if ( rc )
> +        {
> +            printk(XENLOG_G_WARNING
> +                   "Failed to add [%" PRI_gfn ", %" PRI_gfn "): %d\n",
> +                   PFN_DOWN(bar->addr), PFN_UP(bar->addr + bar->size - 1),

Either you use [a,b) and don't subtract 1, or you use [a,b] with the
subtraction. Same below.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8 07/11] vpci/bars: add handlers to map the BARs
  2018-02-26 11:20   ` [PATCH v8 07/11] vpci/bars: add handlers to map the BARs Jan Beulich
@ 2018-02-26 18:00     ` Roger Pau Monné
  2018-02-27  8:32       ` Jan Beulich
  0 siblings, 1 reply; 12+ messages in thread
From: Roger Pau Monné @ 2018-02-26 18:00 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Stefano Stabellini, Wei Liu, George Dunlap, AndrewCooper,
	Ian Jackson, TimDeegan, Paul Durrant, xen-devel, Boris Ostrovsky

On Mon, Feb 26, 2018 at 04:20:17AM -0700, Jan Beulich wrote:
> (re-sending with xen-devel re-added; not sure how it got lost)
> 
> >>> On 23.01.18 at 16:07, <roger.pau@citrix.com> wrote:
> > ---
> >  tools/tests/vpci/emul.h   |   1 +
> >  xen/arch/x86/hvm/ioreq.c  |   4 +
> 
> Again the Cc to Paul is missing (no matter that it's just a tiny change).

Sorry, I will run get_maintainer.pl against the whole patchset before
sending a new version.

> > +static int map_range(unsigned long s, unsigned long e, void *data,
> > +                     unsigned long *c)
> > +{
> > +    const struct map_data *map = data;
> > +    int rc;
> > +
> > +    for ( ; ; )
> > +    {
> > +        unsigned long size = e - s + 1;
> 
> Considering the lack of a condition in the for() and the inclusiveness
> of the range (which means you can't express an empty range) I don't
> understand how ...
> 
> > +        /*
> > +         * ARM TODOs:
> > +         * - On ARM whether the memory is prefetchable or not should be passed
> > +         *   to map_mmio_regions in order to decide which memory attributes
> > +         *   should be used.
> > +         *
> > +         * - {un}map_mmio_regions doesn't support preemption, hence the bodge
> > +         *   below in order to limit the amount of mappings to 64 pages for
> > +         *   each function call.
> > +         */
> > +
> > +#ifdef CONFIG_ARM
> > +        size = min(64ul, size);
> > +#endif
> > +
> > +        rc = (map->map ? map_mmio_regions : unmap_mmio_regions)
> > +             (map->d, _gfn(s), size, _mfn(s));
> > +        if ( rc == 0 )
> > +        {
> > +            *c += size;
> > +#ifdef CONFIG_ARM
> > +            rc = -ERESTART;
> > +#endif
> > +            break;
> 
> ... this works in the ARM case. If the whole thing doesn't work (and
> doesn't get built) for ARM right now, make this obvious by one or more
> #error directives.

ARM will never iterate, instead it will rely on always returning
-ERESTART and letting the caller of rangeset_consume_ranges deal with
it. What's wrong here is that the call to rangeset_consume_ranges
should be:

while ( rangeset_consume_ranges(...) == -ERESTART );

But that makes the code even more convoluted than what it already is.
I've basically added the ARM part because Julien requested it, but I
think the right way to fix this is to unify the behaviour of
{un}map_mmio_regions between x86 and ARM.

I will drop the ARM chunks.

> > +static void modify_decoding(const struct pci_dev *pdev, bool map, bool rom)
> > +{
> > +    struct vpci_header *header = &pdev->vpci->header;
> > +    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> > +    unsigned int i;
> > +
> > +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> > +    {
> > +        if ( rom && header->bars[i].type == VPCI_BAR_ROM )
> > +        {
> > +            unsigned int rom_pos = (i == 6) ? PCI_ROM_ADDRESS
> 
> I probably should have mentioned this earlier, but I'm really unhappy
> about such literal "magic" numbers. Please use a suitable expression
> or #define.
> 
> > +bool vpci_process_pending(struct vcpu *v)
> > +{
> > +    while ( v->vpci.mem )
> > +    {
> > +        struct map_data data = {
> > +            .d = v->domain,
> > +            .map = v->vpci.map,
> > +        };
> > +
> > +        switch ( rangeset_consume_ranges(v->vpci.mem, map_range, &data) )
> > +        {
> > +        case -ERESTART:
> > +            return true;
> > +
> > +        default:
> > +            if ( v->vpci.map )
> > +            {
> > +                spin_lock(&v->vpci.pdev->vpci->lock);
> > +                modify_decoding(v->vpci.pdev, v->vpci.map, v->vpci.rom);
> > +                spin_unlock(&v->vpci.pdev->vpci->lock);
> > +            }
> > +            /* fallthrough. */
> > +        case -ENOMEM:
> 
> You carefully handle this error here.

On second thought, I'm not sure handling ENOMEM separately makes
sense. Unless you object I plan to remove this special casing.

> > +static void maybe_defer_map(struct domain *d, const struct pci_dev *pdev,
> > +                            struct rangeset *mem, bool map, bool rom)
> > +{
> > +    struct vcpu *curr = current;
> > +
> > +    if ( is_idle_vcpu(curr) )
> > +    {
> > +        struct map_data data = { .d = d, .map = true };
> > +
> > +        /*
> > +         * Only used for domain construction in order to map the BARs
> > +         * of devices with memory decoding enabled.
> > +         */
> > +        ASSERT(map && !rom);
> > +        rangeset_consume_ranges(mem, map_range, &data);
> 
> What if this produces -ENOMEM? And despite having looked at
> several revisions of this, I can't make the connection to why this
> is in an is_idle_vcpu() conditional (neither the direct caller nor the
> next level up make this obvious to me). There's clearly a need
> for extending the comment.

I thought the comment above that mentions domain construction would be
enough. I can try to expand this, maybe like:

"This function will only be called from the idle vCPU while building
the domain, in which case it's not possible to defer the operation
(like done in the else branch). Call rangeset_consume_ranges in order
to establish the mappings right away."

> > +static void modify_bars(const struct pci_dev *pdev, bool map, bool rom)
> > +{
> > +    struct vpci_header *header = &pdev->vpci->header;
> > +    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
> > +    const struct pci_dev *tmp;
> > +    unsigned int i;
> > +    int rc;
> > +
> > +    if ( !map )
> > +        modify_decoding(pdev, false, rom);
> > +
> > +    if ( !mem )
> > +        return;
> 
> Similarly here - why is it okay (or what effect will it have) to return
> here when we're out of memory, but the caller won't know?

The behaviour here depends on the change to the memory decoding bit:

 - Clearing: memory decoding on device will be disabled, BARs won't be
   unmapped.
 - Setting: no change to device memory decoding bit, BARs won't be
   mapped.

Do you think this is suitable? IMO it's fine to disable the memory
decoding bit on the device and leave the memory regions mapped.

> > +    /*
> > +     * Create a rangeset that represents the current device BARs memory region
> > +     * and compare it against all the currently active BAR memory regions. If
> > +     * an overlap is found, subtract it from the region to be
> > +     * mapped/unmapped.
> > +     *
> > +     * NB: the rangeset uses inclusive frame numbers.
> > +     */
> > +
> > +    /*
> > +     * First fill the rangeset with all the BARs of this device or with the ROM
> > +     * BAR only, depending on whether the guest is toggling the memory decode
> > +     * bit of the command register, or the enable bit of the ROM BAR register.
> > +     */
> > +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> > +    {
> > +        const struct vpci_bar *bar = &header->bars[i];
> > +
> > +        if ( !MAPPABLE_BAR(bar) ||
> > +             (rom ? bar->type != VPCI_BAR_ROM
> > +                  : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
> > +            continue;
> 
> Why does rom_enabled matter for the !rom case rather than for
> the rom one? I.e.
> 
>         if ( !MAPPABLE_BAR(bar) ||
>              (rom ? bar->type != VPCI_BAR_ROM || !header->rom_enabled
>                   : bar->type == VPCI_BAR_ROM) )
>             continue;
> 
> ?

No, for the ROM case we only want to map/unmap the ROM, so that's the
only thing added to the rangeset. For the non-ROM case Xen will also
map/unmap the ROM if the enable bit is set.

Your proposed code would always map/unmap the ROM into the p2m when
the memory decoding bit is changed even if it's not enabled.

> > +        rc = rangeset_add_range(mem, PFN_DOWN(bar->addr),
> > +                                PFN_UP(bar->addr + bar->size - 1));
> > +        if ( rc )
> > +        {
> > +            printk(XENLOG_G_WARNING
> > +                   "Failed to add [%" PRI_gfn ", %" PRI_gfn "): %d\n",
> > +                   PFN_DOWN(bar->addr), PFN_UP(bar->addr + bar->size - 1),
> 
> Either you use [a,b) and don't subtract 1, or you use [a,b] with the
> subtraction. Same below.

[a, b) seems better in order to avoid the -1.

Thanks, Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8 07/11] vpci/bars: add handlers to map the BARs
  2018-02-26 18:00     ` Roger Pau Monné
@ 2018-02-27  8:32       ` Jan Beulich
  2018-02-27  9:21         ` Roger Pau Monné
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2018-02-27  8:32 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Stefano Stabellini, Wei Liu, George Dunlap, AndrewCooper,
	Ian Jackson, TimDeegan, Paul Durrant, xen-devel, Boris Ostrovsky

>>> On 26.02.18 at 19:00, <roger.pau@citrix.com> wrote:
> On Mon, Feb 26, 2018 at 04:20:17AM -0700, Jan Beulich wrote:
>> >>> On 23.01.18 at 16:07, <roger.pau@citrix.com> wrote:
>> > +bool vpci_process_pending(struct vcpu *v)
>> > +{
>> > +    while ( v->vpci.mem )
>> > +    {
>> > +        struct map_data data = {
>> > +            .d = v->domain,
>> > +            .map = v->vpci.map,
>> > +        };
>> > +
>> > +        switch ( rangeset_consume_ranges(v->vpci.mem, map_range, &data) )
>> > +        {
>> > +        case -ERESTART:
>> > +            return true;
>> > +
>> > +        default:
>> > +            if ( v->vpci.map )
>> > +            {
>> > +                spin_lock(&v->vpci.pdev->vpci->lock);
>> > +                modify_decoding(v->vpci.pdev, v->vpci.map, v->vpci.rom);
>> > +                spin_unlock(&v->vpci.pdev->vpci->lock);
>> > +            }
>> > +            /* fallthrough. */
>> > +        case -ENOMEM:
>> 
>> You carefully handle this error here.
> 
> On second thought, I'm not sure handling ENOMEM separately makes
> sense. Unless you object I plan to remove this special casing.

Indeed I was never really happy with the special casing, but I
did recall this being done intentionally, so I also didn't complain.

>> > +static void maybe_defer_map(struct domain *d, const struct pci_dev *pdev,
>> > +                            struct rangeset *mem, bool map, bool rom)
>> > +{
>> > +    struct vcpu *curr = current;
>> > +
>> > +    if ( is_idle_vcpu(curr) )
>> > +    {
>> > +        struct map_data data = { .d = d, .map = true };
>> > +
>> > +        /*
>> > +         * Only used for domain construction in order to map the BARs
>> > +         * of devices with memory decoding enabled.
>> > +         */
>> > +        ASSERT(map && !rom);
>> > +        rangeset_consume_ranges(mem, map_range, &data);
>> 
>> What if this produces -ENOMEM? And despite having looked at
>> several revisions of this, I can't make the connection to why this
>> is in an is_idle_vcpu() conditional (neither the direct caller nor the
>> next level up make this obvious to me). There's clearly a need
>> for extending the comment.
> 
> I thought the comment above that mentions domain construction would be
> enough. I can try to expand this, maybe like:
> 
> "This function will only be called from the idle vCPU while building
> the domain, in which case it's not possible to defer the operation
> (like done in the else branch). Call rangeset_consume_ranges in order
> to establish the mappings right away."

And what again is the connection between is_idle_domain() and
domain construction? I think the comment belongs ahead of the
if(), and it needs to make that connection.

>> > +static void modify_bars(const struct pci_dev *pdev, bool map, bool rom)
>> > +{
>> > +    struct vpci_header *header = &pdev->vpci->header;
>> > +    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
>> > +    const struct pci_dev *tmp;
>> > +    unsigned int i;
>> > +    int rc;
>> > +
>> > +    if ( !map )
>> > +        modify_decoding(pdev, false, rom);
>> > +
>> > +    if ( !mem )
>> > +        return;
>> 
>> Similarly here - why is it okay (or what effect will it have) to return
>> here when we're out of memory, but the caller won't know?
> 
> The behaviour here depends on the change to the memory decoding bit:
> 
>  - Clearing: memory decoding on device will be disabled, BARs won't be
>    unmapped.
>  - Setting: no change to device memory decoding bit, BARs won't be
>    mapped.
> 
> Do you think this is suitable? IMO it's fine to disable the memory
> decoding bit on the device and leave the memory regions mapped.

As long as subsequent changes to the decoding bit can't leave
stale mappings. Plus there needs to be a comment to explain this.

>> > +    /*
>> > +     * Create a rangeset that represents the current device BARs memory region
>> > +     * and compare it against all the currently active BAR memory regions. If
>> > +     * an overlap is found, subtract it from the region to be
>> > +     * mapped/unmapped.
>> > +     *
>> > +     * NB: the rangeset uses inclusive frame numbers.
>> > +     */
>> > +
>> > +    /*
>> > +     * First fill the rangeset with all the BARs of this device or with the ROM
>> > +     * BAR only, depending on whether the guest is toggling the memory decode
>> > +     * bit of the command register, or the enable bit of the ROM BAR register.
>> > +     */
>> > +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> > +    {
>> > +        const struct vpci_bar *bar = &header->bars[i];
>> > +
>> > +        if ( !MAPPABLE_BAR(bar) ||
>> > +             (rom ? bar->type != VPCI_BAR_ROM
>> > +                  : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
>> > +            continue;
>> 
>> Why does rom_enabled matter for the !rom case rather than for
>> the rom one? I.e.
>> 
>>         if ( !MAPPABLE_BAR(bar) ||
>>              (rom ? bar->type != VPCI_BAR_ROM || !header->rom_enabled
>>                   : bar->type == VPCI_BAR_ROM) )
>>             continue;
>> 
>> ?
> 
> No, for the ROM case we only want to map/unmap the ROM, so that's the
> only thing added to the rangeset. For the non-ROM case Xen will also
> map/unmap the ROM if the enable bit is set.
> 
> Your proposed code would always map/unmap the ROM into the p2m when
> the memory decoding bit is changed even if it's not enabled.

I don't understand. Taking apart the conditional I've suggested,
and converting to human language:
- if the BAR is no mappable, continue
- if we want to deal with ROM (rom=true), if the BAR isn't ROM
  or isn't enabled, continue
- if we want to deal with non-ROM (rom=false), if the BAR is ROM,
  continue

To me this is in line with the 2nd paragraph of your reply. It's not
in line with the first, which makes me wonder whether "rom" is
misnamed and wants to be "rom_only". Still, the question would
remain of why rom_enabled doesn't matter when the variable is
true.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8 07/11] vpci/bars: add handlers to map the BARs
  2018-02-27  8:32       ` Jan Beulich
@ 2018-02-27  9:21         ` Roger Pau Monné
  2018-02-27 10:01           ` Jan Beulich
  0 siblings, 1 reply; 12+ messages in thread
From: Roger Pau Monné @ 2018-02-27  9:21 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Stefano Stabellini, Wei Liu, George Dunlap, AndrewCooper,
	Ian Jackson, TimDeegan, Paul Durrant, xen-devel, Boris Ostrovsky

On Tue, Feb 27, 2018 at 01:32:24AM -0700, Jan Beulich wrote:
> >>> On 26.02.18 at 19:00, <roger.pau@citrix.com> wrote:
> > On Mon, Feb 26, 2018 at 04:20:17AM -0700, Jan Beulich wrote:
> >> >>> On 23.01.18 at 16:07, <roger.pau@citrix.com> wrote:
> >> > +bool vpci_process_pending(struct vcpu *v)
> >> > +{
> >> > +    while ( v->vpci.mem )
> >> > +    {
> >> > +        struct map_data data = {
> >> > +            .d = v->domain,
> >> > +            .map = v->vpci.map,
> >> > +        };
> >> > +
> >> > +        switch ( rangeset_consume_ranges(v->vpci.mem, map_range, &data) )
> >> > +        {
> >> > +        case -ERESTART:
> >> > +            return true;
> >> > +
> >> > +        default:
> >> > +            if ( v->vpci.map )
> >> > +            {
> >> > +                spin_lock(&v->vpci.pdev->vpci->lock);
> >> > +                modify_decoding(v->vpci.pdev, v->vpci.map, v->vpci.rom);
> >> > +                spin_unlock(&v->vpci.pdev->vpci->lock);
> >> > +            }
> >> > +            /* fallthrough. */
> >> > +        case -ENOMEM:
> >> 
> >> You carefully handle this error here.
> > 
> > On second thought, I'm not sure handling ENOMEM separately makes
> > sense. Unless you object I plan to remove this special casing.
> 
> Indeed I was never really happy with the special casing, but I
> did recall this being done intentionally, so I also didn't complain.

IIRC we had some discussion about the different error codes and
somehow decided that ENOMEM might be worse than others, in the sense
that less memory would be mapped and the chances of the device
actually working would be slim. IMO we can always special case errors
later, let's start with something simpler.

> >> > +static void maybe_defer_map(struct domain *d, const struct pci_dev *pdev,
> >> > +                            struct rangeset *mem, bool map, bool rom)
> >> > +{
> >> > +    struct vcpu *curr = current;
> >> > +
> >> > +    if ( is_idle_vcpu(curr) )
> >> > +    {
> >> > +        struct map_data data = { .d = d, .map = true };
> >> > +
> >> > +        /*
> >> > +         * Only used for domain construction in order to map the BARs
> >> > +         * of devices with memory decoding enabled.
> >> > +         */
> >> > +        ASSERT(map && !rom);
> >> > +        rangeset_consume_ranges(mem, map_range, &data);
> >> 
> >> What if this produces -ENOMEM? And despite having looked at
> >> several revisions of this, I can't make the connection to why this
> >> is in an is_idle_vcpu() conditional (neither the direct caller nor the
> >> next level up make this obvious to me). There's clearly a need
> >> for extending the comment.
> > 
> > I thought the comment above that mentions domain construction would be
> > enough. I can try to expand this, maybe like:
> > 
> > "This function will only be called from the idle vCPU while building
> > the domain, in which case it's not possible to defer the operation
> > (like done in the else branch). Call rangeset_consume_ranges in order
> > to establish the mappings right away."
> 
> And what again is the connection between is_idle_domain() and
> domain construction? I think the comment belongs ahead of the
> if(), and it needs to make that connection.

Oh, domain constructions runs on the idle vCPU, that's the relation.

"This function will only be called from the idle vCPU while building
the domain (because Dom0 building runs on the idle vCPU), in which
case it's not possible to defer the operation (like done in the else
branch). Call rangeset_consume_ranges in order to establish the
mappings right away."

Does that seem clearer if placed ahead of the if?

> >> > +static void modify_bars(const struct pci_dev *pdev, bool map, bool rom)
> >> > +{
> >> > +    struct vpci_header *header = &pdev->vpci->header;
> >> > +    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
> >> > +    const struct pci_dev *tmp;
> >> > +    unsigned int i;
> >> > +    int rc;
> >> > +
> >> > +    if ( !map )
> >> > +        modify_decoding(pdev, false, rom);
> >> > +
> >> > +    if ( !mem )
> >> > +        return;
> >> 
> >> Similarly here - why is it okay (or what effect will it have) to return
> >> here when we're out of memory, but the caller won't know?
> > 
> > The behaviour here depends on the change to the memory decoding bit:
> > 
> >  - Clearing: memory decoding on device will be disabled, BARs won't be
> >    unmapped.
> >  - Setting: no change to device memory decoding bit, BARs won't be
> >    mapped.
> > 
> > Do you think this is suitable? IMO it's fine to disable the memory
> > decoding bit on the device and leave the memory regions mapped.
> 
> As long as subsequent changes to the decoding bit can't leave
> stale mappings. Plus there needs to be a comment to explain this.

With the current approach in the unmap case there will be stale
mappings left behind.

I guess it's better then to not modify the memory decoding bit at all
until the operation finishes. That also rises the question of whether
the memory decoding bit should be modified if p2m mapping/unmapping
reports an error.

Should we also attempt to rollback failed map/unmap operations? What
happens if the rollback also fails?

What about the following:

    +---------+   SUCCESS  +---------------------------------+
    |map/unmap+------------>Change decoding or ROM enable bit|
    +----+----+            +---------------------------------+
         |
         |FAILURE
         |
+--------v-------+ SUCCESS +---------------------------------------+
|revert operation+--------->No change to decoding or ROM enable bit|
+--------+-------+         +---------------------------------------+
         |
         |FAILURE
         |
   +-----v-----+
   |Kill domain|
   +-----------+

> >> > +    /*
> >> > +     * Create a rangeset that represents the current device BARs memory region
> >> > +     * and compare it against all the currently active BAR memory regions. If
> >> > +     * an overlap is found, subtract it from the region to be
> >> > +     * mapped/unmapped.
> >> > +     *
> >> > +     * NB: the rangeset uses inclusive frame numbers.
> >> > +     */
> >> > +
> >> > +    /*
> >> > +     * First fill the rangeset with all the BARs of this device or with the ROM
> >> > +     * BAR only, depending on whether the guest is toggling the memory decode
> >> > +     * bit of the command register, or the enable bit of the ROM BAR register.
> >> > +     */
> >> > +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> >> > +    {
> >> > +        const struct vpci_bar *bar = &header->bars[i];
> >> > +
> >> > +        if ( !MAPPABLE_BAR(bar) ||
> >> > +             (rom ? bar->type != VPCI_BAR_ROM
> >> > +                  : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
> >> > +            continue;
> >> 
> >> Why does rom_enabled matter for the !rom case rather than for
> >> the rom one? I.e.
> >> 
> >>         if ( !MAPPABLE_BAR(bar) ||
> >>              (rom ? bar->type != VPCI_BAR_ROM || !header->rom_enabled
> >>                   : bar->type == VPCI_BAR_ROM) )
> >>             continue;
> >> 
> >> ?
> > 
> > No, for the ROM case we only want to map/unmap the ROM, so that's the
> > only thing added to the rangeset. For the non-ROM case Xen will also
> > map/unmap the ROM if the enable bit is set.
> > 
> > Your proposed code would always map/unmap the ROM into the p2m when
> > the memory decoding bit is changed even if it's not enabled.
> 
> I don't understand. Taking apart the conditional I've suggested,
> and converting to human language:
> - if the BAR is no mappable, continue
> - if we want to deal with ROM (rom=true), if the BAR isn't ROM
>   or isn't enabled, continue

With the current flow rom_enabled is set after the ROM BAR mappings
are established, which means that when modify_bars is called with
rom=true rom_enabled is not yet set, and using the logic above the
mappings won't be created.

> - if we want to deal with non-ROM (rom=false), if the BAR is ROM,
>   continue

Xen also has to deal with the ROM if it's enabled when the memory
decoding bit is toggled, hence in rom=false case the ROM also needs to
be mapped/unampped if it's enabled.

This dependency between the memory decoding bit and the ROM enable bit
is quite convoluted TBH.

> To me this is in line with the 2nd paragraph of your reply. It's not
> in line with the first, which makes me wonder whether "rom" is
> misnamed and wants to be "rom_only". Still, the question would
> remain of why rom_enabled doesn't matter when the variable is
> true.

Yes, rom means rom_only (ie: ROM enable bit has been toggled and
memory decoding bit is enabled). I assume you would prefer to change
the name to rom_only.

Thanks, Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8 07/11] vpci/bars: add handlers to map the BARs
  2018-02-27  9:21         ` Roger Pau Monné
@ 2018-02-27 10:01           ` Jan Beulich
  2018-02-27 11:57             ` Roger Pau Monné
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2018-02-27 10:01 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Stefano Stabellini, Wei Liu, George Dunlap, AndrewCooper,
	Ian Jackson, TimDeegan, Paul Durrant, xen-devel, Boris Ostrovsky

>>> On 27.02.18 at 10:21, <roger.pau@citrix.com> wrote:
> On Tue, Feb 27, 2018 at 01:32:24AM -0700, Jan Beulich wrote:
>> >>> On 26.02.18 at 19:00, <roger.pau@citrix.com> wrote:
>> > On Mon, Feb 26, 2018 at 04:20:17AM -0700, Jan Beulich wrote:
>> >> >>> On 23.01.18 at 16:07, <roger.pau@citrix.com> wrote:
>> >> > +static void maybe_defer_map(struct domain *d, const struct pci_dev *pdev,
>> >> > +                            struct rangeset *mem, bool map, bool rom)
>> >> > +{
>> >> > +    struct vcpu *curr = current;
>> >> > +
>> >> > +    if ( is_idle_vcpu(curr) )
>> >> > +    {
>> >> > +        struct map_data data = { .d = d, .map = true };
>> >> > +
>> >> > +        /*
>> >> > +         * Only used for domain construction in order to map the BARs
>> >> > +         * of devices with memory decoding enabled.
>> >> > +         */
>> >> > +        ASSERT(map && !rom);
>> >> > +        rangeset_consume_ranges(mem, map_range, &data);
>> >> 
>> >> What if this produces -ENOMEM? And despite having looked at
>> >> several revisions of this, I can't make the connection to why this
>> >> is in an is_idle_vcpu() conditional (neither the direct caller nor the
>> >> next level up make this obvious to me). There's clearly a need
>> >> for extending the comment.
>> > 
>> > I thought the comment above that mentions domain construction would be
>> > enough. I can try to expand this, maybe like:
>> > 
>> > "This function will only be called from the idle vCPU while building
>> > the domain, in which case it's not possible to defer the operation
>> > (like done in the else branch). Call rangeset_consume_ranges in order
>> > to establish the mappings right away."
>> 
>> And what again is the connection between is_idle_domain() and
>> domain construction? I think the comment belongs ahead of the
>> if(), and it needs to make that connection.
> 
> Oh, domain constructions runs on the idle vCPU, that's the relation.
> 
> "This function will only be called from the idle vCPU while building
> the domain (because Dom0 building runs on the idle vCPU), in which
> case it's not possible to defer the operation (like done in the else
> branch). Call rangeset_consume_ranges in order to establish the
> mappings right away."
> 
> Does that seem clearer if placed ahead of the if?

Can be shorter imo - the thing I didn't pay attention to is that
this is all about _dom0_ building, not general _domain_ building.
Hence perhaps:

"Dom0 building runs on the idle vCPU, in which case it's not possible
 to defer the operation (like done in the else branch). Call
 rangeset_consume_ranges in order to establish the mappings right
 away."

>> >> > +static void modify_bars(const struct pci_dev *pdev, bool map, bool rom)
>> >> > +{
>> >> > +    struct vpci_header *header = &pdev->vpci->header;
>> >> > +    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
>> >> > +    const struct pci_dev *tmp;
>> >> > +    unsigned int i;
>> >> > +    int rc;
>> >> > +
>> >> > +    if ( !map )
>> >> > +        modify_decoding(pdev, false, rom);
>> >> > +
>> >> > +    if ( !mem )
>> >> > +        return;
>> >> 
>> >> Similarly here - why is it okay (or what effect will it have) to return
>> >> here when we're out of memory, but the caller won't know?
>> > 
>> > The behaviour here depends on the change to the memory decoding bit:
>> > 
>> >  - Clearing: memory decoding on device will be disabled, BARs won't be
>> >    unmapped.
>> >  - Setting: no change to device memory decoding bit, BARs won't be
>> >    mapped.
>> > 
>> > Do you think this is suitable? IMO it's fine to disable the memory
>> > decoding bit on the device and leave the memory regions mapped.
>> 
>> As long as subsequent changes to the decoding bit can't leave
>> stale mappings. Plus there needs to be a comment to explain this.
> 
> With the current approach in the unmap case there will be stale
> mappings left behind.
> 
> I guess it's better then to not modify the memory decoding bit at all
> until the operation finishes. That also rises the question of whether
> the memory decoding bit should be modified if p2m mapping/unmapping
> reports an error.
> 
> Should we also attempt to rollback failed map/unmap operations? What
> happens if the rollback also fails?

It is in particular this last question why I don't think rollback makes
sense. If there's any failure, I think the decode bit should be sticky
clear; we may want (need?) to invent some magical mechanism to
get a device back out of this state later on. But that way stale
mappings are not an immediate problem (I think).

> What about the following:
> 
>     +---------+   SUCCESS  +---------------------------------+
>     |map/unmap+------------>Change decoding or ROM enable bit|
>     +----+----+            +---------------------------------+
>          |
>          |FAILURE
>          |
> +--------v-------+ SUCCESS +---------------------------------------+
> |revert operation+--------->No change to decoding or ROM enable bit|
> +--------+-------+         +---------------------------------------+
>          |
>          |FAILURE
>          |
>    +-----v-----+
>    |Kill domain|
>    +-----------+

Possibly. Killing Dom0 is a bad thing though, if just some random
device has a problem.

>> >> > +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
>> >> > +    {
>> >> > +        const struct vpci_bar *bar = &header->bars[i];
>> >> > +
>> >> > +        if ( !MAPPABLE_BAR(bar) ||
>> >> > +             (rom ? bar->type != VPCI_BAR_ROM
>> >> > +                  : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
>> >> > +            continue;
>> >> 
>> >> Why does rom_enabled matter for the !rom case rather than for
>> >> the rom one? I.e.
>> >> 
>> >>         if ( !MAPPABLE_BAR(bar) ||
>> >>              (rom ? bar->type != VPCI_BAR_ROM || !header->rom_enabled
>> >>                   : bar->type == VPCI_BAR_ROM) )
>> >>             continue;
>> >> 
>> >> ?
>> > 
>> > No, for the ROM case we only want to map/unmap the ROM, so that's the
>> > only thing added to the rangeset. For the non-ROM case Xen will also
>> > map/unmap the ROM if the enable bit is set.
>> > 
>> > Your proposed code would always map/unmap the ROM into the p2m when
>> > the memory decoding bit is changed even if it's not enabled.
>> 
>> I don't understand. Taking apart the conditional I've suggested,
>> and converting to human language:
>> - if the BAR is no mappable, continue
>> - if we want to deal with ROM (rom=true), if the BAR isn't ROM
>>   or isn't enabled, continue
> 
> With the current flow rom_enabled is set after the ROM BAR mappings
> are established, which means that when modify_bars is called with
> rom=true rom_enabled is not yet set, and using the logic above the
> mappings won't be created.
> 
>> - if we want to deal with non-ROM (rom=false), if the BAR is ROM,
>>   continue
> 
> Xen also has to deal with the ROM if it's enabled when the memory
> decoding bit is toggled, hence in rom=false case the ROM also needs to
> be mapped/unampped if it's enabled.
> 
> This dependency between the memory decoding bit and the ROM enable bit
> is quite convoluted TBH.
> 
>> To me this is in line with the 2nd paragraph of your reply. It's not
>> in line with the first, which makes me wonder whether "rom" is
>> misnamed and wants to be "rom_only". Still, the question would
>> remain of why rom_enabled doesn't matter when the variable is
>> true.
> 
> Yes, rom means rom_only (ie: ROM enable bit has been toggled and
> memory decoding bit is enabled). I assume you would prefer to change
> the name to rom_only.

Yes, and perhaps provide an abridged version of your explanation
above in a comment next to the conditional.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8 07/11] vpci/bars: add handlers to map the BARs
  2018-02-27 10:01           ` Jan Beulich
@ 2018-02-27 11:57             ` Roger Pau Monné
  2018-02-27 14:10               ` Jan Beulich
  0 siblings, 1 reply; 12+ messages in thread
From: Roger Pau Monné @ 2018-02-27 11:57 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Stefano Stabellini, Wei Liu, George Dunlap, AndrewCooper,
	Ian Jackson, TimDeegan, Paul Durrant, xen-devel, Boris Ostrovsky

On Tue, Feb 27, 2018 at 03:01:44AM -0700, Jan Beulich wrote:
> >>> On 27.02.18 at 10:21, <roger.pau@citrix.com> wrote:
> > On Tue, Feb 27, 2018 at 01:32:24AM -0700, Jan Beulich wrote:
> >> >>> On 26.02.18 at 19:00, <roger.pau@citrix.com> wrote:
> >> > On Mon, Feb 26, 2018 at 04:20:17AM -0700, Jan Beulich wrote:
> >> >> >>> On 23.01.18 at 16:07, <roger.pau@citrix.com> wrote:
> >> >> > +static void maybe_defer_map(struct domain *d, const struct pci_dev *pdev,
> >> >> > +                            struct rangeset *mem, bool map, bool rom)
> >> >> > +{
> >> >> > +    struct vcpu *curr = current;
> >> >> > +
> >> >> > +    if ( is_idle_vcpu(curr) )
> >> >> > +    {
> >> >> > +        struct map_data data = { .d = d, .map = true };
> >> >> > +
> >> >> > +        /*
> >> >> > +         * Only used for domain construction in order to map the BARs
> >> >> > +         * of devices with memory decoding enabled.
> >> >> > +         */
> >> >> > +        ASSERT(map && !rom);
> >> >> > +        rangeset_consume_ranges(mem, map_range, &data);
> >> >> 
> >> >> What if this produces -ENOMEM? And despite having looked at
> >> >> several revisions of this, I can't make the connection to why this
> >> >> is in an is_idle_vcpu() conditional (neither the direct caller nor the
> >> >> next level up make this obvious to me). There's clearly a need
> >> >> for extending the comment.
> >> > 
> >> > I thought the comment above that mentions domain construction would be
> >> > enough. I can try to expand this, maybe like:
> >> > 
> >> > "This function will only be called from the idle vCPU while building
> >> > the domain, in which case it's not possible to defer the operation
> >> > (like done in the else branch). Call rangeset_consume_ranges in order
> >> > to establish the mappings right away."
> >> 
> >> And what again is the connection between is_idle_domain() and
> >> domain construction? I think the comment belongs ahead of the
> >> if(), and it needs to make that connection.
> > 
> > Oh, domain constructions runs on the idle vCPU, that's the relation.
> > 
> > "This function will only be called from the idle vCPU while building
> > the domain (because Dom0 building runs on the idle vCPU), in which
> > case it's not possible to defer the operation (like done in the else
> > branch). Call rangeset_consume_ranges in order to establish the
> > mappings right away."
> > 
> > Does that seem clearer if placed ahead of the if?
> 
> Can be shorter imo - the thing I didn't pay attention to is that
> this is all about _dom0_ building, not general _domain_ building.
> Hence perhaps:
> 
> "Dom0 building runs on the idle vCPU, in which case it's not possible
>  to defer the operation (like done in the else branch). Call
>  rangeset_consume_ranges in order to establish the mappings right
>  away."

LGTM. Thanks for re-writing it.

> >> >> > +static void modify_bars(const struct pci_dev *pdev, bool map, bool rom)
> >> >> > +{
> >> >> > +    struct vpci_header *header = &pdev->vpci->header;
> >> >> > +    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
> >> >> > +    const struct pci_dev *tmp;
> >> >> > +    unsigned int i;
> >> >> > +    int rc;
> >> >> > +
> >> >> > +    if ( !map )
> >> >> > +        modify_decoding(pdev, false, rom);
> >> >> > +
> >> >> > +    if ( !mem )
> >> >> > +        return;
> >> >> 
> >> >> Similarly here - why is it okay (or what effect will it have) to return
> >> >> here when we're out of memory, but the caller won't know?
> >> > 
> >> > The behaviour here depends on the change to the memory decoding bit:
> >> > 
> >> >  - Clearing: memory decoding on device will be disabled, BARs won't be
> >> >    unmapped.
> >> >  - Setting: no change to device memory decoding bit, BARs won't be
> >> >    mapped.
> >> > 
> >> > Do you think this is suitable? IMO it's fine to disable the memory
> >> > decoding bit on the device and leave the memory regions mapped.
> >> 
> >> As long as subsequent changes to the decoding bit can't leave
> >> stale mappings. Plus there needs to be a comment to explain this.
> > 
> > With the current approach in the unmap case there will be stale
> > mappings left behind.
> > 
> > I guess it's better then to not modify the memory decoding bit at all
> > until the operation finishes. That also rises the question of whether
> > the memory decoding bit should be modified if p2m mapping/unmapping
> > reports an error.
> > 
> > Should we also attempt to rollback failed map/unmap operations? What
> > happens if the rollback also fails?
> 
> It is in particular this last question why I don't think rollback makes
> sense. If there's any failure, I think the decode bit should be sticky
> clear; we may want (need?) to invent some magical mechanism to
> get a device back out of this state later on. But that way stale
> mappings are not an immediate problem (I think).

The only problem I see with this approach is that if an error happens
in the unmap case we will disable memory decoding and leave some stale
p2m mappings. Then the guest might change the position of the BARs,
and those stale mappings would be completely forgotten.

I don't think this is a problem for Dom0, but DomUs need to be handled
differently (possibly along the lines of the diagram below).

> > What about the following:
> > 
> >     +---------+   SUCCESS  +---------------------------------+
> >     |map/unmap+------------>Change decoding or ROM enable bit|
> >     +----+----+            +---------------------------------+
> >          |
> >          |FAILURE
> >          |
> > +--------v-------+ SUCCESS +---------------------------------------+
> > |revert operation+--------->No change to decoding or ROM enable bit|
> > +--------+-------+         +---------------------------------------+
> >          |
> >          |FAILURE
> >          |
> >    +-----v-----+
> >    |Kill domain|
> >    +-----------+
> 
> Possibly. Killing Dom0 is a bad thing though, if just some random
> device has a problem.

So for Dom0 the diagram would be the following:

    +---------+   SUCCESS  +---------------------------------+
    |map/unmap+------------>Change decoding or ROM enable bit|
    +----+----+            +---------------------------------+
         |
         |FAILURE
         |
+--------v-----------------------+
|Clear decoding or ROM enable bit|
+--------------------------------+

> >> >> > +    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> >> >> > +    {
> >> >> > +        const struct vpci_bar *bar = &header->bars[i];
> >> >> > +
> >> >> > +        if ( !MAPPABLE_BAR(bar) ||
> >> >> > +             (rom ? bar->type != VPCI_BAR_ROM
> >> >> > +                  : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
> >> >> > +            continue;
> >> >> 
> >> >> Why does rom_enabled matter for the !rom case rather than for
> >> >> the rom one? I.e.
> >> >> 
> >> >>         if ( !MAPPABLE_BAR(bar) ||
> >> >>              (rom ? bar->type != VPCI_BAR_ROM || !header->rom_enabled
> >> >>                   : bar->type == VPCI_BAR_ROM) )
> >> >>             continue;
> >> >> 
> >> >> ?
> >> > 
> >> > No, for the ROM case we only want to map/unmap the ROM, so that's the
> >> > only thing added to the rangeset. For the non-ROM case Xen will also
> >> > map/unmap the ROM if the enable bit is set.
> >> > 
> >> > Your proposed code would always map/unmap the ROM into the p2m when
> >> > the memory decoding bit is changed even if it's not enabled.
> >> 
> >> I don't understand. Taking apart the conditional I've suggested,
> >> and converting to human language:
> >> - if the BAR is no mappable, continue
> >> - if we want to deal with ROM (rom=true), if the BAR isn't ROM
> >>   or isn't enabled, continue
> > 
> > With the current flow rom_enabled is set after the ROM BAR mappings
> > are established, which means that when modify_bars is called with
> > rom=true rom_enabled is not yet set, and using the logic above the
> > mappings won't be created.
> > 
> >> - if we want to deal with non-ROM (rom=false), if the BAR is ROM,
> >>   continue
> > 
> > Xen also has to deal with the ROM if it's enabled when the memory
> > decoding bit is toggled, hence in rom=false case the ROM also needs to
> > be mapped/unampped if it's enabled.
> > 
> > This dependency between the memory decoding bit and the ROM enable bit
> > is quite convoluted TBH.
> > 
> >> To me this is in line with the 2nd paragraph of your reply. It's not
> >> in line with the first, which makes me wonder whether "rom" is
> >> misnamed and wants to be "rom_only". Still, the question would
> >> remain of why rom_enabled doesn't matter when the variable is
> >> true.
> > 
> > Yes, rom means rom_only (ie: ROM enable bit has been toggled and
> > memory decoding bit is enabled). I assume you would prefer to change
> > the name to rom_only.
> 
> Yes, and perhaps provide an abridged version of your explanation
> above in a comment next to the conditional.

OK, I can do that.

Thanks, Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8 07/11] vpci/bars: add handlers to map the BARs
  2018-02-27 11:57             ` Roger Pau Monné
@ 2018-02-27 14:10               ` Jan Beulich
  2018-02-28 11:20                 ` Roger Pau Monné
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2018-02-27 14:10 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Stefano Stabellini, Wei Liu, George Dunlap, AndrewCooper,
	Ian Jackson, TimDeegan, Paul Durrant, xen-devel, Boris Ostrovsky

>>> On 27.02.18 at 12:57, <roger.pau@citrix.com> wrote:
> On Tue, Feb 27, 2018 at 03:01:44AM -0700, Jan Beulich wrote:
>> >>> On 27.02.18 at 10:21, <roger.pau@citrix.com> wrote:
>> > With the current approach in the unmap case there will be stale
>> > mappings left behind.
>> > 
>> > I guess it's better then to not modify the memory decoding bit at all
>> > until the operation finishes. That also rises the question of whether
>> > the memory decoding bit should be modified if p2m mapping/unmapping
>> > reports an error.
>> > 
>> > Should we also attempt to rollback failed map/unmap operations? What
>> > happens if the rollback also fails?
>> 
>> It is in particular this last question why I don't think rollback makes
>> sense. If there's any failure, I think the decode bit should be sticky
>> clear; we may want (need?) to invent some magical mechanism to
>> get a device back out of this state later on. But that way stale
>> mappings are not an immediate problem (I think).
> 
> The only problem I see with this approach is that if an error happens
> in the unmap case we will disable memory decoding and leave some stale
> p2m mappings. Then the guest might change the position of the BARs,
> and those stale mappings would be completely forgotten.

Well, the implication was that together with the decode enable bit
not being allowed to be turned off would be that some recording
would perhaps be needed of the stale mappings. Allowing the
decode enable bit to be set again would then depend on the stale
mappings first being dealt with. As long as the decode enable bit
can't be set, the owning domain playing with the BARs is of no
interest.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8 07/11] vpci/bars: add handlers to map the BARs
  2018-02-27 14:10               ` Jan Beulich
@ 2018-02-28 11:20                 ` Roger Pau Monné
  2018-02-28 11:34                   ` Jan Beulich
  0 siblings, 1 reply; 12+ messages in thread
From: Roger Pau Monné @ 2018-02-28 11:20 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Stefano Stabellini, Wei Liu, George Dunlap, AndrewCooper,
	Ian Jackson, TimDeegan, Paul Durrant, xen-devel, Boris Ostrovsky

On Tue, Feb 27, 2018 at 07:10:51AM -0700, Jan Beulich wrote:
> >>> On 27.02.18 at 12:57, <roger.pau@citrix.com> wrote:
> > On Tue, Feb 27, 2018 at 03:01:44AM -0700, Jan Beulich wrote:
> >> >>> On 27.02.18 at 10:21, <roger.pau@citrix.com> wrote:
> >> > With the current approach in the unmap case there will be stale
> >> > mappings left behind.
> >> > 
> >> > I guess it's better then to not modify the memory decoding bit at all
> >> > until the operation finishes. That also rises the question of whether
> >> > the memory decoding bit should be modified if p2m mapping/unmapping
> >> > reports an error.
> >> > 
> >> > Should we also attempt to rollback failed map/unmap operations? What
> >> > happens if the rollback also fails?
> >> 
> >> It is in particular this last question why I don't think rollback makes
> >> sense. If there's any failure, I think the decode bit should be sticky
> >> clear; we may want (need?) to invent some magical mechanism to
> >> get a device back out of this state later on. But that way stale
> >> mappings are not an immediate problem (I think).
> > 
> > The only problem I see with this approach is that if an error happens
> > in the unmap case we will disable memory decoding and leave some stale
> > p2m mappings. Then the guest might change the position of the BARs,
> > and those stale mappings would be completely forgotten.
> 
> Well, the implication was that together with the decode enable bit
> not being allowed to be turned off would be that some recording
> would perhaps be needed of the stale mappings. Allowing the
> decode enable bit to be set again would then depend on the stale
> mappings first being dealt with. As long as the decode enable bit
> can't be set, the owning domain playing with the BARs is of no
> interest.

I've been giving some thought to this, and I cannot find a good
solution. So far the less worse one would be:

    +--------------+  YES  +---------+
    |Is broken set?|------>|No action|
    +--------------+       +---------+
         |
         | NO
	 |
    +----v----+   SUCCESS  +---------------------------------+
    |map/unmap+------------>Change decoding or ROM enable bit|
    +----+----+            +---------------------------------+
         |
         |FAILURE
         |
+--------v----------------+
|Clear memory decoding bit|
|Set broken               |
+-------------------------+

Let me know what you think of this.

Thanks, Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8 07/11] vpci/bars: add handlers to map the BARs
  2018-02-28 11:20                 ` Roger Pau Monné
@ 2018-02-28 11:34                   ` Jan Beulich
  2018-02-28 12:13                     ` Roger Pau Monné
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2018-02-28 11:34 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Stefano Stabellini, Wei Liu, George Dunlap, AndrewCooper,
	Ian Jackson, TimDeegan, Paul Durrant, xen-devel, BorisOstrovsky

>>> On 28.02.18 at 12:20, <roger.pau@citrix.com> wrote:
> I've been giving some thought to this, and I cannot find a good
> solution. So far the less worse one would be:
> 
>     +--------------+  YES  +---------+
>     |Is broken set?|------>|No action|
>     +--------------+       +---------+
>          |
>          | NO
> 	 |
>     +----v----+   SUCCESS  +---------------------------------+
>     |map/unmap+------------>Change decoding or ROM enable bit|
>     +----+----+            +---------------------------------+
>          |
>          |FAILURE
>          |
> +--------v----------------+
> |Clear memory decoding bit|
> |Set broken               |
> +-------------------------+
> 
> Let me know what you think of this.

Looks reasonable, maybe with "set broken" replaced by "set
broken if Dom0, else remove device from domain". This at least
partly depends on how recovery from "broken" would look like,
i.e. whether that would be applicable (and safe) for a DomU as
well.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8 07/11] vpci/bars: add handlers to map the BARs
  2018-02-28 11:34                   ` Jan Beulich
@ 2018-02-28 12:13                     ` Roger Pau Monné
  0 siblings, 0 replies; 12+ messages in thread
From: Roger Pau Monné @ 2018-02-28 12:13 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Stefano Stabellini, Wei Liu, George Dunlap, AndrewCooper,
	Ian Jackson, TimDeegan, Paul Durrant, xen-devel, BorisOstrovsky

On Wed, Feb 28, 2018 at 04:34:01AM -0700, Jan Beulich wrote:
> >>> On 28.02.18 at 12:20, <roger.pau@citrix.com> wrote:
> > I've been giving some thought to this, and I cannot find a good
> > solution. So far the less worse one would be:
> > 
> >     +--------------+  YES  +---------+
> >     |Is broken set?|------>|No action|
> >     +--------------+       +---------+
> >          |
> >          | NO
> > 	 |
> >     +----v----+   SUCCESS  +---------------------------------+
> >     |map/unmap+------------>Change decoding or ROM enable bit|
> >     +----+----+            +---------------------------------+
> >          |
> >          |FAILURE
> >          |
> > +--------v----------------+
> > |Clear memory decoding bit|
> > |Set broken               |
> > +-------------------------+
> > 
> > Let me know what you think of this.
> 
> Looks reasonable, maybe with "set broken" replaced by "set
> broken if Dom0, else remove device from domain". This at least
> partly depends on how recovery from "broken" would look like,
> i.e. whether that would be applicable (and safe) for a DomU as
> well.

I will add the "remove device from domain" for the DomU case and put
this as a comment somewhere. IMO it can always be improved in the
future with better recovery options.

Thanks, Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v8 07/11] vpci/bars: add handlers to map the BARs
  2018-01-23 15:07 ` [PATCH v8 07/11] vpci/bars: add handlers to map the BARs Roger Pau Monne
@ 2018-01-29 14:01   ` Roger Pau Monné
  0 siblings, 0 replies; 12+ messages in thread
From: Roger Pau Monné @ 2018-01-29 14:01 UTC (permalink / raw)
  To: Roger Pau Monne
  Cc: Stefano Stabellini, Wei Liu, George Dunlap, Andrew Cooper,
	Ian Jackson, Tim Deegan, Jan Beulich, xen-devel, Boris Ostrovsky

On Tue, Jan 23, 2018 at 03:07:31PM +0000, Roger Pau Monne wrote:
> +static void bar_write(const struct pci_dev *pdev, unsigned int reg,
> +                      uint32_t val, void *data)
> +{
> +    struct vpci_bar *bar = data;
> +    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> +    bool hi = false;
> +
> +    if ( pci_conf_read16(pdev->seg, pdev->bus, slot, func, PCI_COMMAND) &
> +         PCI_COMMAND_MEMORY )
> +    {
> +        gprintk(XENLOG_WARNING,
> +                "%04x:%02x:%02x.%u: ignored BAR %lu write with memory decoding enabled\n",
> +                pdev->seg, pdev->bus, slot, func,
> +                (pdev->vpci->header.bars - bar) / 4);

This should be bar - pdev->vpci->header.bars. Will fix it in next
version.

Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v8 07/11] vpci/bars: add handlers to map the BARs
  2018-01-23 15:07 [PATCH v8 00/11] vpci: PCI config space emulation Roger Pau Monne
@ 2018-01-23 15:07 ` Roger Pau Monne
  2018-01-29 14:01   ` Roger Pau Monné
  0 siblings, 1 reply; 12+ messages in thread
From: Roger Pau Monne @ 2018-01-23 15:07 UTC (permalink / raw)
  To: xen-devel
  Cc: Stefano Stabellini, Wei Liu, George Dunlap, Andrew Cooper,
	Ian Jackson, Tim Deegan, Jan Beulich, Boris Ostrovsky,
	Roger Pau Monne

Introduce a set of handlers that trap accesses to the PCI BARs and the
command register, in order to snoop BAR sizing and BAR relocation.

The command handler is used to detect changes to bit 2 (response to
memory space accesses), and maps/unmaps the BARs of the device into
the guest p2m. A rangeset is used in order to figure out which memory
to map/unmap. This makes it easier to keep track of the possible
overlaps with other BARs, and will also simplify MSI-X support, where
certain regions of a BAR might be used for the MSI-X table or PBA.

The BAR register handlers are used to detect attempts by the guest to
size or relocate the BARs.

Note that the long running BAR mapping and unmapping operations are
deferred to be performed by hvm_io_pending, so that they can be safely
preempted.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: George Dunlap <George.Dunlap@eu.citrix.com>
Cc: Ian Jackson <ian.jackson@eu.citrix.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Tim Deegan <tim@xen.org>
Cc: Wei Liu <wei.liu2@citrix.com>
---
Changes since v7:
 - Order includes.
 - Add newline between switch cases.
 - Fix typo in comment (hopping).
 - Wrap ternary conditional in parentheses.
 - Remove CONFIG_HAS_PCI gueard from sched.h vpci_vcpu usage.
 - Add comment regarding vpci_vcpu usage.
 - Move rom_enabled from BAR struct to header.
 - Do not protect vpci_vcpu with __XEN__ guards.

Changes since v6:
 - s/vpci_check_pending/vpci_process_pending/.
 - Improve error handling in vpci_process_pending.
 - Add a comment that explains how vpci_check_bar_overlap works.
 - Add error messages to vpci_modify_bars and vpci_modify_rom.
 - Introduce vpci_hw_read16/32, in order to passthrough reads to
   the underlying hw.
 - Print BAR number on error in vpci_bar_write.
 - Place the CONFIG_HAS_PCI guards inside the vpci.h header and
   provide an empty vpci_vcpu structure for the !CONFIG_HAS_PCI case.
 - Define CONFIG_HAS_PCI in the test harness emul.h header before
   including vpci.h
 - Add ARM TODOs and an ARM-specific bodge to vpci_map_range due to
   the lack of preemption in {un}map_mmio_regions.
 - Make vpci_maybe_defer_map void.
 - Set rom_enabled in vpci_init_bars.
 - Defer enabling/disabling the memory decoding (or the ROM enable
   bit) until the memory has been mapped/unmapped.
 - Remove vpci_ prefix from static functions.
 - Use the same code in order to map the general BARs and the ROM
   BARs.
 - Remove the seg/bus local variables and use pdev->{seg,bus} instead.
 - Convert the bools in the BAR related structs into bool bitfields.
 - Add the must_check attribute to vpci_process_pending.
 - Open code check_bar_overlap inside modify_bars, which was it's only
   user.

Changes since v5:
 - Switch to the new handler type.
 - Use pci_sbdf_t to size the BARs.
 - Use a single return for vpci_modify_bar.
 - Do not return an error code from vpci_modify_bars, just log the
   failure.
 - Remove the 'sizing' parameter. Instead just let the guest write
   directly to the BAR, and read the value back. This simplifies the
   BAR register handlers, specially the read one.
 - Ignore ROM BAR writes with memory decoding enabled and ROM enabled.
 - Do not propagate failures to setup the ROM BAR in vpci_init_bars.
 - Add preemption support to the BAR mapping/unmapping operations.

Changes since v4:
 - Expand commit message to mention the reason behind the usage of
   rangesets.
 - Fix comment related to the inclusiveness of rangesets.
 - Fix off-by-one error in the calculation of the end of memory
   regions.
 - Store the state of the BAR (mapped/unmapped) in the vpci_bar
   enabled field, previously was only used by ROMs.
 - Fix double negation of return code.
 - Modify vpci_cmd_write so it has a single call to pci_conf_write16.
 - Print a warning when trying to write to the BAR with memory
   decoding enabled (and ignore the write).
 - Remove header_type local variable, it's used only once.
 - Move the read of the command register.
 - Restore previous command register value in the exit paths.
 - Only set address to INVALID_PADDR if the initial BAR value matches
    ~0 & PCI_BASE_ADDRESS_MEM_MASK.
 - Don't disable the enabled bit in the expansion ROM register, memory
   decoding is already disabled and takes precedence.
 - Don't use INVALID_PADDR, just set the initial BAR address to the
   value found in the hardware.
 - Introduce rom_enabled to store the status of the
   PCI_ROM_ADDRESS_ENABLE bit.
 - Reorder fields of the structure to prevent holes.

Changes since v3:
 - Propagate previous changes: drop xen_ prefix and use u8/u16/u32
   instead of the previous half_word/word/double_word.
 - Constify some of the paramerters.
 - s/VPCI_BAR_MEM/VPCI_BAR_MEM32/.
 - Simplify the number of fields stored for each BAR, a single address
   field is stored and contains the address of the BAR both on Xen and
   in the guest.
 - Allow the guest to move the BARs around in the physical memory map.
 - Add support for expansion ROM BARs.
 - Do not cache the value of the command register.
 - Remove a label used in vpci_cmd_write.
 - Fix the calculation of the sizing mask in vpci_bar_write.
 - Check the memory decode bit in order to decide if a BAR is
   positioned or not.
 - Disable memory decoding before sizing the BARs in Xen.
 - When mapping/unmapping BARs check if there's overlap between BARs,
   in order to avoid unmapping memory required by another BAR.
 - Introduce a macro to check whether a BAR is mappable or not.
 - Add a comment regarding the lack of support for SR-IOV.
 - Remove the usage of the GENMASK macro.

Changes since v2:
 - Detect unset BARs and allow the hardware domain to position them.
---
 tools/tests/vpci/emul.h   |   1 +
 xen/arch/x86/hvm/ioreq.c  |   4 +
 xen/drivers/vpci/Makefile |   2 +-
 xen/drivers/vpci/header.c | 521 ++++++++++++++++++++++++++++++++++++++++++++++
 xen/drivers/vpci/vpci.c   |  14 ++
 xen/include/xen/sched.h   |   4 +
 xen/include/xen/vpci.h    |  55 +++++
 7 files changed, 600 insertions(+), 1 deletion(-)
 create mode 100644 xen/drivers/vpci/header.c

diff --git a/tools/tests/vpci/emul.h b/tools/tests/vpci/emul.h
index fd0317995a..f6f9358b74 100644
--- a/tools/tests/vpci/emul.h
+++ b/tools/tests/vpci/emul.h
@@ -80,6 +80,7 @@ typedef union {
     };
 } pci_sbdf_t;
 
+#define CONFIG_HAS_PCI
 #include "vpci.h"
 
 #define __hwdom_init
diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index 5aeaaaccd9..03995cdaef 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -26,6 +26,7 @@
 #include <xen/domain.h>
 #include <xen/event.h>
 #include <xen/paging.h>
+#include <xen/vpci.h>
 
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/ioreq.h>
@@ -48,6 +49,9 @@ bool hvm_io_pending(struct vcpu *v)
     struct domain *d = v->domain;
     struct hvm_ioreq_server *s;
 
+    if ( has_vpci(d) && vpci_process_pending(v) )
+        return true;
+
     list_for_each_entry ( s,
                           &d->arch.hvm_domain.ioreq_server.list,
                           list_entry )
diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
index 840a906470..241467212f 100644
--- a/xen/drivers/vpci/Makefile
+++ b/xen/drivers/vpci/Makefile
@@ -1 +1 @@
-obj-y += vpci.o
+obj-y += vpci.o header.o
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
new file mode 100644
index 0000000000..994cfbb8f7
--- /dev/null
+++ b/xen/drivers/vpci/header.c
@@ -0,0 +1,521 @@
+/*
+ * Generic functionality for handling accesses to the PCI header from the
+ * configuration space.
+ *
+ * Copyright (C) 2017 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/p2m-common.h>
+#include <xen/sched.h>
+#include <xen/softirq.h>
+#include <xen/vpci.h>
+
+#include <asm/event.h>
+
+#define MAPPABLE_BAR(x)                                                 \
+    ((x)->type == VPCI_BAR_MEM32 || (x)->type == VPCI_BAR_MEM64_LO ||   \
+     (x)->type == VPCI_BAR_ROM)
+
+struct map_data {
+    struct domain *d;
+    bool map;
+};
+
+static int map_range(unsigned long s, unsigned long e, void *data,
+                     unsigned long *c)
+{
+    const struct map_data *map = data;
+    int rc;
+
+    for ( ; ; )
+    {
+        unsigned long size = e - s + 1;
+
+        /*
+         * ARM TODOs:
+         * - On ARM whether the memory is prefetchable or not should be passed
+         *   to map_mmio_regions in order to decide which memory attributes
+         *   should be used.
+         *
+         * - {un}map_mmio_regions doesn't support preemption, hence the bodge
+         *   below in order to limit the amount of mappings to 64 pages for
+         *   each function call.
+         */
+
+#ifdef CONFIG_ARM
+        size = min(64ul, size);
+#endif
+
+        rc = (map->map ? map_mmio_regions : unmap_mmio_regions)
+             (map->d, _gfn(s), size, _mfn(s));
+        if ( rc == 0 )
+        {
+            *c += size;
+#ifdef CONFIG_ARM
+            rc = -ERESTART;
+#endif
+            break;
+        }
+        if ( rc < 0 )
+        {
+            printk(XENLOG_G_WARNING
+                   "Failed to identity %smap [%" PRI_gfn ", %" PRI_gfn ") for d%d: %d\n",
+                   map ? "" : "un", s, e, map->d->domain_id, rc);
+            break;
+        }
+        ASSERT(rc < size);
+        *c += rc;
+        s += rc;
+        if ( general_preempt_check() )
+        {
+            if ( !is_idle_vcpu(current) )
+                return -ERESTART;
+
+            process_pending_softirqs();
+        }
+    }
+
+    return rc;
+}
+
+static void modify_decoding(const struct pci_dev *pdev, bool map, bool rom)
+{
+    struct vpci_header *header = &pdev->vpci->header;
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+    unsigned int i;
+
+    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+    {
+        if ( rom && header->bars[i].type == VPCI_BAR_ROM )
+        {
+            unsigned int rom_pos = (i == 6) ? PCI_ROM_ADDRESS
+                                            : PCI_ROM_ADDRESS1;
+            uint32_t val = pci_conf_read32(pdev->seg, pdev->bus, slot, func,
+                                           rom_pos);
+
+            header->bars[i].enabled = header->rom_enabled = map;
+
+            val &= ~PCI_ROM_ADDRESS_ENABLE;
+            val |= map ? PCI_ROM_ADDRESS_ENABLE : 0;
+            pci_conf_write32(pdev->seg, pdev->bus, slot, func, rom_pos, val);
+            break;
+        }
+        if ( !rom && (header->bars[i].type != VPCI_BAR_ROM ||
+                      header->rom_enabled) )
+            header->bars[i].enabled = map;
+    }
+
+    if ( !rom )
+    {
+        uint16_t cmd = pci_conf_read16(pdev->seg, pdev->bus, slot,
+                                       func, PCI_COMMAND);
+
+        cmd &= ~PCI_COMMAND_MEMORY;
+        cmd |= map ? PCI_COMMAND_MEMORY : 0;
+        pci_conf_write16(pdev->seg, pdev->bus, slot, func, PCI_COMMAND,
+                         cmd);
+    }
+}
+
+bool vpci_process_pending(struct vcpu *v)
+{
+    while ( v->vpci.mem )
+    {
+        struct map_data data = {
+            .d = v->domain,
+            .map = v->vpci.map,
+        };
+
+        switch ( rangeset_consume_ranges(v->vpci.mem, map_range, &data) )
+        {
+        case -ERESTART:
+            return true;
+
+        default:
+            if ( v->vpci.map )
+            {
+                spin_lock(&v->vpci.pdev->vpci->lock);
+                modify_decoding(v->vpci.pdev, v->vpci.map, v->vpci.rom);
+                spin_unlock(&v->vpci.pdev->vpci->lock);
+            }
+            /* fallthrough. */
+        case -ENOMEM:
+            /*
+             * Other errors are ignored, hoping that at least some regions
+             * will be mapped and that would be enough for the device to
+             * function. Note that in the unmap case the memory decoding or
+             * ROM enable bit have already been toggled off before attempting
+             * to perform the p2m unmap.
+             */
+            rangeset_destroy(v->vpci.mem);
+            v->vpci.mem = NULL;
+            break;
+        }
+    }
+
+    return false;
+}
+
+static void maybe_defer_map(struct domain *d, const struct pci_dev *pdev,
+                            struct rangeset *mem, bool map, bool rom)
+{
+    struct vcpu *curr = current;
+
+    if ( is_idle_vcpu(curr) )
+    {
+        struct map_data data = { .d = d, .map = true };
+
+        /*
+         * Only used for domain construction in order to map the BARs
+         * of devices with memory decoding enabled.
+         */
+        ASSERT(map && !rom);
+        rangeset_consume_ranges(mem, map_range, &data);
+        modify_decoding(pdev, true, false);
+        rangeset_destroy(mem);
+    }
+    else
+    {
+        /*
+         * NB: when deferring the {un}map the state of the device should not be
+         * trusted. For example the enable bit is toggled after the device is
+         * mapped. This can lead to parallel mapping operations being started
+         * for the same device if the domain is not well-behaved.
+         *
+         * In any case, the worse that can happen are errors from the {un}map
+         * operations, which will lead to the devices not working properly.
+         */
+        ASSERT(curr->domain == d);
+        curr->vpci.pdev = pdev;
+        curr->vpci.mem = mem;
+        curr->vpci.map = map;
+        curr->vpci.rom = rom;
+    }
+}
+
+static void modify_bars(const struct pci_dev *pdev, bool map, bool rom)
+{
+    struct vpci_header *header = &pdev->vpci->header;
+    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
+    const struct pci_dev *tmp;
+    unsigned int i;
+    int rc;
+
+    if ( !map )
+        modify_decoding(pdev, false, rom);
+
+    if ( !mem )
+        return;
+
+    /*
+     * Create a rangeset that represents the current device BARs memory region
+     * and compare it against all the currently active BAR memory regions. If
+     * an overlap is found, subtract it from the region to be
+     * mapped/unmapped.
+     *
+     * NB: the rangeset uses inclusive frame numbers.
+     */
+
+    /*
+     * First fill the rangeset with all the BARs of this device or with the ROM
+     * BAR only, depending on whether the guest is toggling the memory decode
+     * bit of the command register, or the enable bit of the ROM BAR register.
+     */
+    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+    {
+        const struct vpci_bar *bar = &header->bars[i];
+
+        if ( !MAPPABLE_BAR(bar) ||
+             (rom ? bar->type != VPCI_BAR_ROM
+                  : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
+            continue;
+
+        rc = rangeset_add_range(mem, PFN_DOWN(bar->addr),
+                                PFN_UP(bar->addr + bar->size - 1));
+        if ( rc )
+        {
+            printk(XENLOG_G_WARNING
+                   "Failed to add [%" PRI_gfn ", %" PRI_gfn "): %d\n",
+                   PFN_DOWN(bar->addr), PFN_UP(bar->addr + bar->size - 1),
+                   rc);
+            rangeset_destroy(mem);
+            return;
+        }
+    }
+
+    /*
+     * Check for overlaps with other BARs. Note that only BARs that are
+     * currently mapped (enabled) are checked for overlaps.
+     */
+    list_for_each_entry(tmp, &pdev->domain->arch.pdev_list, domain_list)
+        for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
+        {
+            const struct vpci_bar *bar = &tmp->vpci->header.bars[i];
+            unsigned long start = PFN_DOWN(bar->addr);
+            unsigned long end = PFN_UP(bar->addr + bar->size - 1);
+
+            if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) )
+                continue;
+
+            rc = rangeset_remove_range(mem, start, end);
+            if ( rc )
+            {
+                printk(XENLOG_G_WARNING
+                       "Failed to remove [%" PRI_gfn ", %" PRI_gfn "): %d\n",
+                       start, end, rc);
+                rangeset_destroy(mem);
+                return;
+            }
+        }
+
+    maybe_defer_map(pdev->domain, pdev, mem, map, rom);
+}
+
+static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
+                      uint32_t cmd, void *data)
+{
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+    uint16_t current_cmd = pci_conf_read16(pdev->seg, pdev->bus, slot, func,
+                                           reg);
+
+    /*
+     * Let Dom0 play with all the bits directly except for the memory
+     * decoding one.
+     */
+    if ( (cmd ^ current_cmd) & PCI_COMMAND_MEMORY )
+        modify_bars(pdev, cmd & PCI_COMMAND_MEMORY, false);
+    else
+        pci_conf_write16(pdev->seg, pdev->bus, slot, func, reg, cmd);
+}
+
+static void bar_write(const struct pci_dev *pdev, unsigned int reg,
+                      uint32_t val, void *data)
+{
+    struct vpci_bar *bar = data;
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+    bool hi = false;
+
+    if ( pci_conf_read16(pdev->seg, pdev->bus, slot, func, PCI_COMMAND) &
+         PCI_COMMAND_MEMORY )
+    {
+        gprintk(XENLOG_WARNING,
+                "%04x:%02x:%02x.%u: ignored BAR %lu write with memory decoding enabled\n",
+                pdev->seg, pdev->bus, slot, func,
+                (pdev->vpci->header.bars - bar) / 4);
+        return;
+    }
+
+    if ( bar->type == VPCI_BAR_MEM64_HI )
+    {
+        ASSERT(reg > PCI_BASE_ADDRESS_0);
+        bar--;
+        hi = true;
+    }
+    else
+        val &= PCI_BASE_ADDRESS_MEM_MASK;
+
+    /*
+     * Update the cached address, so that when memory decoding is enabled
+     * Xen can map the BAR into the guest p2m.
+     */
+    bar->addr &= ~(0xffffffffull << (hi ? 32 : 0));
+    bar->addr |= (uint64_t)val << (hi ? 32 : 0);
+
+    /* Make sure Xen writes back the same value for the BAR RO bits. */
+    if ( !hi )
+    {
+        val |= bar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
+                                           : PCI_BASE_ADDRESS_MEM_TYPE_64;
+        val |= bar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
+    }
+
+    pci_conf_write32(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                     PCI_FUNC(pdev->devfn), reg, val);
+}
+
+static void rom_write(const struct pci_dev *pdev, unsigned int reg,
+                      uint32_t val, void *data)
+{
+    struct vpci_header *header = &pdev->vpci->header;
+    struct vpci_bar *rom = data;
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+    uint16_t cmd = pci_conf_read16(pdev->seg, pdev->bus, slot, func,
+                                   PCI_COMMAND);
+    bool new_enabled = val & PCI_ROM_ADDRESS_ENABLE;
+
+    if ( (cmd & PCI_COMMAND_MEMORY) && header->rom_enabled && new_enabled )
+    {
+        gprintk(XENLOG_WARNING,
+                "%04x:%02x:%02x.%u: ignored ROM BAR write with memory decoding enabled\n",
+                pdev->seg, pdev->bus, slot, func);
+        return;
+    }
+
+    if ( !header->rom_enabled )
+        rom->addr = val & PCI_ROM_ADDRESS_MASK;
+
+    /* Check if ROM BAR should be mapped/unmapped. */
+    if ( (cmd & PCI_COMMAND_MEMORY) && header->rom_enabled != new_enabled )
+        modify_bars(pdev, new_enabled, true);
+    else
+    {
+        header->rom_enabled = new_enabled;
+        pci_conf_write32(pdev->seg, pdev->bus, slot, func, reg, val);
+    }
+
+    if ( !new_enabled )
+        rom->addr = val & PCI_ROM_ADDRESS_MASK;
+}
+
+static int init_bars(struct pci_dev *pdev)
+{
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+    uint16_t cmd;
+    uint64_t addr, size;
+    unsigned int i, num_bars, rom_reg;
+    struct vpci_header *header = &pdev->vpci->header;
+    struct vpci_bar *bars = header->bars;
+    pci_sbdf_t sbdf = {
+        .seg = pdev->seg,
+        .bus = pdev->bus,
+        .dev = slot,
+        .func = func,
+    };
+    int rc;
+
+    switch ( pci_conf_read8(pdev->seg, pdev->bus, slot, func, PCI_HEADER_TYPE)
+             & 0x7f )
+    {
+    case PCI_HEADER_TYPE_NORMAL:
+        num_bars = 6;
+        rom_reg = PCI_ROM_ADDRESS;
+        break;
+
+    case PCI_HEADER_TYPE_BRIDGE:
+        num_bars = 2;
+        rom_reg = PCI_ROM_ADDRESS1;
+        break;
+
+    default:
+        return -EOPNOTSUPP;
+    }
+
+    /* Setup a handler for the command register. */
+    rc = vpci_add_register(pdev->vpci, vpci_hw_read16, cmd_write, PCI_COMMAND,
+                           2, header);
+    if ( rc )
+        return rc;
+
+    /* Disable memory decoding before sizing. */
+    cmd = pci_conf_read16(pdev->seg, pdev->bus, slot, func, PCI_COMMAND);
+    if ( cmd & PCI_COMMAND_MEMORY )
+        pci_conf_write16(pdev->seg, pdev->bus, slot, func, PCI_COMMAND,
+                         cmd & ~PCI_COMMAND_MEMORY);
+
+    for ( i = 0; i < num_bars; i++ )
+    {
+        uint8_t reg = PCI_BASE_ADDRESS_0 + i * 4;
+        uint32_t val;
+
+        if ( i && bars[i - 1].type == VPCI_BAR_MEM64_LO )
+        {
+            bars[i].type = VPCI_BAR_MEM64_HI;
+            rc = vpci_add_register(pdev->vpci, vpci_hw_read32, bar_write, reg,
+                                   4, &bars[i]);
+            if ( rc )
+            {
+                pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+                                 PCI_COMMAND, cmd);
+                return rc;
+            }
+
+            continue;
+        }
+
+        val = pci_conf_read32(pdev->seg, pdev->bus, slot, func, reg);
+        if ( (val & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO )
+        {
+            bars[i].type = VPCI_BAR_IO;
+            continue;
+        }
+        if ( (val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+             PCI_BASE_ADDRESS_MEM_TYPE_64 )
+            bars[i].type = VPCI_BAR_MEM64_LO;
+        else
+            bars[i].type = VPCI_BAR_MEM32;
+
+        rc = pci_size_mem_bar(sbdf, reg, &addr, &size,
+                              (i == num_bars - 1) ? PCI_BAR_LAST : 0);
+        if ( rc < 0 )
+        {
+            pci_conf_write16(pdev->seg, pdev->bus, slot, func, PCI_COMMAND,
+                             cmd);
+            return rc;
+        }
+
+        if ( size == 0 )
+        {
+            bars[i].type = VPCI_BAR_EMPTY;
+            continue;
+        }
+
+        bars[i].addr = addr;
+        bars[i].size = size;
+        bars[i].prefetchable = val & PCI_BASE_ADDRESS_MEM_PREFETCH;
+
+        rc = vpci_add_register(pdev->vpci, vpci_hw_read32, bar_write, reg, 4,
+                               &bars[i]);
+        if ( rc )
+        {
+            pci_conf_write16(pdev->seg, pdev->bus, slot, func, PCI_COMMAND,
+                             cmd);
+            return rc;
+        }
+    }
+
+    /* Check expansion ROM. */
+    rc = pci_size_mem_bar(sbdf, rom_reg, &addr, &size, PCI_BAR_ROM);
+    if ( rc > 0 && size )
+    {
+        struct vpci_bar *rom = &header->bars[num_bars];
+
+        rom->type = VPCI_BAR_ROM;
+        rom->size = size;
+        rom->addr = addr;
+        header->rom_enabled = pci_conf_read32(pdev->seg, pdev->bus, slot, func,
+                                              rom_reg) & PCI_ROM_ADDRESS_ENABLE;
+
+        rc = vpci_add_register(pdev->vpci, vpci_hw_read32, rom_write, rom_reg,
+                               4, rom);
+        if ( rc )
+            rom->type = VPCI_BAR_EMPTY;
+    }
+
+    if ( cmd & PCI_COMMAND_MEMORY )
+        modify_bars(pdev, true, false);
+
+    return 0;
+}
+REGISTER_VPCI_INIT(init_bars);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 4740d02edf..3da1822901 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -102,6 +102,20 @@ static void vpci_ignored_write(const struct pci_dev *pdev, unsigned int reg,
 {
 }
 
+uint32_t vpci_hw_read16(const struct pci_dev *pdev, unsigned int reg,
+                        void *data)
+{
+    return pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                           PCI_FUNC(pdev->devfn), reg);
+}
+
+uint32_t vpci_hw_read32(const struct pci_dev *pdev, unsigned int reg,
+                        void *data)
+{
+    return pci_conf_read32(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                           PCI_FUNC(pdev->devfn), reg);
+}
+
 int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler,
                       vpci_write_t *write_handler, unsigned int offset,
                       unsigned int size, void *data)
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 2541ecb04f..b6509e8532 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -20,6 +20,7 @@
 #include <xen/smp.h>
 #include <xen/perfc.h>
 #include <asm/atomic.h>
+#include <xen/vpci.h>
 #include <xen/wait.h>
 #include <public/xen.h>
 #include <public/domctl.h>
@@ -264,6 +265,9 @@ struct vcpu
 
     struct evtchn_fifo_vcpu *evtchn_fifo;
 
+    /* vPCI per-vCPU area, used to store data for long running operations. */
+    struct vpci_vcpu vpci;
+
     struct arch_vcpu arch;
 };
 
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 9f2864fb0c..6ae254407f 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -1,6 +1,8 @@
 #ifndef _XEN_VPCI_H_
 #define _XEN_VPCI_H_
 
+#ifdef CONFIG_HAS_PCI
+
 #include <xen/pci.h>
 #include <xen/types.h>
 #include <xen/list.h>
@@ -34,12 +36,65 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size);
 void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
                 uint32_t data);
 
+/* Passthrough handlers. */
+uint32_t vpci_hw_read16(const struct pci_dev *pdev, unsigned int reg,
+                        void *data);
+uint32_t vpci_hw_read32(const struct pci_dev *pdev, unsigned int reg,
+                        void *data);
+
+/*
+ * Check for pending vPCI operations on this vcpu. Returns true if the vcpu
+ * should not run.
+ */
+bool __must_check vpci_process_pending(struct vcpu *v);
+
 struct vpci {
     /* List of vPCI handlers for a device. */
     struct list_head handlers;
     spinlock_t lock;
+
+#ifdef __XEN__
+    /* Hide the rest of the vpci struct from the user-space test harness. */
+    struct vpci_header {
+        /* Information about the PCI BARs of this device. */
+        struct vpci_bar {
+            uint64_t addr;
+            uint64_t size;
+            enum {
+                VPCI_BAR_EMPTY,
+                VPCI_BAR_IO,
+                VPCI_BAR_MEM32,
+                VPCI_BAR_MEM64_LO,
+                VPCI_BAR_MEM64_HI,
+                VPCI_BAR_ROM,
+            } type;
+            bool prefetchable : 1;
+            /* Store whether the BAR is mapped into guest p2m. */
+            bool enabled      : 1;
+        } bars[7]; /* At most 6 BARS + 1 expansion ROM BAR. */
+        /*
+         * Store whether the ROM enable bit is set (doesn't imply ROM BAR
+         * is mapped into guest p2m) if there's a ROM BAR on the device.
+         */
+        bool rom_enabled      : 1;
+        /* FIXME: currently there's no support for SR-IOV. */
+    } header;
+#endif
+};
+
+struct vpci_vcpu {
+    /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
+    struct rangeset *mem;
+    const struct pci_dev *pdev;
+    bool map : 1;
+    bool rom : 1;
 };
 
+#else /* !CONFIG_HAS_PCI */
+struct vpci_vpcu {
+};
+#endif
+
 #endif
 
 /*
-- 
2.15.1


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2018-02-28 12:15 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <5A6750020200009C0431366C@prv-mh.provo.novell.com>
     [not found] ` <5A905C0902000078001AB15D@prv-mh.provo.novell.com>
2018-02-26 11:20   ` [PATCH v8 07/11] vpci/bars: add handlers to map the BARs Jan Beulich
2018-02-26 18:00     ` Roger Pau Monné
2018-02-27  8:32       ` Jan Beulich
2018-02-27  9:21         ` Roger Pau Monné
2018-02-27 10:01           ` Jan Beulich
2018-02-27 11:57             ` Roger Pau Monné
2018-02-27 14:10               ` Jan Beulich
2018-02-28 11:20                 ` Roger Pau Monné
2018-02-28 11:34                   ` Jan Beulich
2018-02-28 12:13                     ` Roger Pau Monné
2018-01-23 15:07 [PATCH v8 00/11] vpci: PCI config space emulation Roger Pau Monne
2018-01-23 15:07 ` [PATCH v8 07/11] vpci/bars: add handlers to map the BARs Roger Pau Monne
2018-01-29 14:01   ` Roger Pau Monné

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.