Re: Graphics pass-through

From: Alex Williamson <alex.williamson@redhat.com>
To: Jan Kiszka <jan.kiszka@siemens.com>
Cc: "André Weidemann" <Andre.Weidemann@web.de>,
	"Prasad Joshi" <prasadjoshi124@gmail.com>,
	kvm@vger.kernel.org,
	"Oswaldo Cadenas" <oswaldo.cadenas@gmail.com>
Subject: Re: Graphics pass-through
Date: Thu, 05 May 2011 09:17:22 -0600	[thread overview]
Message-ID: <1304608642.3081.35.camel@x201> (raw)
In-Reply-To: <4DC264CD.9080700@siemens.com>

Hi Jan,

On Thu, 2011-05-05 at 10:50 +0200, Jan Kiszka wrote:
> Hi Alex,
> 
> On 2011-01-28 01:45, Alex Williamson wrote:
> > On Thu, 2011-01-27 at 12:56 +0100, André Weidemann wrote:
> >> Hi Alex,
> >>
> >> On 26.01.2011 06:12, Alex Williamson wrote:
> >>
> >>> So while your initial results are promising, my guess is that you're
> >>> using card specific drivers and still need to consider some of the
> >>> harder problems with generic support for vga assignment.  I hacked on
> >>> this for a bit trying to see if I could get vga assignment working
> >>> with the vfio driver.  Setting up the legacy access and preventing
> >>> qemu from stealing it back should get you basic vga modes and might
> >>> even allow the option rom to run to initialize the card for pre-boot.
> >>> I was able to get this far on a similar ATI card.  I never hard much
> >>> luck with other cards though, and I was never able to get the vesa
> >>> extensions working.  Thanks,
> >>
> >> Do you mind sharing these patches?
> > 
> > Attached.
> > 
> 
> We are about to try some pass-through with an NVIDA card. So I already
> hacked on your vfio patch to make it build against current devices
> assignment code. Some questions arose while studying the code:

Cool!

> > --- /dev/null
> > +++ b/hw/vfio-vga.c
> > @@ -0,0 +1,291 @@
> > +/*
> > + * vfio VGA device assignment support
> > + *
> > + * Copyright Red Hat, Inc. 2010
> > + *
> > + * Authors:
> > + *  Alex Williamson <alex.williamson@redhat.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Based on qemu-kvm device-assignment:
> > + *  Adapted for KVM by Qumranet.
> > + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> > + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> > + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> > + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> > + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> > + */
> > +
> > +#include <stdio.h>
> > +#include <unistd.h>
> > +#include <sys/io.h>
> > +#include <sys/mman.h>
> > +#include <sys/types.h>
> > +#include <sys/stat.h>
> > +#include "event_notifier.h"
> > +#include "hw.h"
> > +#include "memory.h"
> > +#include "monitor.h"
> > +#include "pc.h"
> > +#include "qemu-error.h"
> > +#include "sysemu.h"
> > +#include "vfio.h"
> > +#include <pci/header.h>
> > +#include <pci/types.h>
> > +#include <linux/types.h>
> > +#include "linux-vfio.h"
> > +
> > +//#define DEBUG_VFIO_VGA
> > +#ifdef DEBUG_VFIO_VGA
> > +#define DPRINTF(fmt, ...) \
> > +    do { printf("vfio-vga: " fmt, ## __VA_ARGS__); } while (0)
> > +#else
> > +#define DPRINTF(fmt, ...) \
> > +    do { } while (0)
> > +#endif
> > +
> > +/*
> > + * VGA setup
> > + */
> > +static void vfio_vga_write(VFIODevice *vdev, uint32_t addr,
> > +                           uint32_t val, int len)
> > +{
> > +    DPRINTF("%s 0x%x %d - 0x%x\n", __func__, 0xa0000 + addr, len, val);
> > +    switch (len) {
> > +        case 1:
> > +            *(uint8_t *)(vdev->vga_mmio + addr) = (uint8_t)val;
> > +            break;
> > +        case 2:
> > +            *(uint16_t *)(vdev->vga_mmio + addr) = (uint16_t)val;
> > +            break;
> > +        case 4:
> > +            *(uint32_t *)(vdev->vga_mmio + addr) = val;
> > +            break;
> > +    }
> > +}
> > +
> > +static void vfio_vga_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> > +{
> > +    vfio_vga_write(opaque, addr, val, 1);
> > +}
> > +
> > +static void vfio_vga_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> > +{
> > +    vfio_vga_write(opaque, addr, val, 2);
> > +}
> > +
> > +static void vfio_vga_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> > +{
> > +    vfio_vga_write(opaque, addr, val, 4);
> > +}
> > +
> > +static CPUWriteMemoryFunc * const vfio_vga_writes[] = {
> > +    &vfio_vga_writeb,
> > +    &vfio_vga_writew,
> > +    &vfio_vga_writel
> > +};
> > +
> > +static uint32_t vfio_vga_read(VFIODevice *vdev, uint32_t addr, int len)
> > +{
> > +    uint32_t val = 0xffffffff;
> > +    switch (len) {
> > +        case 1:
> > +            val = (uint32_t)*(uint8_t *)(vdev->vga_mmio + addr);
> > +            break;
> > +        case 2:
> > +            val = (uint32_t)*(uint16_t *)(vdev->vga_mmio + addr);
> > +            break;
> > +        case 4:
> > +            val = *(uint32_t *)(vdev->vga_mmio + addr);
> > +            break;
> > +    }
> > +    DPRINTF("%s 0x%x %d = 0x%x\n", __func__, 0xa0000 + addr, len, val);
> > +    return val;
> > +}
> > +
> > +static uint32_t vfio_vga_readb(void *opaque, target_phys_addr_t addr)
> > +{
> > +    return vfio_vga_read(opaque, addr, 1);
> > +}
> > +
> > +static uint32_t vfio_vga_readw(void *opaque, target_phys_addr_t addr)
> > +{
> > +    return vfio_vga_read(opaque, addr, 2);
> > +}
> > +
> > +static uint32_t vfio_vga_readl(void *opaque, target_phys_addr_t addr)
> > +{
> > +    return vfio_vga_read(opaque, addr, 4);
> > +}
> > +
> > +static CPUReadMemoryFunc * const vfio_vga_reads[] = {
> > +    &vfio_vga_readb,
> > +    &vfio_vga_readw,
> > +    &vfio_vga_readl
> > +};
> > +
> > +static void vfio_vga_out(VFIODevice *vdev, uint32_t addr, uint32_t val, int len)
> > +{
> > +    DPRINTF("%s 0x%x %d - 0x%x\n", __func__, addr, len, val);
> > +    ioperm(0x3b0, 0x30, 1); /* XXX fix me */
> 
> Why do you have to re-establish the ioperms here on each access? Are we
> just lacking the use of generic kvm ioperm management?

IIRC, setting it up initially wasn't sticking, so I put it here as just
a quick fix to make sure it was set before we used it.  I never fully
made it though debugging why it wasn't working when set earlier.

In general, legacy mmio and ioport needs a better solution.  I wish x86
implemented the legacy io feature of pci sysfs so we could do it that
way, which might also move vga arbitration and chipset vga routing into
the host kernel.

> > +    switch (len) {
> > +        case 1:
> > +            outb(val, addr);
> > +            break;
> > +        case 2:
> > +            outw(val, addr);
> > +            break;
> > +        case 4:
> > +            outl(val, addr);
> > +            break;
> > +    }
> > +}
> > +
> > +static void vfio_vga_outb(void *opaque, uint32_t addr, uint32_t val)
> > +{
> > +    vfio_vga_out(opaque, addr, val, 1);
> > +}
> > +
> > +static void vfio_vga_outw(void *opaque, uint32_t addr, uint32_t val)
> > +{
> > +    vfio_vga_out(opaque, addr, val, 2);
> > +}
> > +
> > +static void vfio_vga_outl(void *opaque, uint32_t addr, uint32_t val)
> > +{
> > +    vfio_vga_out(opaque, addr, val, 4);
> > +}
> > +
> > +static uint32_t vfio_vga_in(VFIODevice *vdev, uint32_t addr, int len)
> > +{
> > +    uint32_t val = 0xffffffff;
> > +    ioperm(0x3b0, 0x30, 1); /* XXX fix me */
> > +    switch (len) {
> > +        case 1:
> > +            val = inb(addr);
> > +            break;
> > +        case 2:
> > +            val = inw(addr);
> > +            break;
> > +        case 4:
> > +            val = inl(addr);
> > +            break;
> > +    }
> > +    DPRINTF("%s 0x%x, %d = 0x%x\n", __func__, addr, len, val);
> > +    return val;
> > +}
> > +
> > +static uint32_t vfio_vga_inb(void *opaque, uint32_t addr)
> > +{
> > +    return vfio_vga_in(opaque, addr, 1);
> > +}
> > +
> > +static uint32_t vfio_vga_inw(void *opaque, uint32_t addr)
> > +{
> > +    return vfio_vga_in(opaque, addr, 2);
> > +}
> > +
> > +static uint32_t vfio_vga_inl(void *opaque, uint32_t addr)
> > +{
> > +    return vfio_vga_in(opaque, addr, 4);
> > +}
> > +
> > +int vfio_vga_setup(VFIODevice *vdev)
> > +{
> > +    char buf[256];
> > +    int ret;
> > +
> > +    if (vga_interface_type != VGA_NONE) {
> > +        fprintf(stderr,
> > +                "VGA devie assigned without -vga none param, no ISA VGA\n");
> > +        return -1;
> > +    }
> > +
> > +    vdev->vga_fd = open("/dev/vga_arbiter", O_RDWR);
> > +    if (vdev->vga_fd < 0) {
> > +        fprintf(stderr, "%s - Failed to open vga arbiter (%s)\n",
> > +                __func__, strerror(errno));
> > +        return -1;
> > +    }
> > +    ret = read(vdev->vga_fd, buf, sizeof(buf));
> > +    if (ret <= 0) {
> > +        fprintf(stderr, "%s - Failed to read from vga arbiter (%s)\n",
> > +                __func__, strerror(errno));
> > +        close(vdev->vga_fd);
> > +        return -1;
> > +    }
> > +    buf[ret - 1] = 0;
> > +    vdev->vga_orig = qemu_strdup(buf);
> > +
> > +    snprintf(buf, sizeof(buf), "target PCI:%04x:%02x:%02x.%x",
> > +             vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
> > +    ret = write(vdev->vga_fd, buf, strlen(buf));
> > +    if (ret != strlen(buf)) {
> > +        fprintf(stderr, "%s - Failed to write to vga arbiter (%s)\n",
> > +                __func__, strerror(errno));
> > +        close(vdev->vga_fd);
> > +        return -1;
> > +    }
> > +    snprintf(buf, sizeof(buf), "decodes io+mem");
> > +    ret = write(vdev->vga_fd, buf, strlen(buf));
> > +    if (ret != strlen(buf)) {
> > +        fprintf(stderr, "%s - Failed to write to vga arbiter (%s)\n",
> > +                __func__, strerror(errno));
> > +        close(vdev->vga_fd);
> > +        return -1;
> > +    }
> 
> OK, so we grab the assigned adapter and make it handle legacy io+mem. I
> guess this approach only works with a single guest with an assigned
> adapter. Would it be possible and not extremely costly to do some
> on-demand grabbing of the range to share it with multiple VMs?

Yes, and that was my intention but never got that far.  Each legacy io
access should switch the arbiter to the necessary device.  Unfortunately
the vga arbiter only works if everyone uses it, and so far it seems like
nobody does.  Obviously some pretty hefty performance implications with
switch on every read.  I'm not sure how that's going to play out.  I
expect once we bootstrap the VGA device and load a real driver, the
legacy areas are seldom used.

> And what about the host? When does Linux release the legacy range?
> Always or only when a specific (!=vga/vesa) framebuffer driver is loaded?

Well, that's where it'd be nice if the vga arbiter was actually in more
widespread use.  It currently seems to be nothing more than a shared
mutex, but it would actually be useful if it included backends to do the
chipset vga routing changes.  I think when I was testing this, I was
externally poking PCI bridge chipset to toggle the VGA_EN bit.

> Is there some other way to pass the legacy accesses from the guest to a
> specific adapter without going via the host's legacy area? I.e. do some
> adapters allow remapping?

Not that I know of on x86.  I wouldn't be surprised if some adapters
just re-route the legacy address ranges to standard PCI mappings, but I
don't know how to figure out if that's true and what the offsets would
be.  I've seen ia64 hardware that supports a _TRA offset such that each
PCI root bridge can support it's own legacy io port space, but that
requires a whole different ioport model.

I believe X.org tries to tackle this by brute force, manually changing
VGA enabled bits on PCI bridges.  I think this is part if why it's
difficult to run multiple X servers on the same system.  Not sure if
that problem has gotten any better since I last looked.

> > +
> > +    vdev->vga_mmio_fd = open("/dev/mem", O_RDWR);
> > +    if (vdev->vga_mmio_fd < 0) {
> > +        fprintf(stderr, "%s - Failed to open /dev/mem (%s)\n",
> > +                __func__, strerror(errno));
> > +        return -1;
> > +    }
> > +    vdev->vga_mmio = mmap(NULL, 0x40000, PROT_READ | PROT_WRITE,
> > +                          MAP_SHARED, vdev->vga_mmio_fd, 0xa0000);
> > +    if (vdev->vga_mmio == MAP_FAILED) {
> > +        fprintf(stderr, "%s - mmap failed (%s)\n", __func__, strerror(errno));
> > +        return -1;
> > +    }
> > +
> > +#if 1
> > +    vdev->vga_io = cpu_register_io_memory(vfio_vga_reads,
> > +                                          vfio_vga_writes, vdev);
> > +    cpu_register_physical_memory(0xa0000, 0x20000, vdev->vga_io);
> > +    qemu_register_coalesced_mmio(0xa0000, 0x20000);
> > +#else
> > +    cpu_register_physical_memory(0xa0000, 0x20000, 
> > +        qemu_ram_map(&vdev->pdev.qdev, "VGA", 0x20000, vdev->vga_mmio));
> > +    qemu_register_coalesced_mmio(0xa0000, 0x20000);
> > +#endif
> 
> To make the second case work, we would have to track the mode switches
> of the guest via legacy VGA interfaces and switch the mapping on the
> fly, right?

Yeah, something like that.   IIRC, I was expecting the second case to
work since I'm doing a static switch of the legacy address space and I
can't recall if it wasn't working or if I used the read/write interface
just so I could add fprintfs to make sure something is happening.
Thanks,

Alex

> > +
> > +    register_ioport_write(0x3b0, 0x30, 1, vfio_vga_outb, vdev);
> > +    register_ioport_write(0x3b0, 0x30, 2, vfio_vga_outw, vdev);
> > +    register_ioport_write(0x3b0, 0x30, 4, vfio_vga_outl, vdev);
> > +    register_ioport_read(0x3b0, 0x30, 1, vfio_vga_inb, vdev);
> > +    register_ioport_read(0x3b0, 0x30, 2, vfio_vga_inw, vdev);
> > +    register_ioport_read(0x3b0, 0x30, 4, vfio_vga_inl, vdev);
> > +    if (ioperm(0x3b0, 0x30, 1)) {
> > +        fprintf(stderr, "%s - ioperm failed (%s)\n", __func__, strerror(errno));
> > +        return -1;
> > +    }
> > +    return 0;
> > +}
> > +
> > +void vfio_vga_exit(VFIODevice *vdev)
> > +{
> > +    if (!vdev->vga_io)
> > +        return;
> > +
> > +    isa_unassign_ioport(0x3b0, 0x30);
> > +    qemu_unregister_coalesced_mmio(0xa0000, 0x20000);
> > +    cpu_register_physical_memory(0xa0000, 0x20000, IO_MEM_UNASSIGNED);
> > +    cpu_unregister_io_memory(vdev->vga_io);
> > +    munmap(vdev->vga_mmio, 0x40000);
> > +    close(vdev->vga_mmio_fd);
> > +    qemu_free(vdev->vga_orig);
> > +    close(vdev->vga_fd);
> > +}
> > +
> 
> Thanks,
> Jan
>