Re: Graphics pass-through

From: Jan Kiszka <jan.kiszka@siemens.com>
To: Alex Williamson <alex.williamson@redhat.com>
Cc: "André Weidemann" <Andre.Weidemann@web.de>,
	"Prasad Joshi" <prasadjoshi124@gmail.com>,
	kvm@vger.kernel.org,
	"Oswaldo Cadenas" <oswaldo.cadenas@gmail.com>
Subject: Re: Graphics pass-through
Date: Thu, 05 May 2011 10:50:21 +0200	[thread overview]
Message-ID: <4DC264CD.9080700@siemens.com> (raw)
In-Reply-To: <1296175554.2891.29.camel@x201>

Hi Alex,

On 2011-01-28 01:45, Alex Williamson wrote:
> On Thu, 2011-01-27 at 12:56 +0100, André Weidemann wrote:
>> Hi Alex,
>>
>> On 26.01.2011 06:12, Alex Williamson wrote:
>>
>>> So while your initial results are promising, my guess is that you're
>>> using card specific drivers and still need to consider some of the
>>> harder problems with generic support for vga assignment.  I hacked on
>>> this for a bit trying to see if I could get vga assignment working
>>> with the vfio driver.  Setting up the legacy access and preventing
>>> qemu from stealing it back should get you basic vga modes and might
>>> even allow the option rom to run to initialize the card for pre-boot.
>>> I was able to get this far on a similar ATI card.  I never hard much
>>> luck with other cards though, and I was never able to get the vesa
>>> extensions working.  Thanks,
>>
>> Do you mind sharing these patches?
> 
> Attached.
> 

We are about to try some pass-through with an NVIDA card. So I already
hacked on your vfio patch to make it build against current devices
assignment code. Some questions arose while studying the code:

...

> --- /dev/null
> +++ b/hw/vfio-vga.c
> @@ -0,0 +1,291 @@
> +/*
> + * vfio VGA device assignment support
> + *
> + * Copyright Red Hat, Inc. 2010
> + *
> + * Authors:
> + *  Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Based on qemu-kvm device-assignment:
> + *  Adapted for KVM by Qumranet.
> + *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
> + *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
> + *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
> + *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
> + *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
> + */
> +
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <sys/io.h>
> +#include <sys/mman.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include "event_notifier.h"
> +#include "hw.h"
> +#include "memory.h"
> +#include "monitor.h"
> +#include "pc.h"
> +#include "qemu-error.h"
> +#include "sysemu.h"
> +#include "vfio.h"
> +#include <pci/header.h>
> +#include <pci/types.h>
> +#include <linux/types.h>
> +#include "linux-vfio.h"
> +
> +//#define DEBUG_VFIO_VGA
> +#ifdef DEBUG_VFIO_VGA
> +#define DPRINTF(fmt, ...) \
> +    do { printf("vfio-vga: " fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define DPRINTF(fmt, ...) \
> +    do { } while (0)
> +#endif
> +
> +/*
> + * VGA setup
> + */
> +static void vfio_vga_write(VFIODevice *vdev, uint32_t addr,
> +                           uint32_t val, int len)
> +{
> +    DPRINTF("%s 0x%x %d - 0x%x\n", __func__, 0xa0000 + addr, len, val);
> +    switch (len) {
> +        case 1:
> +            *(uint8_t *)(vdev->vga_mmio + addr) = (uint8_t)val;
> +            break;
> +        case 2:
> +            *(uint16_t *)(vdev->vga_mmio + addr) = (uint16_t)val;
> +            break;
> +        case 4:
> +            *(uint32_t *)(vdev->vga_mmio + addr) = val;
> +            break;
> +    }
> +}
> +
> +static void vfio_vga_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    vfio_vga_write(opaque, addr, val, 1);
> +}
> +
> +static void vfio_vga_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    vfio_vga_write(opaque, addr, val, 2);
> +}
> +
> +static void vfio_vga_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
> +{
> +    vfio_vga_write(opaque, addr, val, 4);
> +}
> +
> +static CPUWriteMemoryFunc * const vfio_vga_writes[] = {
> +    &vfio_vga_writeb,
> +    &vfio_vga_writew,
> +    &vfio_vga_writel
> +};
> +
> +static uint32_t vfio_vga_read(VFIODevice *vdev, uint32_t addr, int len)
> +{
> +    uint32_t val = 0xffffffff;
> +    switch (len) {
> +        case 1:
> +            val = (uint32_t)*(uint8_t *)(vdev->vga_mmio + addr);
> +            break;
> +        case 2:
> +            val = (uint32_t)*(uint16_t *)(vdev->vga_mmio + addr);
> +            break;
> +        case 4:
> +            val = *(uint32_t *)(vdev->vga_mmio + addr);
> +            break;
> +    }
> +    DPRINTF("%s 0x%x %d = 0x%x\n", __func__, 0xa0000 + addr, len, val);
> +    return val;
> +}
> +
> +static uint32_t vfio_vga_readb(void *opaque, target_phys_addr_t addr)
> +{
> +    return vfio_vga_read(opaque, addr, 1);
> +}
> +
> +static uint32_t vfio_vga_readw(void *opaque, target_phys_addr_t addr)
> +{
> +    return vfio_vga_read(opaque, addr, 2);
> +}
> +
> +static uint32_t vfio_vga_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    return vfio_vga_read(opaque, addr, 4);
> +}
> +
> +static CPUReadMemoryFunc * const vfio_vga_reads[] = {
> +    &vfio_vga_readb,
> +    &vfio_vga_readw,
> +    &vfio_vga_readl
> +};
> +
> +static void vfio_vga_out(VFIODevice *vdev, uint32_t addr, uint32_t val, int len)
> +{
> +    DPRINTF("%s 0x%x %d - 0x%x\n", __func__, addr, len, val);
> +    ioperm(0x3b0, 0x30, 1); /* XXX fix me */

Why do you have to re-establish the ioperms here on each access? Are we
just lacking the use of generic kvm ioperm management?

> +    switch (len) {
> +        case 1:
> +            outb(val, addr);
> +            break;
> +        case 2:
> +            outw(val, addr);
> +            break;
> +        case 4:
> +            outl(val, addr);
> +            break;
> +    }
> +}
> +
> +static void vfio_vga_outb(void *opaque, uint32_t addr, uint32_t val)
> +{
> +    vfio_vga_out(opaque, addr, val, 1);
> +}
> +
> +static void vfio_vga_outw(void *opaque, uint32_t addr, uint32_t val)
> +{
> +    vfio_vga_out(opaque, addr, val, 2);
> +}
> +
> +static void vfio_vga_outl(void *opaque, uint32_t addr, uint32_t val)
> +{
> +    vfio_vga_out(opaque, addr, val, 4);
> +}
> +
> +static uint32_t vfio_vga_in(VFIODevice *vdev, uint32_t addr, int len)
> +{
> +    uint32_t val = 0xffffffff;
> +    ioperm(0x3b0, 0x30, 1); /* XXX fix me */
> +    switch (len) {
> +        case 1:
> +            val = inb(addr);
> +            break;
> +        case 2:
> +            val = inw(addr);
> +            break;
> +        case 4:
> +            val = inl(addr);
> +            break;
> +    }
> +    DPRINTF("%s 0x%x, %d = 0x%x\n", __func__, addr, len, val);
> +    return val;
> +}
> +
> +static uint32_t vfio_vga_inb(void *opaque, uint32_t addr)
> +{
> +    return vfio_vga_in(opaque, addr, 1);
> +}
> +
> +static uint32_t vfio_vga_inw(void *opaque, uint32_t addr)
> +{
> +    return vfio_vga_in(opaque, addr, 2);
> +}
> +
> +static uint32_t vfio_vga_inl(void *opaque, uint32_t addr)
> +{
> +    return vfio_vga_in(opaque, addr, 4);
> +}
> +
> +int vfio_vga_setup(VFIODevice *vdev)
> +{
> +    char buf[256];
> +    int ret;
> +
> +    if (vga_interface_type != VGA_NONE) {
> +        fprintf(stderr,
> +                "VGA devie assigned without -vga none param, no ISA VGA\n");
> +        return -1;
> +    }
> +
> +    vdev->vga_fd = open("/dev/vga_arbiter", O_RDWR);
> +    if (vdev->vga_fd < 0) {
> +        fprintf(stderr, "%s - Failed to open vga arbiter (%s)\n",
> +                __func__, strerror(errno));
> +        return -1;
> +    }
> +    ret = read(vdev->vga_fd, buf, sizeof(buf));
> +    if (ret <= 0) {
> +        fprintf(stderr, "%s - Failed to read from vga arbiter (%s)\n",
> +                __func__, strerror(errno));
> +        close(vdev->vga_fd);
> +        return -1;
> +    }
> +    buf[ret - 1] = 0;
> +    vdev->vga_orig = qemu_strdup(buf);
> +
> +    snprintf(buf, sizeof(buf), "target PCI:%04x:%02x:%02x.%x",
> +             vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
> +    ret = write(vdev->vga_fd, buf, strlen(buf));
> +    if (ret != strlen(buf)) {
> +        fprintf(stderr, "%s - Failed to write to vga arbiter (%s)\n",
> +                __func__, strerror(errno));
> +        close(vdev->vga_fd);
> +        return -1;
> +    }
> +    snprintf(buf, sizeof(buf), "decodes io+mem");
> +    ret = write(vdev->vga_fd, buf, strlen(buf));
> +    if (ret != strlen(buf)) {
> +        fprintf(stderr, "%s - Failed to write to vga arbiter (%s)\n",
> +                __func__, strerror(errno));
> +        close(vdev->vga_fd);
> +        return -1;
> +    }

OK, so we grab the assigned adapter and make it handle legacy io+mem. I
guess this approach only works with a single guest with an assigned
adapter. Would it be possible and not extremely costly to do some
on-demand grabbing of the range to share it with multiple VMs?

And what about the host? When does Linux release the legacy range?
Always or only when a specific (!=vga/vesa) framebuffer driver is loaded?

Is there some other way to pass the legacy accesses from the guest to a
specific adapter without going via the host's legacy area? I.e. do some
adapters allow remapping?

> +
> +    vdev->vga_mmio_fd = open("/dev/mem", O_RDWR);
> +    if (vdev->vga_mmio_fd < 0) {
> +        fprintf(stderr, "%s - Failed to open /dev/mem (%s)\n",
> +                __func__, strerror(errno));
> +        return -1;
> +    }
> +    vdev->vga_mmio = mmap(NULL, 0x40000, PROT_READ | PROT_WRITE,
> +                          MAP_SHARED, vdev->vga_mmio_fd, 0xa0000);
> +    if (vdev->vga_mmio == MAP_FAILED) {
> +        fprintf(stderr, "%s - mmap failed (%s)\n", __func__, strerror(errno));
> +        return -1;
> +    }
> +
> +#if 1
> +    vdev->vga_io = cpu_register_io_memory(vfio_vga_reads,
> +                                          vfio_vga_writes, vdev);
> +    cpu_register_physical_memory(0xa0000, 0x20000, vdev->vga_io);
> +    qemu_register_coalesced_mmio(0xa0000, 0x20000);
> +#else
> +    cpu_register_physical_memory(0xa0000, 0x20000, 
> +        qemu_ram_map(&vdev->pdev.qdev, "VGA", 0x20000, vdev->vga_mmio));
> +    qemu_register_coalesced_mmio(0xa0000, 0x20000);
> +#endif

To make the second case work, we would have to track the mode switches
of the guest via legacy VGA interfaces and switch the mapping on the
fly, right?

> +
> +    register_ioport_write(0x3b0, 0x30, 1, vfio_vga_outb, vdev);
> +    register_ioport_write(0x3b0, 0x30, 2, vfio_vga_outw, vdev);
> +    register_ioport_write(0x3b0, 0x30, 4, vfio_vga_outl, vdev);
> +    register_ioport_read(0x3b0, 0x30, 1, vfio_vga_inb, vdev);
> +    register_ioport_read(0x3b0, 0x30, 2, vfio_vga_inw, vdev);
> +    register_ioport_read(0x3b0, 0x30, 4, vfio_vga_inl, vdev);
> +    if (ioperm(0x3b0, 0x30, 1)) {
> +        fprintf(stderr, "%s - ioperm failed (%s)\n", __func__, strerror(errno));
> +        return -1;
> +    }
> +    return 0;
> +}
> +
> +void vfio_vga_exit(VFIODevice *vdev)
> +{
> +    if (!vdev->vga_io)
> +        return;
> +
> +    isa_unassign_ioport(0x3b0, 0x30);
> +    qemu_unregister_coalesced_mmio(0xa0000, 0x20000);
> +    cpu_register_physical_memory(0xa0000, 0x20000, IO_MEM_UNASSIGNED);
> +    cpu_unregister_io_memory(vdev->vga_io);
> +    munmap(vdev->vga_mmio, 0x40000);
> +    close(vdev->vga_mmio_fd);
> +    qemu_free(vdev->vga_orig);
> +    close(vdev->vga_fd);
> +}
> +

Thanks,
Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux