From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([208.118.235.92]:44871) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1SpWql-0007Fj-LG for qemu-devel@nongnu.org; Thu, 12 Jul 2012 23:47:29 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1SpWqj-0000A7-LB for qemu-devel@nongnu.org; Thu, 12 Jul 2012 23:47:27 -0400 Message-ID: <1342151231.2229.101.camel@bling.home> From: Alex Williamson Date: Thu, 12 Jul 2012 21:47:11 -0600 In-Reply-To: <1342083134-28669-1-git-send-email-aik@ozlabs.ru> References: <1341899497-23265-1-git-send-email-aik@ozlabs.ru> <1342083134-28669-1-git-send-email-aik@ozlabs.ru> Content-Type: text/plain; charset="UTF-8" Mime-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [PATCH] RFC: vfio-powerpc: added VFIO support (v2) List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Alexey Kardashevskiy Cc: qemu-devel@nongnu.org, qemu-ppc@nongnu.org, Alexander Graf , David Gibson On Thu, 2012-07-12 at 18:52 +1000, Alexey Kardashevskiy wrote: > It literally does the following: > > 1. POWERPC IOMMU support (the kernel counterpart is required) > > 2. The patch assumes that IOAPIC calls are going to be replaced > with something generic. I have something in my local git but it's > too early, we need to extend PCIINTxRoute first. > > 3. vfio_get_group() made public. I want to open IOMMU group from > the sPAPR code to have everything I need for VFIO on sPAPR and > avoid ugly workarounds with finilizing PHB setup on sPAPR. > > 4. Change sPAPR PHB to scan the PCI bus which is used for > the IOMMU-VFIO group. Now it is enough to add the following to > the QEMU command line to get VFIO up with all the devices from > IOMMU group with id=3: > -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\ > mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000 > > Signed-off-by: Alexey Kardashevskiy > --- > hw/ppc/Makefile.objs | 3 ++ > hw/spapr.h | 4 ++ > hw/spapr_iommu.c | 87 ++++++++++++++++++++++++++++++++++++++ > hw/spapr_pci.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++--- > hw/spapr_pci.h | 5 +++ > hw/vfio_pci.c | 28 +++++++++++- > hw/vfio_pci.h | 2 + > 7 files changed, 237 insertions(+), 7 deletions(-) > > diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs > index f573a95..c46a049 100644 > --- a/hw/ppc/Makefile.objs > +++ b/hw/ppc/Makefile.objs > @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o > # Xilinx PPC peripherals > obj-y += xilinx_ethlite.o > > +# VFIO PCI device assignment > +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o > + > obj-y := $(addprefix ../,$(obj-y)) > diff --git a/hw/spapr.h b/hw/spapr.h > index b37f337..9dca704 100644 > --- a/hw/spapr.h > +++ b/hw/spapr.h > @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname, > int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname, > DMAContext *dma); > > +void spapr_vfio_init_dma(int fd, uint32_t liobn, > + uint64_t *dma32_window_start, > + uint64_t *dma32_window_size); > + > #endif /* !defined (__HW_SPAPR_H__) */ > diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c > index 50c288d..0a194e8 100644 > --- a/hw/spapr_iommu.c > +++ b/hw/spapr_iommu.c > @@ -16,6 +16,8 @@ > * You should have received a copy of the GNU Lesser General Public > * License along with this library; if not, see . > */ > +#include > + > #include "hw.h" > #include "kvm.h" > #include "qdev.h" > @@ -23,6 +25,7 @@ > #include "dma.h" > > #include "hw/spapr.h" > +#include "hw/linux-vfio.h" > > #include > > @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce) > return 0; > } > > +/* -------- API for POWERPC IOMMU -------- */ > + > +#define POWERPC_IOMMU 2 > + > +struct tce_iommu_info { > + __u32 argsz; > + __u32 dma32_window_start; > + __u32 dma32_window_size; > +}; > + > +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) > + > +struct tce_iommu_dma_map { > + __u32 argsz; > + __u64 va; > + __u64 dmaaddr; > +}; > + > +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) > +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) > + > +typedef struct sPAPRVFIOTable { > + int fd; > + uint32_t liobn; > + QLIST_ENTRY(sPAPRVFIOTable) list; > +} sPAPRVFIOTable; > + > +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables; > + > +void spapr_vfio_init_dma(int fd, uint32_t liobn, > + uint64_t *dma32_window_start, > + uint64_t *dma32_window_size) > +{ > + sPAPRVFIOTable *t; > + struct tce_iommu_info info = { .argsz = sizeof(info) }; > + > + if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) { > + fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno); > + return; > + } > + *dma32_window_start = info.dma32_window_start; > + *dma32_window_size = info.dma32_window_size; > + > + t = g_malloc0(sizeof(*t)); > + t->fd = fd; > + t->liobn = liobn; > + > + QLIST_INSERT_HEAD(&vfio_tce_tables, t, list); > +} > + > +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce) > +{ > + sPAPRVFIOTable *t; > + struct tce_iommu_dma_map map = { > + .argsz = sizeof(map), > + .va = 0, > + .dmaaddr = ioba, > + }; > + > + QLIST_FOREACH(t, &vfio_tce_tables, list) { > + if (t->liobn != liobn) { > + continue; > + } > + if (tce) { > + map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK); > + if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) { > + fprintf(stderr, "TCE_MAP_DMA: %d\n", errno); > + return H_PARAMETER; > + } > + } else { > + if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) { > + fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno); > + return H_PARAMETER; > + } > + } > + return H_SUCCESS; > + } > + return H_CONTINUE; /* positive non-zero value */ > +} > + > static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr, > target_ulong opcode, target_ulong *args) > { > @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr, > if (0 >= ret) { > return ret ? H_PARAMETER : H_SUCCESS; > } > + ret = put_tce_vfio(liobn, ioba, tce); > + if (0 >= ret) { > + return ret ? H_PARAMETER : H_SUCCESS; > + } > #ifdef DEBUG_TCE > fprintf(stderr, "%s on liobn=" TARGET_FMT_lx > " ioba 0x" TARGET_FMT_lx " TCE 0x" TARGET_FMT_lx " ret=%d\n", > diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c > index 014297b..92c48b6 100644 > --- a/hw/spapr_pci.c > +++ b/hw/spapr_pci.c > @@ -22,6 +22,9 @@ > * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > * THE SOFTWARE. > */ > +#include > +#include > + > #include "hw.h" > #include "pci.h" > #include "msi.h" > @@ -29,10 +32,10 @@ > #include "pci_host.h" > #include "hw/spapr.h" > #include "hw/spapr_pci.h" > +#include "hw/vfio_pci.h" :-\ > #include "exec-memory.h" > #include > #include "trace.h" > - > #include "hw/pci_internals.h" > > /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */ > @@ -440,6 +443,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level) > level); > } > > +static int pci_spapr_get_irq(void *opaque, int irq_num) > +{ > + sPAPRPHBState *phb = opaque; > + return phb->lsi_table[irq_num].dt_irq; > +} > + > static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr, > unsigned size) > { > @@ -515,6 +524,82 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque, > return phb->dma; > } > > +static int spapr_pci_scan_vfio(sPAPRPHBState *phb) > +{ > + char iommupath[256]; > + DIR *dirp; > + struct dirent *entry; > + > + phb->iommugroup = vfio_get_group(phb->iommugroupid); > + if (!phb->iommugroup) { > + fprintf(stderr, "Cannot open IOMMU group %d\n", phb->iommugroupid); > + return -1; > + } > + > + if (!phb->scan) { > + printf("Autoscan disabled\n"); > + return 0; > + } > + > + sprintf(iommupath, "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid); > + dirp = opendir(iommupath); > + > + while ((entry = readdir(dirp)) != NULL) { > + char *tmp = alloca(strlen(iommupath) + strlen(entry->d_name) + 32); > + FILE *deviceclassfile; > + unsigned deviceclass = 0, domainid, busid, devid, fnid; > + char addr[32]; > + DeviceState *dev; > + > + if (4 != sscanf(entry->d_name, "%X:%X:%X.%x", > + &domainid, &busid, &devid, &fnid)) { > + continue; > + } > + > + sprintf(tmp, "%s%s/class", iommupath, entry->d_name); > + printf("Reading device class from %s\n", tmp); > + > + deviceclassfile = fopen(tmp, "r"); > + if (deviceclassfile) { > + fscanf(deviceclassfile, "%x", &deviceclass); > + fclose(deviceclassfile); > + } > + if (!deviceclass) { > + continue; > + } > +#define PCI_BASE_CLASS_BRIDGE 0x06 > + if ((phb->scan < 2) && ((deviceclass >> 16) == PCI_BASE_CLASS_BRIDGE)) { > + continue; > + } > + if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) { > + /* Tweak USB */ > + phb->force_addr = 1; > + phb->enable_multifunction = 1; > + } > + > + printf("Creating device %X:%X:%X.%x class=0x%X\n", > + domainid, busid, devid, fnid, deviceclass); > + > + dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci"); > + if (!dev) { > + fprintf(stderr, "failed to create vfio-pci\n"); > + continue; > + } > + qdev_prop_parse(dev, "host", entry->d_name); > + if (phb->force_addr) { > + sprintf(addr, "%X.%X", devid, fnid); > + qdev_prop_parse(dev, "addr", addr); > + } > + if (phb->enable_multifunction) { > + qdev_prop_set_bit(dev, "multifunction", 1); > + } > + qdev_init_nofail(dev); > + } > + closedir(dirp); > + > + return 0; > +} > + > static int spapr_phb_init(SysBusDevice *s) > { > sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s); > @@ -567,15 +652,13 @@ static int spapr_phb_init(SysBusDevice *s) > > bus = pci_register_bus(&phb->host_state.busdev.qdev, > phb->busname ? phb->busname : phb->dtbusname, > - pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb, > + pci_spapr_set_irq, pci_spapr_get_irq, > + pci_spapr_map_irq, phb, > &phb->memspace, &phb->iospace, > PCI_DEVFN(0, 0), PCI_NUM_PINS); > phb->host_state.bus = bus; > > phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16); > - phb->dma_window_start = 0; > - phb->dma_window_size = 0x40000000; > - phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size); > pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb); > > QLIST_INSERT_HEAD(&spapr->phbs, phb, list); > @@ -588,6 +671,24 @@ static int spapr_phb_init(SysBusDevice *s) > } > } > > + if (phb->iommugroupid >= 0) { > + if (0 > spapr_pci_scan_vfio(phb)) { > + return -1; > + } > + if (!phb->iommugroup || !phb->iommugroup->container) { > + return -1; > + } > + spapr_vfio_init_dma(phb->iommugroup->container->fd, phb->dma_liobn, I'm not really a fan of this approach, the vfio data structures are not designed for external use. Perhaps they should be pulled into vfio_pci.c. Could we instead just make a group service function, something like: int vfio_group_iommu_ioctl(int iommu_group, int request, ...) Then it could just be a passthrough and you don't have to keep an fd or de-reference vfio structures. There's a little overhead that we have to lookup the group each time, but this is your slow method anyway and how many groups are going to be attached to a single guest. > + &phb->dma_window_start, > + &phb->dma_window_size); > + return 0; > + } > + > + phb->dma_window_start = 0; > + phb->dma_window_size = 0x40000000; > + phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, > + phb->dma_window_size); > + > return 0; > } > > @@ -599,6 +700,10 @@ static Property spapr_phb_properties[] = { > DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0), > DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000), > DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0), > + DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1), > + DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), /* 0 don't 1 +devices 2 +buses */ > + DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0), > + DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0), > DEFINE_PROP_END_OF_LIST(), > }; > > diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h > index 145071c..1953a74 100644 > --- a/hw/spapr_pci.h > +++ b/hw/spapr_pci.h > @@ -57,6 +57,11 @@ typedef struct sPAPRPHBState { > int nvec; > } msi_table[SPAPR_MSIX_MAX_DEVS]; > > + int32_t iommugroupid; > + struct VFIOGroup *iommugroup; > + uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */ > + uint8_t enable_multifunction, force_addr; > + > QLIST_ENTRY(sPAPRPHBState) list; > } sPAPRPHBState; > > diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c > index 1ac287f..73681fb 100644 > --- a/hw/vfio_pci.c > +++ b/hw/vfio_pci.c > @@ -21,7 +21,6 @@ > #include > #include > #include > -#include > #include > #include > #include > @@ -43,6 +42,12 @@ > #include "range.h" > #include "vfio_pci.h" > #include "linux-vfio.h" > +#ifndef TARGET_PPC64 > +#include > +#else > +#include "hw/pci_internals.h" > +#include "hw/spapr.h" > +#endif > > //#define DEBUG_VFIO > #ifdef DEBUG_VFIO > @@ -1581,6 +1586,25 @@ static int vfio_connect_container(VFIOGroup *group) > > memory_listener_register(&container->listener, get_system_memory()); > > +#define POWERPC_IOMMU 2 > + } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) { > + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); > + if (ret) { > + error_report("vfio: failed to set group container: %s\n", > + strerror(errno)); > + g_free(container); > + close(fd); > + return -1; > + } > + > + ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU); > + if (ret) { > + error_report("vfio: failed to set iommu for container: %s\n", > + strerror(errno)); > + g_free(container); > + close(fd); > + return -1; > + } > } else { > error_report("vfio: No available IOMMU models\n"); > g_free(container); > @@ -1620,7 +1644,7 @@ static void vfio_disconnect_container(VFIOGroup *group) > } > } > > -static VFIOGroup *vfio_get_group(int groupid) > +VFIOGroup *vfio_get_group(int groupid) > { > VFIOGroup *group; > char path[32]; > diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h > index 226607c..d63dd63 100644 > --- a/hw/vfio_pci.h > +++ b/hw/vfio_pci.h > @@ -105,4 +105,6 @@ typedef struct VFIOGroup { > #define VFIO_FLAG_IOMMU_SHARED_BIT 0 > #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT) > > +VFIOGroup *vfio_get_group(int groupid); > + > #endif /* __VFIO_H__ */