From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Michael S. Tsirkin" Subject: Re: [PATCH 2/7] pci: memory access API and IOMMU support Date: Thu, 2 Sep 2010 08:28:26 +0300 Message-ID: <20100902052826.GB4273@redhat.com> References: <1283007298-10942-1-git-send-email-eduard.munteanu@linux360.ro> <1283007298-10942-3-git-send-email-eduard.munteanu@linux360.ro> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Cc: joro@8bytes.org, blauwirbel@gmail.com, paul@codesourcery.com, avi@redhat.com, anthony@codemonkey.ws, av1474@comtv.ru, yamahata@valinux.co.jp, kvm@vger.kernel.org, qemu-devel@nongnu.org To: Eduard - Gabriel Munteanu Return-path: Received: from mx1.redhat.com ([209.132.183.28]:56182 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750739Ab0IBFex (ORCPT ); Thu, 2 Sep 2010 01:34:53 -0400 Content-Disposition: inline In-Reply-To: <1283007298-10942-3-git-send-email-eduard.munteanu@linux360.ro> Sender: kvm-owner@vger.kernel.org List-ID: On Sat, Aug 28, 2010 at 05:54:53PM +0300, Eduard - Gabriel Munteanu wrote: > PCI devices should access memory through pci_memory_*() instead of > cpu_physical_memory_*(). This also provides support for translation and > access checking in case an IOMMU is emulated. > > Memory maps are treated as remote IOTLBs (that is, translation caches > belonging to the IOMMU-aware device itself). Clients (devices) must > provide callbacks for map invalidation in case these maps are > persistent beyond the current I/O context, e.g. AIO DMA transfers. > > Signed-off-by: Eduard - Gabriel Munteanu I am concerned about adding more pointer chaising on data path. Could we have 1. an iommu pointer in a device, inherited by secondary buses when they are created and by devices from buses when they are attached. 2. translation pointer in the iommu instead of the bus 3. pci_memory_XX functions inline, doing fast path for non-iommu case: if (__builtin_expect(!dev->iommu, 1) return cpu_memory_rw > --- > hw/pci.c | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++- > hw/pci.h | 74 +++++++++++++++++++++ > hw/pci_internals.h | 12 ++++ > qemu-common.h | 1 + > 4 files changed, 271 insertions(+), 1 deletions(-) Almost nothing here is PCI specific. Can this code go into dma.c/dma.h? We would have struct DMADevice, APIs like device_dma_write etc. This would help us get rid of the void * stuff as well? > > diff --git a/hw/pci.c b/hw/pci.c > index 2dc1577..b460905 100644 > --- a/hw/pci.c > +++ b/hw/pci.c > @@ -158,6 +158,19 @@ static void pci_device_reset(PCIDevice *dev) > pci_update_mappings(dev); > } > > +static int pci_no_translate(PCIDevice *iommu, > + PCIDevice *dev, > + pcibus_t addr, > + target_phys_addr_t *paddr, > + target_phys_addr_t *len, > + unsigned perms) > +{ > + *paddr = addr; > + *len = -1; > + > + return 0; > +} > + > static void pci_bus_reset(void *opaque) > { > PCIBus *bus = opaque; > @@ -220,7 +233,10 @@ void pci_bus_new_inplace(PCIBus *bus, DeviceState *parent, > { > qbus_create_inplace(&bus->qbus, &pci_bus_info, parent, name); > assert(PCI_FUNC(devfn_min) == 0); > - bus->devfn_min = devfn_min; > + > + bus->devfn_min = devfn_min; > + bus->iommu = NULL; > + bus->translate = pci_no_translate; > > /* host bridge */ > QLIST_INIT(&bus->child); > @@ -1789,3 +1805,170 @@ static char *pcibus_get_dev_path(DeviceState *dev) > return strdup(path); > } > > +void pci_register_iommu(PCIDevice *iommu, > + PCITranslateFunc *translate) > +{ > + iommu->bus->iommu = iommu; > + iommu->bus->translate = translate; > +} > + The above seems broken for secondary buses, right? Also, can we use qdev for initialization in some way, instead of adding more APIs? E.g. I think it would be nice if we could just use qdev command line flags to control which bus is behind iommu and which isn't. > +void pci_memory_rw(PCIDevice *dev, > + pcibus_t addr, > + uint8_t *buf, > + pcibus_t len, > + int is_write) > +{ > + int err; > + unsigned perms; > + PCIDevice *iommu = dev->bus->iommu; > + target_phys_addr_t paddr, plen; > + > + perms = is_write ? IOMMU_PERM_WRITE : IOMMU_PERM_READ; > + > + while (len) { > + err = dev->bus->translate(iommu, dev, addr, &paddr, &plen, perms); > + if (err) > + return; > + > + /* The translation might be valid for larger regions. */ > + if (plen > len) > + plen = len; > + > + cpu_physical_memory_rw(paddr, buf, plen, is_write); > + > + len -= plen; > + addr += plen; > + buf += plen; > + } > +} > + > +static void pci_memory_register_map(PCIDevice *dev, > + pcibus_t addr, > + pcibus_t len, > + target_phys_addr_t paddr, > + PCIInvalidateMapFunc *invalidate, > + void *invalidate_opaque) > +{ > + PCIMemoryMap *map; > + > + map = qemu_malloc(sizeof(PCIMemoryMap)); > + map->addr = addr; > + map->len = len; > + map->paddr = paddr; > + map->invalidate = invalidate; > + map->invalidate_opaque = invalidate_opaque; > + > + QLIST_INSERT_HEAD(&dev->memory_maps, map, list); > +} > + > +static void pci_memory_unregister_map(PCIDevice *dev, > + target_phys_addr_t paddr, > + target_phys_addr_t len) > +{ > + PCIMemoryMap *map; > + > + QLIST_FOREACH(map, &dev->memory_maps, list) { > + if (map->paddr == paddr && map->len == len) { > + QLIST_REMOVE(map, list); > + free(map); > + } > + } > +} > + > +void pci_memory_invalidate_range(PCIDevice *dev, > + pcibus_t addr, > + pcibus_t len) > +{ > + PCIMemoryMap *map; > + > + QLIST_FOREACH(map, &dev->memory_maps, list) { > + if (ranges_overlap(addr, len, map->addr, map->len)) { > + map->invalidate(map->invalidate_opaque); > + QLIST_REMOVE(map, list); > + free(map); > + } > + } > +} > + > +void *pci_memory_map(PCIDevice *dev, > + PCIInvalidateMapFunc *cb, > + void *opaque, > + pcibus_t addr, > + target_phys_addr_t *len, > + int is_write) > +{ > + int err; > + unsigned perms; > + PCIDevice *iommu = dev->bus->iommu; > + target_phys_addr_t paddr, plen; > + > + perms = is_write ? IOMMU_PERM_WRITE : IOMMU_PERM_READ; > + > + plen = *len; > + err = dev->bus->translate(iommu, dev, addr, &paddr, &plen, perms); > + if (err) > + return NULL; > + > + /* > + * If this is true, the virtual region is contiguous, > + * but the translated physical region isn't. We just > + * clamp *len, much like cpu_physical_memory_map() does. > + */ > + if (plen < *len) > + *len = plen; > + > + /* We treat maps as remote TLBs to cope with stuff like AIO. */ > + if (cb) > + pci_memory_register_map(dev, addr, *len, paddr, cb, opaque); > + > + return cpu_physical_memory_map(paddr, len, is_write); > +} > + All the above is really only useful for when there is an iommu, right? So maybe we should shortcut all this if there's no iommu? > +void pci_memory_unmap(PCIDevice *dev, > + void *buffer, > + target_phys_addr_t len, > + int is_write, > + target_phys_addr_t access_len) > +{ > + cpu_physical_memory_unmap(buffer, len, is_write, access_len); > + pci_memory_unregister_map(dev, (target_phys_addr_t) buffer, len); > +} > + > +#define DEFINE_PCI_LD(suffix, size) \ > +uint##size##_t pci_ld##suffix(PCIDevice *dev, pcibus_t addr) \ > +{ \ > + int err; \ > + target_phys_addr_t paddr, plen; \ > + \ > + err = dev->bus->translate(dev->bus->iommu, dev, \ > + addr, &paddr, &plen, IOMMU_PERM_READ); \ > + if (err || (plen < size / 8)) \ > + return 0; \ > + \ > + return ld##suffix##_phys(paddr); \ > +} > + > +#define DEFINE_PCI_ST(suffix, size) \ > +void pci_st##suffix(PCIDevice *dev, pcibus_t addr, uint##size##_t val) \ > +{ \ > + int err; \ > + target_phys_addr_t paddr, plen; \ > + \ > + err = dev->bus->translate(dev->bus->iommu, dev, \ > + addr, &paddr, &plen, IOMMU_PERM_WRITE); \ > + if (err || (plen < size / 8)) \ > + return; \ > + \ > + st##suffix##_phys(paddr, val); \ > +} > + > +DEFINE_PCI_LD(ub, 8) > +DEFINE_PCI_LD(uw, 16) > +DEFINE_PCI_LD(l, 32) > +DEFINE_PCI_LD(q, 64) > + > +DEFINE_PCI_ST(b, 8) > +DEFINE_PCI_ST(w, 16) > +DEFINE_PCI_ST(l, 32) > +DEFINE_PCI_ST(q, 64) > + > diff --git a/hw/pci.h b/hw/pci.h > index c551f96..3131016 100644 > --- a/hw/pci.h > +++ b/hw/pci.h > @@ -172,6 +172,8 @@ struct PCIDevice { > char *romfile; > ram_addr_t rom_offset; > uint32_t rom_bar; > + > + QLIST_HEAD(, PCIMemoryMap) memory_maps; > }; > > PCIDevice *pci_register_device(PCIBus *bus, const char *name, > @@ -391,4 +393,76 @@ static inline int ranges_overlap(uint64_t first1, uint64_t len1, > return !(last2 < first1 || last1 < first2); > } > > +/* > + * Memory I/O and PCI IOMMU definitions. > + */ > + > +#define IOMMU_PERM_READ (1 << 0) > +#define IOMMU_PERM_WRITE (1 << 1) > +#define IOMMU_PERM_RW (IOMMU_PERM_READ | IOMMU_PERM_WRITE) > + > +typedef int PCIInvalidateMapFunc(void *opaque); > +typedef int PCITranslateFunc(PCIDevice *iommu, > + PCIDevice *dev, > + pcibus_t addr, > + target_phys_addr_t *paddr, > + target_phys_addr_t *len, > + unsigned perms); > + > +extern void pci_memory_rw(PCIDevice *dev, > + pcibus_t addr, > + uint8_t *buf, > + pcibus_t len, > + int is_write); > +extern void *pci_memory_map(PCIDevice *dev, > + PCIInvalidateMapFunc *cb, > + void *opaque, > + pcibus_t addr, > + target_phys_addr_t *len, > + int is_write); > +extern void pci_memory_unmap(PCIDevice *dev, > + void *buffer, > + target_phys_addr_t len, > + int is_write, > + target_phys_addr_t access_len); > +extern void pci_register_iommu(PCIDevice *dev, > + PCITranslateFunc *translate); > +extern void pci_memory_invalidate_range(PCIDevice *dev, > + pcibus_t addr, > + pcibus_t len); > + > +#define DECLARE_PCI_LD(suffix, size) \ > +extern uint##size##_t pci_ld##suffix(PCIDevice *dev, pcibus_t addr); > + > +#define DECLARE_PCI_ST(suffix, size) \ > +extern void pci_st##suffix(PCIDevice *dev, \ > + pcibus_t addr, \ > + uint##size##_t val); > + > +DECLARE_PCI_LD(ub, 8) > +DECLARE_PCI_LD(uw, 16) > +DECLARE_PCI_LD(l, 32) > +DECLARE_PCI_LD(q, 64) > + > +DECLARE_PCI_ST(b, 8) > +DECLARE_PCI_ST(w, 16) > +DECLARE_PCI_ST(l, 32) > +DECLARE_PCI_ST(q, 64) > + > +static inline void pci_memory_read(PCIDevice *dev, > + pcibus_t addr, > + uint8_t *buf, > + pcibus_t len) > +{ > + pci_memory_rw(dev, addr, buf, len, 0); > +} > + > +static inline void pci_memory_write(PCIDevice *dev, > + pcibus_t addr, > + const uint8_t *buf, > + pcibus_t len) > +{ > + pci_memory_rw(dev, addr, (uint8_t *) buf, len, 1); > +} > + > #endif > diff --git a/hw/pci_internals.h b/hw/pci_internals.h > index e3c93a3..fb134b9 100644 > --- a/hw/pci_internals.h > +++ b/hw/pci_internals.h > @@ -33,6 +33,9 @@ struct PCIBus { > Keep a count of the number of devices with raised IRQs. */ > int nirq; > int *irq_count; > + > + PCIDevice *iommu; > + PCITranslateFunc *translate; > }; Why is translate pointer in a bus? I think it's a work of an iommu? > struct PCIBridge { > @@ -44,4 +47,13 @@ struct PCIBridge { > const char *bus_name; > }; > > +struct PCIMemoryMap { > + pcibus_t addr; > + pcibus_t len; > + target_phys_addr_t paddr; > + PCIInvalidateMapFunc *invalidate; > + void *invalidate_opaque; Can we have a structure that encapsulates the mapping data instead of a void *? > + QLIST_ENTRY(PCIMemoryMap) list; > +}; > + > #endif /* QEMU_PCI_INTERNALS_H */ > diff --git a/qemu-common.h b/qemu-common.h > index d735235..8b060e8 100644 > --- a/qemu-common.h > +++ b/qemu-common.h > @@ -218,6 +218,7 @@ typedef struct SMBusDevice SMBusDevice; > typedef struct PCIHostState PCIHostState; > typedef struct PCIExpressHost PCIExpressHost; > typedef struct PCIBus PCIBus; > +typedef struct PCIMemoryMap PCIMemoryMap; > typedef struct PCIDevice PCIDevice; > typedef struct PCIBridge PCIBridge; > typedef struct SerialState SerialState; > -- > 1.7.1 > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html