From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751308AbcFVWEM (ORCPT ); Wed, 22 Jun 2016 18:04:12 -0400 Received: from mx1.redhat.com ([209.132.183.28]:59705 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750916AbcFVWEK (ORCPT ); Wed, 22 Jun 2016 18:04:10 -0400 Date: Wed, 22 Jun 2016 16:04:06 -0600 From: Alex Williamson To: Yongji Xie Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org, bhelgaas@google.com, aik@ozlabs.ru, benh@kernel.crashing.org, paulus@samba.org, mpe@ellerman.id.au, warrier@linux.vnet.ibm.com, zhong@linux.vnet.ibm.com, nikunj@linux.vnet.ibm.com, gwshan@linux.vnet.ibm.com, kevin.tian@intel.com Subject: Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive Message-ID: <20160622160406.68da3395@t450s.home> In-Reply-To: <201605301311.u4UD99he028186@mx0a-001b2d01.pphosted.com> References: <201605301311.u4UD99he028186@mx0a-001b2d01.pphosted.com> MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.38]); Wed, 22 Jun 2016 22:04:09 +0000 (UTC) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Mon, 30 May 2016 21:06:37 +0800 Yongji Xie wrote: > Current vfio-pci implementation disallows to mmap > sub-page(size < PAGE_SIZE) MMIO BARs because these BARs' mmio > page may be shared with other BARs. This will cause some > performance issues when we passthrough a PCI device with > this kind of BARs. Guest will be not able to handle the mmio > accesses to the BARs which leads to mmio emulations in host. > > However, not all sub-page BARs will share page with other BARs. > We should allow to mmap the sub-page MMIO BARs which we can > make sure will not share page with other BARs. > > This patch adds support for this case. And we try to add a > dummy resource to reserve the remainder of the page which > hot-add device's BAR might be assigned into. But it's not > necessary to handle the case when the BAR is not page aligned. > Because we can't expect the BAR will be assigned into the same > location in a page in guest when we passthrough the BAR. And > it's hard to access this BAR in userspace because we have > no way to get the BAR's location in a page. > > Signed-off-by: Yongji Xie > --- Hi Yongji, On 5/22, message-id <201605230345.u4N3dJip043323@mx0a-001b2d01.pphosted.com> you indicated you'd post the QEMU code which is enabled by this patch "soon". Have I missed that? I'm still waiting to see it. Thanks, Alex > drivers/vfio/pci/vfio_pci.c | 87 ++++++++++++++++++++++++++++++++--- > drivers/vfio/pci/vfio_pci_private.h | 8 ++++ > 2 files changed, 89 insertions(+), 6 deletions(-) > > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > index 188b1ff..3cca2a7 100644 > --- a/drivers/vfio/pci/vfio_pci.c > +++ b/drivers/vfio/pci/vfio_pci.c > @@ -110,6 +110,73 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) > return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; > } > > +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) > +{ > + struct resource *res; > + int bar; > + struct vfio_pci_dummy_resource *dummy_res; > + > + INIT_LIST_HEAD(&vdev->dummy_resources_list); > + > + for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { > + res = vdev->pdev->resource + bar; > + > + if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) > + goto no_mmap; > + > + if (!(res->flags & IORESOURCE_MEM)) > + goto no_mmap; > + > + /* > + * The PCI core shouldn't set up a resource with a > + * type but zero size. But there may be bugs that > + * cause us to do that. > + */ > + if (!resource_size(res)) > + goto no_mmap; > + > + if (resource_size(res) >= PAGE_SIZE) { > + vdev->bar_mmap_supported[bar] = true; > + continue; > + } > + > + if (!(res->start & ~PAGE_MASK)) { > + /* > + * Add a dummy resource to reserve the remainder > + * of the exclusive page in case that hot-add > + * device's bar is assigned into it. > + */ > + dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL); > + if (dummy_res == NULL) > + goto no_mmap; > + > + dummy_res->resource.start = res->end + 1; > + dummy_res->resource.end = res->start + PAGE_SIZE - 1; > + dummy_res->resource.flags = res->flags; > + if (request_resource(res->parent, > + &dummy_res->resource)) { > + kfree(dummy_res); > + goto no_mmap; > + } > + dummy_res->index = bar; > + list_add(&dummy_res->res_next, > + &vdev->dummy_resources_list); > + vdev->bar_mmap_supported[bar] = true; > + continue; > + } > + /* > + * Here we don't handle the case when the BAR is not page > + * aligned because we can't expect the BAR will be > + * assigned into the same location in a page in guest > + * when we passthrough the BAR. And it's hard to access > + * this BAR in userspace because we have no way to get > + * the BAR's location in a page. > + */ > +no_mmap: > + vdev->bar_mmap_supported[bar] = false; > + } > +} > + > static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); > static void vfio_pci_disable(struct vfio_pci_device *vdev); > > @@ -218,12 +285,15 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) > } > } > > + vfio_pci_probe_mmaps(vdev); > + > return 0; > } > > static void vfio_pci_disable(struct vfio_pci_device *vdev) > { > struct pci_dev *pdev = vdev->pdev; > + struct vfio_pci_dummy_resource *dummy_res, *tmp; > int i, bar; > > /* Stop the device from further DMA */ > @@ -252,6 +322,13 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) > vdev->barmap[bar] = NULL; > } > > + list_for_each_entry_safe(dummy_res, tmp, > + &vdev->dummy_resources_list, res_next) { > + list_del(&dummy_res->res_next); > + release_resource(&dummy_res->resource); > + kfree(dummy_res); > + } > + > vdev->needs_reset = true; > > /* > @@ -623,9 +700,7 @@ static long vfio_pci_ioctl(void *device_data, > > info.flags = VFIO_REGION_INFO_FLAG_READ | > VFIO_REGION_INFO_FLAG_WRITE; > - if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) && > - pci_resource_flags(pdev, info.index) & > - IORESOURCE_MEM && info.size >= PAGE_SIZE) { > + if (vdev->bar_mmap_supported[info.index]) { > info.flags |= VFIO_REGION_INFO_FLAG_MMAP; > if (info.index == vdev->msix_bar) { > ret = msix_sparse_mmap_cap(vdev, &caps); > @@ -1049,16 +1124,16 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) > return -EINVAL; > if (index >= VFIO_PCI_ROM_REGION_INDEX) > return -EINVAL; > - if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM)) > + if (!vdev->bar_mmap_supported[index]) > return -EINVAL; > > - phys_len = pci_resource_len(pdev, index); > + phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); > req_len = vma->vm_end - vma->vm_start; > pgoff = vma->vm_pgoff & > ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); > req_start = pgoff << PAGE_SHIFT; > > - if (phys_len < PAGE_SIZE || req_start + req_len > phys_len) > + if (req_start + req_len > phys_len) > return -EINVAL; > > if (index == vdev->msix_bar) { > diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h > index 016c14a..2128de8 100644 > --- a/drivers/vfio/pci/vfio_pci_private.h > +++ b/drivers/vfio/pci/vfio_pci_private.h > @@ -57,9 +57,16 @@ struct vfio_pci_region { > u32 flags; > }; > > +struct vfio_pci_dummy_resource { > + struct resource resource; > + int index; > + struct list_head res_next; > +}; > + > struct vfio_pci_device { > struct pci_dev *pdev; > void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; > + bool bar_mmap_supported[PCI_STD_RESOURCE_END + 1]; > u8 *pci_config_map; > u8 *vconfig; > struct perm_bits *msi_perm; > @@ -88,6 +95,7 @@ struct vfio_pci_device { > int refcnt; > struct eventfd_ctx *err_trigger; > struct eventfd_ctx *req_trigger; > + struct list_head dummy_resources_list; > }; > > #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)