From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751691AbcFWCy7 (ORCPT ); Wed, 22 Jun 2016 22:54:59 -0400 Received: from mx1.redhat.com ([209.132.183.28]:33899 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750959AbcFWCy5 (ORCPT ); Wed, 22 Jun 2016 22:54:57 -0400 Date: Wed, 22 Jun 2016 20:54:54 -0600 From: Alex Williamson To: Yongji Xie Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org, bhelgaas@google.com, aik@ozlabs.ru, benh@kernel.crashing.org, paulus@samba.org, mpe@ellerman.id.au, warrier@linux.vnet.ibm.com, zhong@linux.vnet.ibm.com, nikunj@linux.vnet.ibm.com, gwshan@linux.vnet.ibm.com, kevin.tian@intel.com Subject: Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive Message-ID: <20160622205454.1d07311c@t450s.home> In-Reply-To: References: <201605301311.u4UD99he028186@mx0a-001b2d01.pphosted.com> <20160622160406.68da3395@t450s.home> MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.25]); Thu, 23 Jun 2016 02:54:56 +0000 (UTC) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Thu, 23 Jun 2016 10:39:30 +0800 Yongji Xie wrote: > Hi, Alex > > On 2016/6/23 6:04, Alex Williamson wrote: > > > On Mon, 30 May 2016 21:06:37 +0800 > > Yongji Xie wrote: > > > >> Current vfio-pci implementation disallows to mmap > >> sub-page(size < PAGE_SIZE) MMIO BARs because these BARs' mmio > >> page may be shared with other BARs. This will cause some > >> performance issues when we passthrough a PCI device with > >> this kind of BARs. Guest will be not able to handle the mmio > >> accesses to the BARs which leads to mmio emulations in host. > >> > >> However, not all sub-page BARs will share page with other BARs. > >> We should allow to mmap the sub-page MMIO BARs which we can > >> make sure will not share page with other BARs. > >> > >> This patch adds support for this case. And we try to add a > >> dummy resource to reserve the remainder of the page which > >> hot-add device's BAR might be assigned into. But it's not > >> necessary to handle the case when the BAR is not page aligned. > >> Because we can't expect the BAR will be assigned into the same > >> location in a page in guest when we passthrough the BAR. And > >> it's hard to access this BAR in userspace because we have > >> no way to get the BAR's location in a page. > >> > >> Signed-off-by: Yongji Xie > >> --- > > Hi Yongji, > > > > On 5/22, message-id > > <201605230345.u4N3dJip043323@mx0a-001b2d01.pphosted.com> you indicated > > you'd post the QEMU code which is enabled by this patch "soon". Have I > > missed that? I'm still waiting to see it. Thanks, > > > > Alex > > I posted it on May 24th [1]. Do I need to resend it? > > [1] http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg04125.html I found it. Thanks, Alex > >> drivers/vfio/pci/vfio_pci.c | 87 ++++++++++++++++++++++++++++++++--- > >> drivers/vfio/pci/vfio_pci_private.h | 8 ++++ > >> 2 files changed, 89 insertions(+), 6 deletions(-) > >> > >> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > >> index 188b1ff..3cca2a7 100644 > >> --- a/drivers/vfio/pci/vfio_pci.c > >> +++ b/drivers/vfio/pci/vfio_pci.c > >> @@ -110,6 +110,73 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) > >> return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; > >> } > >> > >> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) > >> +{ > >> + struct resource *res; > >> + int bar; > >> + struct vfio_pci_dummy_resource *dummy_res; > >> + > >> + INIT_LIST_HEAD(&vdev->dummy_resources_list); > >> + > >> + for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { > >> + res = vdev->pdev->resource + bar; > >> + > >> + if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) > >> + goto no_mmap; > >> + > >> + if (!(res->flags & IORESOURCE_MEM)) > >> + goto no_mmap; > >> + > >> + /* > >> + * The PCI core shouldn't set up a resource with a > >> + * type but zero size. But there may be bugs that > >> + * cause us to do that. > >> + */ > >> + if (!resource_size(res)) > >> + goto no_mmap; > >> + > >> + if (resource_size(res) >= PAGE_SIZE) { > >> + vdev->bar_mmap_supported[bar] = true; > >> + continue; > >> + } > >> + > >> + if (!(res->start & ~PAGE_MASK)) { > >> + /* > >> + * Add a dummy resource to reserve the remainder > >> + * of the exclusive page in case that hot-add > >> + * device's bar is assigned into it. > >> + */ > >> + dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL); > >> + if (dummy_res == NULL) > >> + goto no_mmap; > >> + > >> + dummy_res->resource.start = res->end + 1; > >> + dummy_res->resource.end = res->start + PAGE_SIZE - 1; > >> + dummy_res->resource.flags = res->flags; > >> + if (request_resource(res->parent, > >> + &dummy_res->resource)) { > >> + kfree(dummy_res); > >> + goto no_mmap; > >> + } > >> + dummy_res->index = bar; > >> + list_add(&dummy_res->res_next, > >> + &vdev->dummy_resources_list); > >> + vdev->bar_mmap_supported[bar] = true; > >> + continue; > >> + } > >> + /* > >> + * Here we don't handle the case when the BAR is not page > >> + * aligned because we can't expect the BAR will be > >> + * assigned into the same location in a page in guest > >> + * when we passthrough the BAR. And it's hard to access > >> + * this BAR in userspace because we have no way to get > >> + * the BAR's location in a page. > >> + */ > >> +no_mmap: > >> + vdev->bar_mmap_supported[bar] = false; > >> + } > >> +} > >> + > >> static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); > >> static void vfio_pci_disable(struct vfio_pci_device *vdev); > >> > >> @@ -218,12 +285,15 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) > >> } > >> } > >> > >> + vfio_pci_probe_mmaps(vdev); > >> + > >> return 0; > >> } > >> > >> static void vfio_pci_disable(struct vfio_pci_device *vdev) > >> { > >> struct pci_dev *pdev = vdev->pdev; > >> + struct vfio_pci_dummy_resource *dummy_res, *tmp; > >> int i, bar; > >> > >> /* Stop the device from further DMA */ > >> @@ -252,6 +322,13 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) > >> vdev->barmap[bar] = NULL; > >> } > >> > >> + list_for_each_entry_safe(dummy_res, tmp, > >> + &vdev->dummy_resources_list, res_next) { > >> + list_del(&dummy_res->res_next); > >> + release_resource(&dummy_res->resource); > >> + kfree(dummy_res); > >> + } > >> + > >> vdev->needs_reset = true; > >> > >> /* > >> @@ -623,9 +700,7 @@ static long vfio_pci_ioctl(void *device_data, > >> > >> info.flags = VFIO_REGION_INFO_FLAG_READ | > >> VFIO_REGION_INFO_FLAG_WRITE; > >> - if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) && > >> - pci_resource_flags(pdev, info.index) & > >> - IORESOURCE_MEM && info.size >= PAGE_SIZE) { > >> + if (vdev->bar_mmap_supported[info.index]) { > >> info.flags |= VFIO_REGION_INFO_FLAG_MMAP; > >> if (info.index == vdev->msix_bar) { > >> ret = msix_sparse_mmap_cap(vdev, &caps); > >> @@ -1049,16 +1124,16 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) > >> return -EINVAL; > >> if (index >= VFIO_PCI_ROM_REGION_INDEX) > >> return -EINVAL; > >> - if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM)) > >> + if (!vdev->bar_mmap_supported[index]) > >> return -EINVAL; > >> > >> - phys_len = pci_resource_len(pdev, index); > >> + phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); > >> req_len = vma->vm_end - vma->vm_start; > >> pgoff = vma->vm_pgoff & > >> ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); > >> req_start = pgoff << PAGE_SHIFT; > >> > >> - if (phys_len < PAGE_SIZE || req_start + req_len > phys_len) > >> + if (req_start + req_len > phys_len) > >> return -EINVAL; > >> > >> if (index == vdev->msix_bar) { > >> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h > >> index 016c14a..2128de8 100644 > >> --- a/drivers/vfio/pci/vfio_pci_private.h > >> +++ b/drivers/vfio/pci/vfio_pci_private.h > >> @@ -57,9 +57,16 @@ struct vfio_pci_region { > >> u32 flags; > >> }; > >> > >> +struct vfio_pci_dummy_resource { > >> + struct resource resource; > >> + int index; > >> + struct list_head res_next; > >> +}; > >> + > >> struct vfio_pci_device { > >> struct pci_dev *pdev; > >> void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; > >> + bool bar_mmap_supported[PCI_STD_RESOURCE_END + 1]; > >> u8 *pci_config_map; > >> u8 *vconfig; > >> struct perm_bits *msi_perm; > >> @@ -88,6 +95,7 @@ struct vfio_pci_device { > >> int refcnt; > >> struct eventfd_ctx *err_trigger; > >> struct eventfd_ctx *req_trigger; > >> + struct list_head dummy_resources_list; > >> }; > >> > >> #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) >