All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
       [not found] <201605301311.u4UD99he028186@mx0a-001b2d01.pphosted.com>
@ 2016-06-22 22:04 ` Alex Williamson
  2016-06-23  2:39   ` Yongji Xie
  2016-06-23 16:12 ` Alex Williamson
  1 sibling, 1 reply; 17+ messages in thread
From: Alex Williamson @ 2016-06-22 22:04 UTC (permalink / raw)
  To: Yongji Xie
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

On Mon, 30 May 2016 21:06:37 +0800
Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:

> Current vfio-pci implementation disallows to mmap
> sub-page(size < PAGE_SIZE) MMIO BARs because these BARs' mmio
> page may be shared with other BARs. This will cause some
> performance issues when we passthrough a PCI device with
> this kind of BARs. Guest will be not able to handle the mmio
> accesses to the BARs which leads to mmio emulations in host.
> 
> However, not all sub-page BARs will share page with other BARs.
> We should allow to mmap the sub-page MMIO BARs which we can
> make sure will not share page with other BARs.
> 
> This patch adds support for this case. And we try to add a
> dummy resource to reserve the remainder of the page which
> hot-add device's BAR might be assigned into. But it's not
> necessary to handle the case when the BAR is not page aligned.
> Because we can't expect the BAR will be assigned into the same
> location in a page in guest when we passthrough the BAR. And
> it's hard to access this BAR in userspace because we have
> no way to get the BAR's location in a page.
> 
> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
> ---

Hi Yongji,

On 5/22, message-id
<201605230345.u4N3dJip043323@mx0a-001b2d01.pphosted.com> you indicated
you'd post the QEMU code which is enabled by this patch "soon".  Have I
missed that?  I'm still waiting to see it.  Thanks,

Alex

>  drivers/vfio/pci/vfio_pci.c         |   87 ++++++++++++++++++++++++++++++++---
>  drivers/vfio/pci/vfio_pci_private.h |    8 ++++
>  2 files changed, 89 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 188b1ff..3cca2a7 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -110,6 +110,73 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
>  	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
>  }
>  
> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> +{
> +	struct resource *res;
> +	int bar;
> +	struct vfio_pci_dummy_resource *dummy_res;
> +
> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
> +
> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
> +		res = vdev->pdev->resource + bar;
> +
> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
> +			goto no_mmap;
> +
> +		if (!(res->flags & IORESOURCE_MEM))
> +			goto no_mmap;
> +
> +		/*
> +		 * The PCI core shouldn't set up a resource with a
> +		 * type but zero size. But there may be bugs that
> +		 * cause us to do that.
> +		 */
> +		if (!resource_size(res))
> +			goto no_mmap;
> +
> +		if (resource_size(res) >= PAGE_SIZE) {
> +			vdev->bar_mmap_supported[bar] = true;
> +			continue;
> +		}
> +
> +		if (!(res->start & ~PAGE_MASK)) {
> +			/*
> +			 * Add a dummy resource to reserve the remainder
> +			 * of the exclusive page in case that hot-add
> +			 * device's bar is assigned into it.
> +			 */
> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
> +			if (dummy_res == NULL)
> +				goto no_mmap;
> +
> +			dummy_res->resource.start = res->end + 1;
> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> +			dummy_res->resource.flags = res->flags;
> +			if (request_resource(res->parent,
> +						&dummy_res->resource)) {
> +				kfree(dummy_res);
> +				goto no_mmap;
> +			}
> +			dummy_res->index = bar;
> +			list_add(&dummy_res->res_next,
> +					&vdev->dummy_resources_list);
> +			vdev->bar_mmap_supported[bar] = true;
> +			continue;
> +		}
> +		/*
> +		 * Here we don't handle the case when the BAR is not page
> +		 * aligned because we can't expect the BAR will be
> +		 * assigned into the same location in a page in guest
> +		 * when we passthrough the BAR. And it's hard to access
> +		 * this BAR in userspace because we have no way to get
> +		 * the BAR's location in a page.
> +		 */
> +no_mmap:
> +		vdev->bar_mmap_supported[bar] = false;
> +	}
> +}
> +
>  static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
>  static void vfio_pci_disable(struct vfio_pci_device *vdev);
>  
> @@ -218,12 +285,15 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
>  		}
>  	}
>  
> +	vfio_pci_probe_mmaps(vdev);
> +
>  	return 0;
>  }
>  
>  static void vfio_pci_disable(struct vfio_pci_device *vdev)
>  {
>  	struct pci_dev *pdev = vdev->pdev;
> +	struct vfio_pci_dummy_resource *dummy_res, *tmp;
>  	int i, bar;
>  
>  	/* Stop the device from further DMA */
> @@ -252,6 +322,13 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>  		vdev->barmap[bar] = NULL;
>  	}
>  
> +	list_for_each_entry_safe(dummy_res, tmp,
> +				 &vdev->dummy_resources_list, res_next) {
> +		list_del(&dummy_res->res_next);
> +		release_resource(&dummy_res->resource);
> +		kfree(dummy_res);
> +	}
> +
>  	vdev->needs_reset = true;
>  
>  	/*
> @@ -623,9 +700,7 @@ static long vfio_pci_ioctl(void *device_data,
>  
>  			info.flags = VFIO_REGION_INFO_FLAG_READ |
>  				     VFIO_REGION_INFO_FLAG_WRITE;
> -			if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) &&
> -			    pci_resource_flags(pdev, info.index) &
> -			    IORESOURCE_MEM && info.size >= PAGE_SIZE) {
> +			if (vdev->bar_mmap_supported[info.index]) {
>  				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
>  				if (info.index == vdev->msix_bar) {
>  					ret = msix_sparse_mmap_cap(vdev, &caps);
> @@ -1049,16 +1124,16 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
>  		return -EINVAL;
>  	if (index >= VFIO_PCI_ROM_REGION_INDEX)
>  		return -EINVAL;
> -	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
> +	if (!vdev->bar_mmap_supported[index])
>  		return -EINVAL;
>  
> -	phys_len = pci_resource_len(pdev, index);
> +	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
>  	req_len = vma->vm_end - vma->vm_start;
>  	pgoff = vma->vm_pgoff &
>  		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
>  	req_start = pgoff << PAGE_SHIFT;
>  
> -	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
> +	if (req_start + req_len > phys_len)
>  		return -EINVAL;
>  
>  	if (index == vdev->msix_bar) {
> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> index 016c14a..2128de8 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -57,9 +57,16 @@ struct vfio_pci_region {
>  	u32				flags;
>  };
>  
> +struct vfio_pci_dummy_resource {
> +	struct resource		resource;
> +	int			index;
> +	struct list_head	res_next;
> +};
> +
>  struct vfio_pci_device {
>  	struct pci_dev		*pdev;
>  	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
> +	bool			bar_mmap_supported[PCI_STD_RESOURCE_END + 1];
>  	u8			*pci_config_map;
>  	u8			*vconfig;
>  	struct perm_bits	*msi_perm;
> @@ -88,6 +95,7 @@ struct vfio_pci_device {
>  	int			refcnt;
>  	struct eventfd_ctx	*err_trigger;
>  	struct eventfd_ctx	*req_trigger;
> +	struct list_head	dummy_resources_list;
>  };
>  
>  #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-22 22:04 ` [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive Alex Williamson
@ 2016-06-23  2:39   ` Yongji Xie
  2016-06-23  2:54     ` Alex Williamson
  0 siblings, 1 reply; 17+ messages in thread
From: Yongji Xie @ 2016-06-23  2:39 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

Hi, Alex

On 2016/6/23 6:04, Alex Williamson wrote:

> On Mon, 30 May 2016 21:06:37 +0800
> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>
>> Current vfio-pci implementation disallows to mmap
>> sub-page(size < PAGE_SIZE) MMIO BARs because these BARs' mmio
>> page may be shared with other BARs. This will cause some
>> performance issues when we passthrough a PCI device with
>> this kind of BARs. Guest will be not able to handle the mmio
>> accesses to the BARs which leads to mmio emulations in host.
>>
>> However, not all sub-page BARs will share page with other BARs.
>> We should allow to mmap the sub-page MMIO BARs which we can
>> make sure will not share page with other BARs.
>>
>> This patch adds support for this case. And we try to add a
>> dummy resource to reserve the remainder of the page which
>> hot-add device's BAR might be assigned into. But it's not
>> necessary to handle the case when the BAR is not page aligned.
>> Because we can't expect the BAR will be assigned into the same
>> location in a page in guest when we passthrough the BAR. And
>> it's hard to access this BAR in userspace because we have
>> no way to get the BAR's location in a page.
>>
>> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
>> ---
> Hi Yongji,
>
> On 5/22, message-id
> <201605230345.u4N3dJip043323@mx0a-001b2d01.pphosted.com> you indicated
> you'd post the QEMU code which is enabled by this patch "soon".  Have I
> missed that?  I'm still waiting to see it.  Thanks,
>
> Alex

I posted it on May 24th [1]. Do I need to resend it?

[1] http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg04125.html

Thanks,
Yongji

>>   drivers/vfio/pci/vfio_pci.c         |   87 ++++++++++++++++++++++++++++++++---
>>   drivers/vfio/pci/vfio_pci_private.h |    8 ++++
>>   2 files changed, 89 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>> index 188b1ff..3cca2a7 100644
>> --- a/drivers/vfio/pci/vfio_pci.c
>> +++ b/drivers/vfio/pci/vfio_pci.c
>> @@ -110,6 +110,73 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
>>   	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
>>   }
>>   
>> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>> +{
>> +	struct resource *res;
>> +	int bar;
>> +	struct vfio_pci_dummy_resource *dummy_res;
>> +
>> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
>> +
>> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
>> +		res = vdev->pdev->resource + bar;
>> +
>> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
>> +			goto no_mmap;
>> +
>> +		if (!(res->flags & IORESOURCE_MEM))
>> +			goto no_mmap;
>> +
>> +		/*
>> +		 * The PCI core shouldn't set up a resource with a
>> +		 * type but zero size. But there may be bugs that
>> +		 * cause us to do that.
>> +		 */
>> +		if (!resource_size(res))
>> +			goto no_mmap;
>> +
>> +		if (resource_size(res) >= PAGE_SIZE) {
>> +			vdev->bar_mmap_supported[bar] = true;
>> +			continue;
>> +		}
>> +
>> +		if (!(res->start & ~PAGE_MASK)) {
>> +			/*
>> +			 * Add a dummy resource to reserve the remainder
>> +			 * of the exclusive page in case that hot-add
>> +			 * device's bar is assigned into it.
>> +			 */
>> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
>> +			if (dummy_res == NULL)
>> +				goto no_mmap;
>> +
>> +			dummy_res->resource.start = res->end + 1;
>> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
>> +			dummy_res->resource.flags = res->flags;
>> +			if (request_resource(res->parent,
>> +						&dummy_res->resource)) {
>> +				kfree(dummy_res);
>> +				goto no_mmap;
>> +			}
>> +			dummy_res->index = bar;
>> +			list_add(&dummy_res->res_next,
>> +					&vdev->dummy_resources_list);
>> +			vdev->bar_mmap_supported[bar] = true;
>> +			continue;
>> +		}
>> +		/*
>> +		 * Here we don't handle the case when the BAR is not page
>> +		 * aligned because we can't expect the BAR will be
>> +		 * assigned into the same location in a page in guest
>> +		 * when we passthrough the BAR. And it's hard to access
>> +		 * this BAR in userspace because we have no way to get
>> +		 * the BAR's location in a page.
>> +		 */
>> +no_mmap:
>> +		vdev->bar_mmap_supported[bar] = false;
>> +	}
>> +}
>> +
>>   static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
>>   static void vfio_pci_disable(struct vfio_pci_device *vdev);
>>   
>> @@ -218,12 +285,15 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
>>   		}
>>   	}
>>   
>> +	vfio_pci_probe_mmaps(vdev);
>> +
>>   	return 0;
>>   }
>>   
>>   static void vfio_pci_disable(struct vfio_pci_device *vdev)
>>   {
>>   	struct pci_dev *pdev = vdev->pdev;
>> +	struct vfio_pci_dummy_resource *dummy_res, *tmp;
>>   	int i, bar;
>>   
>>   	/* Stop the device from further DMA */
>> @@ -252,6 +322,13 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>>   		vdev->barmap[bar] = NULL;
>>   	}
>>   
>> +	list_for_each_entry_safe(dummy_res, tmp,
>> +				 &vdev->dummy_resources_list, res_next) {
>> +		list_del(&dummy_res->res_next);
>> +		release_resource(&dummy_res->resource);
>> +		kfree(dummy_res);
>> +	}
>> +
>>   	vdev->needs_reset = true;
>>   
>>   	/*
>> @@ -623,9 +700,7 @@ static long vfio_pci_ioctl(void *device_data,
>>   
>>   			info.flags = VFIO_REGION_INFO_FLAG_READ |
>>   				     VFIO_REGION_INFO_FLAG_WRITE;
>> -			if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) &&
>> -			    pci_resource_flags(pdev, info.index) &
>> -			    IORESOURCE_MEM && info.size >= PAGE_SIZE) {
>> +			if (vdev->bar_mmap_supported[info.index]) {
>>   				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
>>   				if (info.index == vdev->msix_bar) {
>>   					ret = msix_sparse_mmap_cap(vdev, &caps);
>> @@ -1049,16 +1124,16 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
>>   		return -EINVAL;
>>   	if (index >= VFIO_PCI_ROM_REGION_INDEX)
>>   		return -EINVAL;
>> -	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
>> +	if (!vdev->bar_mmap_supported[index])
>>   		return -EINVAL;
>>   
>> -	phys_len = pci_resource_len(pdev, index);
>> +	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
>>   	req_len = vma->vm_end - vma->vm_start;
>>   	pgoff = vma->vm_pgoff &
>>   		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
>>   	req_start = pgoff << PAGE_SHIFT;
>>   
>> -	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
>> +	if (req_start + req_len > phys_len)
>>   		return -EINVAL;
>>   
>>   	if (index == vdev->msix_bar) {
>> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
>> index 016c14a..2128de8 100644
>> --- a/drivers/vfio/pci/vfio_pci_private.h
>> +++ b/drivers/vfio/pci/vfio_pci_private.h
>> @@ -57,9 +57,16 @@ struct vfio_pci_region {
>>   	u32				flags;
>>   };
>>   
>> +struct vfio_pci_dummy_resource {
>> +	struct resource		resource;
>> +	int			index;
>> +	struct list_head	res_next;
>> +};
>> +
>>   struct vfio_pci_device {
>>   	struct pci_dev		*pdev;
>>   	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
>> +	bool			bar_mmap_supported[PCI_STD_RESOURCE_END + 1];
>>   	u8			*pci_config_map;
>>   	u8			*vconfig;
>>   	struct perm_bits	*msi_perm;
>> @@ -88,6 +95,7 @@ struct vfio_pci_device {
>>   	int			refcnt;
>>   	struct eventfd_ctx	*err_trigger;
>>   	struct eventfd_ctx	*req_trigger;
>> +	struct list_head	dummy_resources_list;
>>   };
>>   
>>   #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-23  2:39   ` Yongji Xie
@ 2016-06-23  2:54     ` Alex Williamson
  0 siblings, 0 replies; 17+ messages in thread
From: Alex Williamson @ 2016-06-23  2:54 UTC (permalink / raw)
  To: Yongji Xie
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

On Thu, 23 Jun 2016 10:39:30 +0800
Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:

> Hi, Alex
> 
> On 2016/6/23 6:04, Alex Williamson wrote:
> 
> > On Mon, 30 May 2016 21:06:37 +0800
> > Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
> >  
> >> Current vfio-pci implementation disallows to mmap
> >> sub-page(size < PAGE_SIZE) MMIO BARs because these BARs' mmio
> >> page may be shared with other BARs. This will cause some
> >> performance issues when we passthrough a PCI device with
> >> this kind of BARs. Guest will be not able to handle the mmio
> >> accesses to the BARs which leads to mmio emulations in host.
> >>
> >> However, not all sub-page BARs will share page with other BARs.
> >> We should allow to mmap the sub-page MMIO BARs which we can
> >> make sure will not share page with other BARs.
> >>
> >> This patch adds support for this case. And we try to add a
> >> dummy resource to reserve the remainder of the page which
> >> hot-add device's BAR might be assigned into. But it's not
> >> necessary to handle the case when the BAR is not page aligned.
> >> Because we can't expect the BAR will be assigned into the same
> >> location in a page in guest when we passthrough the BAR. And
> >> it's hard to access this BAR in userspace because we have
> >> no way to get the BAR's location in a page.
> >>
> >> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
> >> ---  
> > Hi Yongji,
> >
> > On 5/22, message-id
> > <201605230345.u4N3dJip043323@mx0a-001b2d01.pphosted.com> you indicated
> > you'd post the QEMU code which is enabled by this patch "soon".  Have I
> > missed that?  I'm still waiting to see it.  Thanks,
> >
> > Alex  
> 
> I posted it on May 24th [1]. Do I need to resend it?
> 
> [1] http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg04125.html

I found it.  Thanks,

Alex

> >>   drivers/vfio/pci/vfio_pci.c         |   87 ++++++++++++++++++++++++++++++++---
> >>   drivers/vfio/pci/vfio_pci_private.h |    8 ++++
> >>   2 files changed, 89 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> >> index 188b1ff..3cca2a7 100644
> >> --- a/drivers/vfio/pci/vfio_pci.c
> >> +++ b/drivers/vfio/pci/vfio_pci.c
> >> @@ -110,6 +110,73 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
> >>   	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
> >>   }
> >>   
> >> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> >> +{
> >> +	struct resource *res;
> >> +	int bar;
> >> +	struct vfio_pci_dummy_resource *dummy_res;
> >> +
> >> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
> >> +
> >> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
> >> +		res = vdev->pdev->resource + bar;
> >> +
> >> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
> >> +			goto no_mmap;
> >> +
> >> +		if (!(res->flags & IORESOURCE_MEM))
> >> +			goto no_mmap;
> >> +
> >> +		/*
> >> +		 * The PCI core shouldn't set up a resource with a
> >> +		 * type but zero size. But there may be bugs that
> >> +		 * cause us to do that.
> >> +		 */
> >> +		if (!resource_size(res))
> >> +			goto no_mmap;
> >> +
> >> +		if (resource_size(res) >= PAGE_SIZE) {
> >> +			vdev->bar_mmap_supported[bar] = true;
> >> +			continue;
> >> +		}
> >> +
> >> +		if (!(res->start & ~PAGE_MASK)) {
> >> +			/*
> >> +			 * Add a dummy resource to reserve the remainder
> >> +			 * of the exclusive page in case that hot-add
> >> +			 * device's bar is assigned into it.
> >> +			 */
> >> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
> >> +			if (dummy_res == NULL)
> >> +				goto no_mmap;
> >> +
> >> +			dummy_res->resource.start = res->end + 1;
> >> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> >> +			dummy_res->resource.flags = res->flags;
> >> +			if (request_resource(res->parent,
> >> +						&dummy_res->resource)) {
> >> +				kfree(dummy_res);
> >> +				goto no_mmap;
> >> +			}
> >> +			dummy_res->index = bar;
> >> +			list_add(&dummy_res->res_next,
> >> +					&vdev->dummy_resources_list);
> >> +			vdev->bar_mmap_supported[bar] = true;
> >> +			continue;
> >> +		}
> >> +		/*
> >> +		 * Here we don't handle the case when the BAR is not page
> >> +		 * aligned because we can't expect the BAR will be
> >> +		 * assigned into the same location in a page in guest
> >> +		 * when we passthrough the BAR. And it's hard to access
> >> +		 * this BAR in userspace because we have no way to get
> >> +		 * the BAR's location in a page.
> >> +		 */
> >> +no_mmap:
> >> +		vdev->bar_mmap_supported[bar] = false;
> >> +	}
> >> +}
> >> +
> >>   static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
> >>   static void vfio_pci_disable(struct vfio_pci_device *vdev);
> >>   
> >> @@ -218,12 +285,15 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
> >>   		}
> >>   	}
> >>   
> >> +	vfio_pci_probe_mmaps(vdev);
> >> +
> >>   	return 0;
> >>   }
> >>   
> >>   static void vfio_pci_disable(struct vfio_pci_device *vdev)
> >>   {
> >>   	struct pci_dev *pdev = vdev->pdev;
> >> +	struct vfio_pci_dummy_resource *dummy_res, *tmp;
> >>   	int i, bar;
> >>   
> >>   	/* Stop the device from further DMA */
> >> @@ -252,6 +322,13 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
> >>   		vdev->barmap[bar] = NULL;
> >>   	}
> >>   
> >> +	list_for_each_entry_safe(dummy_res, tmp,
> >> +				 &vdev->dummy_resources_list, res_next) {
> >> +		list_del(&dummy_res->res_next);
> >> +		release_resource(&dummy_res->resource);
> >> +		kfree(dummy_res);
> >> +	}
> >> +
> >>   	vdev->needs_reset = true;
> >>   
> >>   	/*
> >> @@ -623,9 +700,7 @@ static long vfio_pci_ioctl(void *device_data,
> >>   
> >>   			info.flags = VFIO_REGION_INFO_FLAG_READ |
> >>   				     VFIO_REGION_INFO_FLAG_WRITE;
> >> -			if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) &&
> >> -			    pci_resource_flags(pdev, info.index) &
> >> -			    IORESOURCE_MEM && info.size >= PAGE_SIZE) {
> >> +			if (vdev->bar_mmap_supported[info.index]) {
> >>   				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
> >>   				if (info.index == vdev->msix_bar) {
> >>   					ret = msix_sparse_mmap_cap(vdev, &caps);
> >> @@ -1049,16 +1124,16 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
> >>   		return -EINVAL;
> >>   	if (index >= VFIO_PCI_ROM_REGION_INDEX)
> >>   		return -EINVAL;
> >> -	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
> >> +	if (!vdev->bar_mmap_supported[index])
> >>   		return -EINVAL;
> >>   
> >> -	phys_len = pci_resource_len(pdev, index);
> >> +	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
> >>   	req_len = vma->vm_end - vma->vm_start;
> >>   	pgoff = vma->vm_pgoff &
> >>   		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> >>   	req_start = pgoff << PAGE_SHIFT;
> >>   
> >> -	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
> >> +	if (req_start + req_len > phys_len)
> >>   		return -EINVAL;
> >>   
> >>   	if (index == vdev->msix_bar) {
> >> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> >> index 016c14a..2128de8 100644
> >> --- a/drivers/vfio/pci/vfio_pci_private.h
> >> +++ b/drivers/vfio/pci/vfio_pci_private.h
> >> @@ -57,9 +57,16 @@ struct vfio_pci_region {
> >>   	u32				flags;
> >>   };
> >>   
> >> +struct vfio_pci_dummy_resource {
> >> +	struct resource		resource;
> >> +	int			index;
> >> +	struct list_head	res_next;
> >> +};
> >> +
> >>   struct vfio_pci_device {
> >>   	struct pci_dev		*pdev;
> >>   	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
> >> +	bool			bar_mmap_supported[PCI_STD_RESOURCE_END + 1];
> >>   	u8			*pci_config_map;
> >>   	u8			*vconfig;
> >>   	struct perm_bits	*msi_perm;
> >> @@ -88,6 +95,7 @@ struct vfio_pci_device {
> >>   	int			refcnt;
> >>   	struct eventfd_ctx	*err_trigger;
> >>   	struct eventfd_ctx	*req_trigger;
> >> +	struct list_head	dummy_resources_list;
> >>   };
> >>   
> >>   #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)  
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
       [not found] <201605301311.u4UD99he028186@mx0a-001b2d01.pphosted.com>
  2016-06-22 22:04 ` [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive Alex Williamson
@ 2016-06-23 16:12 ` Alex Williamson
  2016-06-24  2:52   ` Yongji Xie
  1 sibling, 1 reply; 17+ messages in thread
From: Alex Williamson @ 2016-06-23 16:12 UTC (permalink / raw)
  To: Yongji Xie
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

On Mon, 30 May 2016 21:06:37 +0800
Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:

> Current vfio-pci implementation disallows to mmap
> sub-page(size < PAGE_SIZE) MMIO BARs because these BARs' mmio
> page may be shared with other BARs. This will cause some
> performance issues when we passthrough a PCI device with
> this kind of BARs. Guest will be not able to handle the mmio
> accesses to the BARs which leads to mmio emulations in host.
> 
> However, not all sub-page BARs will share page with other BARs.
> We should allow to mmap the sub-page MMIO BARs which we can
> make sure will not share page with other BARs.
> 
> This patch adds support for this case. And we try to add a
> dummy resource to reserve the remainder of the page which
> hot-add device's BAR might be assigned into. But it's not
> necessary to handle the case when the BAR is not page aligned.
> Because we can't expect the BAR will be assigned into the same
> location in a page in guest when we passthrough the BAR. And
> it's hard to access this BAR in userspace because we have
> no way to get the BAR's location in a page.
> 
> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
> ---
>  drivers/vfio/pci/vfio_pci.c         |   87 ++++++++++++++++++++++++++++++++---
>  drivers/vfio/pci/vfio_pci_private.h |    8 ++++
>  2 files changed, 89 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 188b1ff..3cca2a7 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -110,6 +110,73 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
>  	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
>  }
>  
> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> +{
> +	struct resource *res;
> +	int bar;
> +	struct vfio_pci_dummy_resource *dummy_res;
> +
> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
> +
> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
> +		res = vdev->pdev->resource + bar;
> +
> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
> +			goto no_mmap;
> +
> +		if (!(res->flags & IORESOURCE_MEM))
> +			goto no_mmap;
> +
> +		/*
> +		 * The PCI core shouldn't set up a resource with a
> +		 * type but zero size. But there may be bugs that
> +		 * cause us to do that.
> +		 */
> +		if (!resource_size(res))
> +			goto no_mmap;
> +
> +		if (resource_size(res) >= PAGE_SIZE) {
> +			vdev->bar_mmap_supported[bar] = true;
> +			continue;
> +		}
> +
> +		if (!(res->start & ~PAGE_MASK)) {
> +			/*
> +			 * Add a dummy resource to reserve the remainder
> +			 * of the exclusive page in case that hot-add
> +			 * device's bar is assigned into it.
> +			 */
> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
> +			if (dummy_res == NULL)
> +				goto no_mmap;
> +
> +			dummy_res->resource.start = res->end + 1;
> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> +			dummy_res->resource.flags = res->flags;
> +			if (request_resource(res->parent,
> +						&dummy_res->resource)) {
> +				kfree(dummy_res);
> +				goto no_mmap;
> +			}

Isn't it true that request_resource() only tells us that at a given
point in time, no other drivers have reserved that resource?  It seems
like it does not guarantee that the resource isn't routed to another
device or that another driver won't at some point attempt to request
that same resource.  So for example if a user constructs their initrd
to bind vfio-pci to devices before other modules load, this
request_resource() may succeed, at the expense of drivers loaded later
now failing.  The behavior will depend on driver load order and we're
not actually insuring that the overflow resource is unused, just that
we got it first.  Can we do better?  Am I missing something that
prevents this?  Thanks,

Alex

> +			dummy_res->index = bar;
> +			list_add(&dummy_res->res_next,
> +					&vdev->dummy_resources_list);
> +			vdev->bar_mmap_supported[bar] = true;
> +			continue;
> +		}
> +		/*
> +		 * Here we don't handle the case when the BAR is not page
> +		 * aligned because we can't expect the BAR will be
> +		 * assigned into the same location in a page in guest
> +		 * when we passthrough the BAR. And it's hard to access
> +		 * this BAR in userspace because we have no way to get
> +		 * the BAR's location in a page.
> +		 */
> +no_mmap:
> +		vdev->bar_mmap_supported[bar] = false;
> +	}
> +}
> +
>  static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
>  static void vfio_pci_disable(struct vfio_pci_device *vdev);
>  
> @@ -218,12 +285,15 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
>  		}
>  	}
>  
> +	vfio_pci_probe_mmaps(vdev);
> +
>  	return 0;
>  }
>  
>  static void vfio_pci_disable(struct vfio_pci_device *vdev)
>  {
>  	struct pci_dev *pdev = vdev->pdev;
> +	struct vfio_pci_dummy_resource *dummy_res, *tmp;
>  	int i, bar;
>  
>  	/* Stop the device from further DMA */
> @@ -252,6 +322,13 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>  		vdev->barmap[bar] = NULL;
>  	}
>  
> +	list_for_each_entry_safe(dummy_res, tmp,
> +				 &vdev->dummy_resources_list, res_next) {
> +		list_del(&dummy_res->res_next);
> +		release_resource(&dummy_res->resource);
> +		kfree(dummy_res);
> +	}
> +
>  	vdev->needs_reset = true;
>  
>  	/*
> @@ -623,9 +700,7 @@ static long vfio_pci_ioctl(void *device_data,
>  
>  			info.flags = VFIO_REGION_INFO_FLAG_READ |
>  				     VFIO_REGION_INFO_FLAG_WRITE;
> -			if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) &&
> -			    pci_resource_flags(pdev, info.index) &
> -			    IORESOURCE_MEM && info.size >= PAGE_SIZE) {
> +			if (vdev->bar_mmap_supported[info.index]) {
>  				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
>  				if (info.index == vdev->msix_bar) {
>  					ret = msix_sparse_mmap_cap(vdev, &caps);
> @@ -1049,16 +1124,16 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
>  		return -EINVAL;
>  	if (index >= VFIO_PCI_ROM_REGION_INDEX)
>  		return -EINVAL;
> -	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
> +	if (!vdev->bar_mmap_supported[index])
>  		return -EINVAL;
>  
> -	phys_len = pci_resource_len(pdev, index);
> +	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
>  	req_len = vma->vm_end - vma->vm_start;
>  	pgoff = vma->vm_pgoff &
>  		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
>  	req_start = pgoff << PAGE_SHIFT;
>  
> -	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
> +	if (req_start + req_len > phys_len)
>  		return -EINVAL;
>  
>  	if (index == vdev->msix_bar) {
> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> index 016c14a..2128de8 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -57,9 +57,16 @@ struct vfio_pci_region {
>  	u32				flags;
>  };
>  
> +struct vfio_pci_dummy_resource {
> +	struct resource		resource;
> +	int			index;
> +	struct list_head	res_next;
> +};
> +
>  struct vfio_pci_device {
>  	struct pci_dev		*pdev;
>  	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
> +	bool			bar_mmap_supported[PCI_STD_RESOURCE_END + 1];
>  	u8			*pci_config_map;
>  	u8			*vconfig;
>  	struct perm_bits	*msi_perm;
> @@ -88,6 +95,7 @@ struct vfio_pci_device {
>  	int			refcnt;
>  	struct eventfd_ctx	*err_trigger;
>  	struct eventfd_ctx	*req_trigger;
> +	struct list_head	dummy_resources_list;
>  };
>  
>  #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-23 16:12 ` Alex Williamson
@ 2016-06-24  2:52   ` Yongji Xie
  2016-06-24  3:37     ` Alex Williamson
  0 siblings, 1 reply; 17+ messages in thread
From: Yongji Xie @ 2016-06-24  2:52 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

Hi, Alex

On 2016/6/24 0:12, Alex Williamson wrote:

> On Mon, 30 May 2016 21:06:37 +0800
> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>
>> Current vfio-pci implementation disallows to mmap
>> sub-page(size < PAGE_SIZE) MMIO BARs because these BARs' mmio
>> page may be shared with other BARs. This will cause some
>> performance issues when we passthrough a PCI device with
>> this kind of BARs. Guest will be not able to handle the mmio
>> accesses to the BARs which leads to mmio emulations in host.
>>
>> However, not all sub-page BARs will share page with other BARs.
>> We should allow to mmap the sub-page MMIO BARs which we can
>> make sure will not share page with other BARs.
>>
>> This patch adds support for this case. And we try to add a
>> dummy resource to reserve the remainder of the page which
>> hot-add device's BAR might be assigned into. But it's not
>> necessary to handle the case when the BAR is not page aligned.
>> Because we can't expect the BAR will be assigned into the same
>> location in a page in guest when we passthrough the BAR. And
>> it's hard to access this BAR in userspace because we have
>> no way to get the BAR's location in a page.
>>
>> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
>> ---
>>   drivers/vfio/pci/vfio_pci.c         |   87 ++++++++++++++++++++++++++++++++---
>>   drivers/vfio/pci/vfio_pci_private.h |    8 ++++
>>   2 files changed, 89 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>> index 188b1ff..3cca2a7 100644
>> --- a/drivers/vfio/pci/vfio_pci.c
>> +++ b/drivers/vfio/pci/vfio_pci.c
>> @@ -110,6 +110,73 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
>>   	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
>>   }
>>   
>> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>> +{
>> +	struct resource *res;
>> +	int bar;
>> +	struct vfio_pci_dummy_resource *dummy_res;
>> +
>> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
>> +
>> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
>> +		res = vdev->pdev->resource + bar;
>> +
>> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
>> +			goto no_mmap;
>> +
>> +		if (!(res->flags & IORESOURCE_MEM))
>> +			goto no_mmap;
>> +
>> +		/*
>> +		 * The PCI core shouldn't set up a resource with a
>> +		 * type but zero size. But there may be bugs that
>> +		 * cause us to do that.
>> +		 */
>> +		if (!resource_size(res))
>> +			goto no_mmap;
>> +
>> +		if (resource_size(res) >= PAGE_SIZE) {
>> +			vdev->bar_mmap_supported[bar] = true;
>> +			continue;
>> +		}
>> +
>> +		if (!(res->start & ~PAGE_MASK)) {
>> +			/*
>> +			 * Add a dummy resource to reserve the remainder
>> +			 * of the exclusive page in case that hot-add
>> +			 * device's bar is assigned into it.
>> +			 */
>> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
>> +			if (dummy_res == NULL)
>> +				goto no_mmap;
>> +
>> +			dummy_res->resource.start = res->end + 1;
>> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
>> +			dummy_res->resource.flags = res->flags;
>> +			if (request_resource(res->parent,
>> +						&dummy_res->resource)) {
>> +				kfree(dummy_res);
>> +				goto no_mmap;
>> +			}
> Isn't it true that request_resource() only tells us that at a given
> point in time, no other drivers have reserved that resource?  It seems
> like it does not guarantee that the resource isn't routed to another
> device or that another driver won't at some point attempt to request
> that same resource.  So for example if a user constructs their initrd
> to bind vfio-pci to devices before other modules load, this
> request_resource() may succeed, at the expense of drivers loaded later
> now failing.  The behavior will depend on driver load order and we're
> not actually insuring that the overflow resource is unused, just that
> we got it first.  Can we do better?  Am I missing something that
> prevents this?  Thanks,
>
> Alex

Couldn't PCI resources allocator prevent this, which will find a
empty slot in the resource tree firstly, then try to request that
resource in allocate_resource() when a PCI device is probed.
And I'd like to know why a PCI device driver would attempt to
call request_resource()? Should this be done in PCI enumeration?

Thanks,
Yongji

>> +			dummy_res->index = bar;
>> +			list_add(&dummy_res->res_next,
>> +					&vdev->dummy_resources_list);
>> +			vdev->bar_mmap_supported[bar] = true;
>> +			continue;
>> +		}
>> +		/*
>> +		 * Here we don't handle the case when the BAR is not page
>> +		 * aligned because we can't expect the BAR will be
>> +		 * assigned into the same location in a page in guest
>> +		 * when we passthrough the BAR. And it's hard to access
>> +		 * this BAR in userspace because we have no way to get
>> +		 * the BAR's location in a page.
>> +		 */
>> +no_mmap:
>> +		vdev->bar_mmap_supported[bar] = false;
>> +	}
>> +}
>> +
>>   static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
>>   static void vfio_pci_disable(struct vfio_pci_device *vdev);
>>   
>> @@ -218,12 +285,15 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
>>   		}
>>   	}
>>   
>> +	vfio_pci_probe_mmaps(vdev);
>> +
>>   	return 0;
>>   }
>>   
>>   static void vfio_pci_disable(struct vfio_pci_device *vdev)
>>   {
>>   	struct pci_dev *pdev = vdev->pdev;
>> +	struct vfio_pci_dummy_resource *dummy_res, *tmp;
>>   	int i, bar;
>>   
>>   	/* Stop the device from further DMA */
>> @@ -252,6 +322,13 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
>>   		vdev->barmap[bar] = NULL;
>>   	}
>>   
>> +	list_for_each_entry_safe(dummy_res, tmp,
>> +				 &vdev->dummy_resources_list, res_next) {
>> +		list_del(&dummy_res->res_next);
>> +		release_resource(&dummy_res->resource);
>> +		kfree(dummy_res);
>> +	}
>> +
>>   	vdev->needs_reset = true;
>>   
>>   	/*
>> @@ -623,9 +700,7 @@ static long vfio_pci_ioctl(void *device_data,
>>   
>>   			info.flags = VFIO_REGION_INFO_FLAG_READ |
>>   				     VFIO_REGION_INFO_FLAG_WRITE;
>> -			if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) &&
>> -			    pci_resource_flags(pdev, info.index) &
>> -			    IORESOURCE_MEM && info.size >= PAGE_SIZE) {
>> +			if (vdev->bar_mmap_supported[info.index]) {
>>   				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
>>   				if (info.index == vdev->msix_bar) {
>>   					ret = msix_sparse_mmap_cap(vdev, &caps);
>> @@ -1049,16 +1124,16 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
>>   		return -EINVAL;
>>   	if (index >= VFIO_PCI_ROM_REGION_INDEX)
>>   		return -EINVAL;
>> -	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
>> +	if (!vdev->bar_mmap_supported[index])
>>   		return -EINVAL;
>>   
>> -	phys_len = pci_resource_len(pdev, index);
>> +	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
>>   	req_len = vma->vm_end - vma->vm_start;
>>   	pgoff = vma->vm_pgoff &
>>   		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
>>   	req_start = pgoff << PAGE_SHIFT;
>>   
>> -	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
>> +	if (req_start + req_len > phys_len)
>>   		return -EINVAL;
>>   
>>   	if (index == vdev->msix_bar) {
>> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
>> index 016c14a..2128de8 100644
>> --- a/drivers/vfio/pci/vfio_pci_private.h
>> +++ b/drivers/vfio/pci/vfio_pci_private.h
>> @@ -57,9 +57,16 @@ struct vfio_pci_region {
>>   	u32				flags;
>>   };
>>   
>> +struct vfio_pci_dummy_resource {
>> +	struct resource		resource;
>> +	int			index;
>> +	struct list_head	res_next;
>> +};
>> +
>>   struct vfio_pci_device {
>>   	struct pci_dev		*pdev;
>>   	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
>> +	bool			bar_mmap_supported[PCI_STD_RESOURCE_END + 1];
>>   	u8			*pci_config_map;
>>   	u8			*vconfig;
>>   	struct perm_bits	*msi_perm;
>> @@ -88,6 +95,7 @@ struct vfio_pci_device {
>>   	int			refcnt;
>>   	struct eventfd_ctx	*err_trigger;
>>   	struct eventfd_ctx	*req_trigger;
>> +	struct list_head	dummy_resources_list;
>>   };
>>   
>>   #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-24  2:52   ` Yongji Xie
@ 2016-06-24  3:37     ` Alex Williamson
  2016-06-24  4:06       ` Tian, Kevin
  2016-06-24 15:37       ` Yongji Xie
  0 siblings, 2 replies; 17+ messages in thread
From: Alex Williamson @ 2016-06-24  3:37 UTC (permalink / raw)
  To: Yongji Xie
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

On Fri, 24 Jun 2016 10:52:58 +0800
Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
> On 2016/6/24 0:12, Alex Williamson wrote:
> > On Mon, 30 May 2016 21:06:37 +0800
> > Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
> >> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> >> +{
> >> +	struct resource *res;
> >> +	int bar;
> >> +	struct vfio_pci_dummy_resource *dummy_res;
> >> +
> >> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
> >> +
> >> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
> >> +		res = vdev->pdev->resource + bar;
> >> +
> >> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
> >> +			goto no_mmap;
> >> +
> >> +		if (!(res->flags & IORESOURCE_MEM))
> >> +			goto no_mmap;
> >> +
> >> +		/*
> >> +		 * The PCI core shouldn't set up a resource with a
> >> +		 * type but zero size. But there may be bugs that
> >> +		 * cause us to do that.
> >> +		 */
> >> +		if (!resource_size(res))
> >> +			goto no_mmap;
> >> +
> >> +		if (resource_size(res) >= PAGE_SIZE) {
> >> +			vdev->bar_mmap_supported[bar] = true;
> >> +			continue;
> >> +		}
> >> +
> >> +		if (!(res->start & ~PAGE_MASK)) {
> >> +			/*
> >> +			 * Add a dummy resource to reserve the remainder
> >> +			 * of the exclusive page in case that hot-add
> >> +			 * device's bar is assigned into it.
> >> +			 */
> >> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
> >> +			if (dummy_res == NULL)
> >> +				goto no_mmap;
> >> +
> >> +			dummy_res->resource.start = res->end + 1;
> >> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> >> +			dummy_res->resource.flags = res->flags;
> >> +			if (request_resource(res->parent,
> >> +						&dummy_res->resource)) {
> >> +				kfree(dummy_res);
> >> +				goto no_mmap;
> >> +			}  
> > Isn't it true that request_resource() only tells us that at a given
> > point in time, no other drivers have reserved that resource?  It seems
> > like it does not guarantee that the resource isn't routed to another
> > device or that another driver won't at some point attempt to request
> > that same resource.  So for example if a user constructs their initrd
> > to bind vfio-pci to devices before other modules load, this
> > request_resource() may succeed, at the expense of drivers loaded later
> > now failing.  The behavior will depend on driver load order and we're
> > not actually insuring that the overflow resource is unused, just that
> > we got it first.  Can we do better?  Am I missing something that
> > prevents this?  Thanks,
> >
> > Alex  
> 
> Couldn't PCI resources allocator prevent this, which will find a
> empty slot in the resource tree firstly, then try to request that
> resource in allocate_resource() when a PCI device is probed.
> And I'd like to know why a PCI device driver would attempt to
> call request_resource()? Should this be done in PCI enumeration?

Hi Yongji,

Looks like most pci drivers call pci_request_regions().  From there the
call path is:

pci_request_selected_regions
  __pci_request_selected_regions
    __pci_request_region
      __request_mem_region
        __request_region
          __request_resource

We see this driver ordering issue sometimes with users attempting to
blacklist native pci drivers, trying to leave a device free for use by
vfio-pci.  If the device is a graphics card, the generic vesa or uefi
driver can request device resources causing a failure when vfio-pci
tries to request those same resources.  I expect that unless it's a
boot device, like vga in my example, the resources are not enabled
until the driver opens the device, therefore the request_resource() call
doesn't occur until that point.

For another trivial example, look at /proc/iomem as you load and unload
a driver, on my laptop with e1000e unloaded I see:

  e1200000-e121ffff : 0000:00:19.0
  e123e000-e123efff : 0000:00:19.0

When e1000e is loaded, each of these becomes claimed by the e1000e
driver:

  e1200000-e121ffff : 0000:00:19.0
    e1200000-e121ffff : e1000e
  e123e000-e123efff : 0000:00:19.0
    e123e000-e123efff : e1000e

Clearly pci core knows the resource is associated with the device, but
I don't think we're tapping into that with request_resource(), we're
just potentially stealing resources that another driver might have
claimed otherwise as I described above.  That's my suspicion at
least, feel free to show otherwise if it's incorrect.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-24  3:37     ` Alex Williamson
@ 2016-06-24  4:06       ` Tian, Kevin
  2016-06-24 15:37       ` Yongji Xie
  1 sibling, 0 replies; 17+ messages in thread
From: Tian, Kevin @ 2016-06-24  4:06 UTC (permalink / raw)
  To: Alex Williamson, Yongji Xie
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan

> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Friday, June 24, 2016 11:37 AM
> 
> On Fri, 24 Jun 2016 10:52:58 +0800
> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
> > On 2016/6/24 0:12, Alex Williamson wrote:
> > > On Mon, 30 May 2016 21:06:37 +0800
> > > Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
> > >> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> > >> +{
> > >> +	struct resource *res;
> > >> +	int bar;
> > >> +	struct vfio_pci_dummy_resource *dummy_res;
> > >> +
> > >> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
> > >> +
> > >> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
> > >> +		res = vdev->pdev->resource + bar;
> > >> +
> > >> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
> > >> +			goto no_mmap;
> > >> +
> > >> +		if (!(res->flags & IORESOURCE_MEM))
> > >> +			goto no_mmap;
> > >> +
> > >> +		/*
> > >> +		 * The PCI core shouldn't set up a resource with a
> > >> +		 * type but zero size. But there may be bugs that
> > >> +		 * cause us to do that.
> > >> +		 */
> > >> +		if (!resource_size(res))
> > >> +			goto no_mmap;
> > >> +
> > >> +		if (resource_size(res) >= PAGE_SIZE) {
> > >> +			vdev->bar_mmap_supported[bar] = true;
> > >> +			continue;
> > >> +		}
> > >> +
> > >> +		if (!(res->start & ~PAGE_MASK)) {
> > >> +			/*
> > >> +			 * Add a dummy resource to reserve the remainder
> > >> +			 * of the exclusive page in case that hot-add
> > >> +			 * device's bar is assigned into it.
> > >> +			 */
> > >> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
> > >> +			if (dummy_res == NULL)
> > >> +				goto no_mmap;
> > >> +
> > >> +			dummy_res->resource.start = res->end + 1;
> > >> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> > >> +			dummy_res->resource.flags = res->flags;
> > >> +			if (request_resource(res->parent,
> > >> +						&dummy_res->resource)) {
> > >> +				kfree(dummy_res);
> > >> +				goto no_mmap;
> > >> +			}
> > > Isn't it true that request_resource() only tells us that at a given
> > > point in time, no other drivers have reserved that resource?  It seems
> > > like it does not guarantee that the resource isn't routed to another
> > > device or that another driver won't at some point attempt to request
> > > that same resource.  So for example if a user constructs their initrd
> > > to bind vfio-pci to devices before other modules load, this
> > > request_resource() may succeed, at the expense of drivers loaded later
> > > now failing.  The behavior will depend on driver load order and we're
> > > not actually insuring that the overflow resource is unused, just that
> > > we got it first.  Can we do better?  Am I missing something that
> > > prevents this?  Thanks,
> > >
> > > Alex
> >
> > Couldn't PCI resources allocator prevent this, which will find a
> > empty slot in the resource tree firstly, then try to request that
> > resource in allocate_resource() when a PCI device is probed.
> > And I'd like to know why a PCI device driver would attempt to
> > call request_resource()? Should this be done in PCI enumeration?
> 
> Hi Yongji,
> 
> Looks like most pci drivers call pci_request_regions().  From there the
> call path is:
> 
> pci_request_selected_regions
>   __pci_request_selected_regions
>     __pci_request_region
>       __request_mem_region
>         __request_region
>           __request_resource
> 
> We see this driver ordering issue sometimes with users attempting to
> blacklist native pci drivers, trying to leave a device free for use by
> vfio-pci.  If the device is a graphics card, the generic vesa or uefi
> driver can request device resources causing a failure when vfio-pci
> tries to request those same resources.  I expect that unless it's a
> boot device, like vga in my example, the resources are not enabled
> until the driver opens the device, therefore the request_resource() call
> doesn't occur until that point.
> 
> For another trivial example, look at /proc/iomem as you load and unload
> a driver, on my laptop with e1000e unloaded I see:
> 
>   e1200000-e121ffff : 0000:00:19.0
>   e123e000-e123efff : 0000:00:19.0
> 
> When e1000e is loaded, each of these becomes claimed by the e1000e
> driver:
> 
>   e1200000-e121ffff : 0000:00:19.0
>     e1200000-e121ffff : e1000e
>   e123e000-e123efff : 0000:00:19.0
>     e123e000-e123efff : e1000e
> 
> Clearly pci core knows the resource is associated with the device, but
> I don't think we're tapping into that with request_resource(), we're
> just potentially stealing resources that another driver might have
> claimed otherwise as I described above.  That's my suspicion at
> least, feel free to show otherwise if it's incorrect.  Thanks,
> 
> Alex

It's a problem unless there is a way to trigger resource re-assignment
(e.g. pci_assign_resource) on devices which if claim on same resource
by VFIO. But doing this for every request_resource failure looks an
overkill...

Thanks
Kevin

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-24  3:37     ` Alex Williamson
  2016-06-24  4:06       ` Tian, Kevin
@ 2016-06-24 15:37       ` Yongji Xie
  2016-06-24 16:43         ` Alex Williamson
  1 sibling, 1 reply; 17+ messages in thread
From: Yongji Xie @ 2016-06-24 15:37 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

Hi, Alex

On 2016/6/24 11:37, Alex Williamson wrote:

> On Fri, 24 Jun 2016 10:52:58 +0800
> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>> On 2016/6/24 0:12, Alex Williamson wrote:
>>> On Mon, 30 May 2016 21:06:37 +0800
>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>>> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>>>> +{
>>>> +	struct resource *res;
>>>> +	int bar;
>>>> +	struct vfio_pci_dummy_resource *dummy_res;
>>>> +
>>>> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
>>>> +
>>>> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
>>>> +		res = vdev->pdev->resource + bar;
>>>> +
>>>> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
>>>> +			goto no_mmap;
>>>> +
>>>> +		if (!(res->flags & IORESOURCE_MEM))
>>>> +			goto no_mmap;
>>>> +
>>>> +		/*
>>>> +		 * The PCI core shouldn't set up a resource with a
>>>> +		 * type but zero size. But there may be bugs that
>>>> +		 * cause us to do that.
>>>> +		 */
>>>> +		if (!resource_size(res))
>>>> +			goto no_mmap;
>>>> +
>>>> +		if (resource_size(res) >= PAGE_SIZE) {
>>>> +			vdev->bar_mmap_supported[bar] = true;
>>>> +			continue;
>>>> +		}
>>>> +
>>>> +		if (!(res->start & ~PAGE_MASK)) {
>>>> +			/*
>>>> +			 * Add a dummy resource to reserve the remainder
>>>> +			 * of the exclusive page in case that hot-add
>>>> +			 * device's bar is assigned into it.
>>>> +			 */
>>>> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
>>>> +			if (dummy_res == NULL)
>>>> +				goto no_mmap;
>>>> +
>>>> +			dummy_res->resource.start = res->end + 1;
>>>> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
>>>> +			dummy_res->resource.flags = res->flags;
>>>> +			if (request_resource(res->parent,
>>>> +						&dummy_res->resource)) {
>>>> +				kfree(dummy_res);
>>>> +				goto no_mmap;
>>>> +			}
>>> Isn't it true that request_resource() only tells us that at a given
>>> point in time, no other drivers have reserved that resource?  It seems
>>> like it does not guarantee that the resource isn't routed to another
>>> device or that another driver won't at some point attempt to request
>>> that same resource.  So for example if a user constructs their initrd
>>> to bind vfio-pci to devices before other modules load, this
>>> request_resource() may succeed, at the expense of drivers loaded later
>>> now failing.  The behavior will depend on driver load order and we're
>>> not actually insuring that the overflow resource is unused, just that
>>> we got it first.  Can we do better?  Am I missing something that
>>> prevents this?  Thanks,
>>>
>>> Alex
>> Couldn't PCI resources allocator prevent this, which will find a
>> empty slot in the resource tree firstly, then try to request that
>> resource in allocate_resource() when a PCI device is probed.
>> And I'd like to know why a PCI device driver would attempt to
>> call request_resource()? Should this be done in PCI enumeration?
> Hi Yongji,
>
> Looks like most pci drivers call pci_request_regions().  From there the
> call path is:
>
> pci_request_selected_regions
>    __pci_request_selected_regions
>      __pci_request_region
>        __request_mem_region
>          __request_region
>            __request_resource
>
> We see this driver ordering issue sometimes with users attempting to
> blacklist native pci drivers, trying to leave a device free for use by
> vfio-pci.  If the device is a graphics card, the generic vesa or uefi
> driver can request device resources causing a failure when vfio-pci
> tries to request those same resources.  I expect that unless it's a
> boot device, like vga in my example, the resources are not enabled
> until the driver opens the device, therefore the request_resource() call
> doesn't occur until that point.
>
> For another trivial example, look at /proc/iomem as you load and unload
> a driver, on my laptop with e1000e unloaded I see:
>
>    e1200000-e121ffff : 0000:00:19.0
>    e123e000-e123efff : 0000:00:19.0
>
> When e1000e is loaded, each of these becomes claimed by the e1000e
> driver:
>
>    e1200000-e121ffff : 0000:00:19.0
>      e1200000-e121ffff : e1000e
>    e123e000-e123efff : 0000:00:19.0
>      e123e000-e123efff : e1000e
>
> Clearly pci core knows the resource is associated with the device, but
> I don't think we're tapping into that with request_resource(), we're
> just potentially stealing resources that another driver might have
> claimed otherwise as I described above.  That's my suspicion at
> least, feel free to show otherwise if it's incorrect.  Thanks,
>
> Alex
>

Thanks for your explanation. But I still have one question.
Shouldn't PCI core have claimed all PCI device's resources
after probing those devices. If so, request_resource() will fail
when vfio-pci try to steal resources that another driver might
request later. Anything I missed here?  Some device resources
would not be claimed in PCI core?

Thanks,
Yongji

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-24 15:37       ` Yongji Xie
@ 2016-06-24 16:43         ` Alex Williamson
  2016-06-28 10:09           ` Yongji Xie
  0 siblings, 1 reply; 17+ messages in thread
From: Alex Williamson @ 2016-06-24 16:43 UTC (permalink / raw)
  To: Yongji Xie
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

On Fri, 24 Jun 2016 23:37:02 +0800
Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:

> Hi, Alex
> 
> On 2016/6/24 11:37, Alex Williamson wrote:
> 
> > On Fri, 24 Jun 2016 10:52:58 +0800
> > Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:  
> >> On 2016/6/24 0:12, Alex Williamson wrote:  
> >>> On Mon, 30 May 2016 21:06:37 +0800
> >>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:  
> >>>> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> >>>> +{
> >>>> +	struct resource *res;
> >>>> +	int bar;
> >>>> +	struct vfio_pci_dummy_resource *dummy_res;
> >>>> +
> >>>> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
> >>>> +
> >>>> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
> >>>> +		res = vdev->pdev->resource + bar;
> >>>> +
> >>>> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
> >>>> +			goto no_mmap;
> >>>> +
> >>>> +		if (!(res->flags & IORESOURCE_MEM))
> >>>> +			goto no_mmap;
> >>>> +
> >>>> +		/*
> >>>> +		 * The PCI core shouldn't set up a resource with a
> >>>> +		 * type but zero size. But there may be bugs that
> >>>> +		 * cause us to do that.
> >>>> +		 */
> >>>> +		if (!resource_size(res))
> >>>> +			goto no_mmap;
> >>>> +
> >>>> +		if (resource_size(res) >= PAGE_SIZE) {
> >>>> +			vdev->bar_mmap_supported[bar] = true;
> >>>> +			continue;
> >>>> +		}
> >>>> +
> >>>> +		if (!(res->start & ~PAGE_MASK)) {
> >>>> +			/*
> >>>> +			 * Add a dummy resource to reserve the remainder
> >>>> +			 * of the exclusive page in case that hot-add
> >>>> +			 * device's bar is assigned into it.
> >>>> +			 */
> >>>> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
> >>>> +			if (dummy_res == NULL)
> >>>> +				goto no_mmap;
> >>>> +
> >>>> +			dummy_res->resource.start = res->end + 1;
> >>>> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> >>>> +			dummy_res->resource.flags = res->flags;
> >>>> +			if (request_resource(res->parent,
> >>>> +						&dummy_res->resource)) {
> >>>> +				kfree(dummy_res);
> >>>> +				goto no_mmap;
> >>>> +			}  
> >>> Isn't it true that request_resource() only tells us that at a given
> >>> point in time, no other drivers have reserved that resource?  It seems
> >>> like it does not guarantee that the resource isn't routed to another
> >>> device or that another driver won't at some point attempt to request
> >>> that same resource.  So for example if a user constructs their initrd
> >>> to bind vfio-pci to devices before other modules load, this
> >>> request_resource() may succeed, at the expense of drivers loaded later
> >>> now failing.  The behavior will depend on driver load order and we're
> >>> not actually insuring that the overflow resource is unused, just that
> >>> we got it first.  Can we do better?  Am I missing something that
> >>> prevents this?  Thanks,
> >>>
> >>> Alex  
> >> Couldn't PCI resources allocator prevent this, which will find a
> >> empty slot in the resource tree firstly, then try to request that
> >> resource in allocate_resource() when a PCI device is probed.
> >> And I'd like to know why a PCI device driver would attempt to
> >> call request_resource()? Should this be done in PCI enumeration?  
> > Hi Yongji,
> >
> > Looks like most pci drivers call pci_request_regions().  From there the
> > call path is:
> >
> > pci_request_selected_regions
> >    __pci_request_selected_regions
> >      __pci_request_region
> >        __request_mem_region
> >          __request_region
> >            __request_resource
> >
> > We see this driver ordering issue sometimes with users attempting to
> > blacklist native pci drivers, trying to leave a device free for use by
> > vfio-pci.  If the device is a graphics card, the generic vesa or uefi
> > driver can request device resources causing a failure when vfio-pci
> > tries to request those same resources.  I expect that unless it's a
> > boot device, like vga in my example, the resources are not enabled
> > until the driver opens the device, therefore the request_resource() call
> > doesn't occur until that point.
> >
> > For another trivial example, look at /proc/iomem as you load and unload
> > a driver, on my laptop with e1000e unloaded I see:
> >
> >    e1200000-e121ffff : 0000:00:19.0
> >    e123e000-e123efff : 0000:00:19.0
> >
> > When e1000e is loaded, each of these becomes claimed by the e1000e
> > driver:
> >
> >    e1200000-e121ffff : 0000:00:19.0
> >      e1200000-e121ffff : e1000e
> >    e123e000-e123efff : 0000:00:19.0
> >      e123e000-e123efff : e1000e
> >
> > Clearly pci core knows the resource is associated with the device, but
> > I don't think we're tapping into that with request_resource(), we're
> > just potentially stealing resources that another driver might have
> > claimed otherwise as I described above.  That's my suspicion at
> > least, feel free to show otherwise if it's incorrect.  Thanks,
> >
> > Alex
> >  
> 
> Thanks for your explanation. But I still have one question.
> Shouldn't PCI core have claimed all PCI device's resources
> after probing those devices. If so, request_resource() will fail
> when vfio-pci try to steal resources that another driver might
> request later. Anything I missed here?  Some device resources
> would not be claimed in PCI core?

Hi Yongji,

I don't know what to say, this is not how the interface currently
works.  request_resource() is a driver level interface that tries to
prevent drivers from claiming conflicting resources.  In this patch
you're trying to use it to probe whether a resource maps to another
device.  This is not what it does.  request_resource() will happily let
you claim any resource you want, so long as nobody else claimed it
first.  So the only case where the assumptions in this patch are valid
is if we can guarantee that any potentially conflicting device has a
driver loaded that has claimed those resources.  As it is here,
vfio-pci will happily attempt to request resources that might overlap
with another device and might break drivers that haven't yet had a
chance to probe their devices.  I don't think that's acceptable.
Thanks,

Alex

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-24 16:43         ` Alex Williamson
@ 2016-06-28 10:09           ` Yongji Xie
  2016-06-28 19:47             ` Alex Williamson
  0 siblings, 1 reply; 17+ messages in thread
From: Yongji Xie @ 2016-06-28 10:09 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

Hi, Alex

On 2016/6/25 0:43, Alex Williamson wrote:

> On Fri, 24 Jun 2016 23:37:02 +0800
> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>
>> Hi, Alex
>>
>> On 2016/6/24 11:37, Alex Williamson wrote:
>>
>>> On Fri, 24 Jun 2016 10:52:58 +0800
>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>>> On 2016/6/24 0:12, Alex Williamson wrote:
>>>>> On Mon, 30 May 2016 21:06:37 +0800
>>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>>>>> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>>>>>> +{
>>>>>> +	struct resource *res;
>>>>>> +	int bar;
>>>>>> +	struct vfio_pci_dummy_resource *dummy_res;
>>>>>> +
>>>>>> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
>>>>>> +
>>>>>> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
>>>>>> +		res = vdev->pdev->resource + bar;
>>>>>> +
>>>>>> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
>>>>>> +			goto no_mmap;
>>>>>> +
>>>>>> +		if (!(res->flags & IORESOURCE_MEM))
>>>>>> +			goto no_mmap;
>>>>>> +
>>>>>> +		/*
>>>>>> +		 * The PCI core shouldn't set up a resource with a
>>>>>> +		 * type but zero size. But there may be bugs that
>>>>>> +		 * cause us to do that.
>>>>>> +		 */
>>>>>> +		if (!resource_size(res))
>>>>>> +			goto no_mmap;
>>>>>> +
>>>>>> +		if (resource_size(res) >= PAGE_SIZE) {
>>>>>> +			vdev->bar_mmap_supported[bar] = true;
>>>>>> +			continue;
>>>>>> +		}
>>>>>> +
>>>>>> +		if (!(res->start & ~PAGE_MASK)) {
>>>>>> +			/*
>>>>>> +			 * Add a dummy resource to reserve the remainder
>>>>>> +			 * of the exclusive page in case that hot-add
>>>>>> +			 * device's bar is assigned into it.
>>>>>> +			 */
>>>>>> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
>>>>>> +			if (dummy_res == NULL)
>>>>>> +				goto no_mmap;
>>>>>> +
>>>>>> +			dummy_res->resource.start = res->end + 1;
>>>>>> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
>>>>>> +			dummy_res->resource.flags = res->flags;
>>>>>> +			if (request_resource(res->parent,
>>>>>> +						&dummy_res->resource)) {
>>>>>> +				kfree(dummy_res);
>>>>>> +				goto no_mmap;
>>>>>> +			}
>>>>> Isn't it true that request_resource() only tells us that at a given
>>>>> point in time, no other drivers have reserved that resource?  It seems
>>>>> like it does not guarantee that the resource isn't routed to another
>>>>> device or that another driver won't at some point attempt to request
>>>>> that same resource.  So for example if a user constructs their initrd
>>>>> to bind vfio-pci to devices before other modules load, this
>>>>> request_resource() may succeed, at the expense of drivers loaded later
>>>>> now failing.  The behavior will depend on driver load order and we're
>>>>> not actually insuring that the overflow resource is unused, just that
>>>>> we got it first.  Can we do better?  Am I missing something that
>>>>> prevents this?  Thanks,
>>>>>
>>>>> Alex
>>>> Couldn't PCI resources allocator prevent this, which will find a
>>>> empty slot in the resource tree firstly, then try to request that
>>>> resource in allocate_resource() when a PCI device is probed.
>>>> And I'd like to know why a PCI device driver would attempt to
>>>> call request_resource()? Should this be done in PCI enumeration?
>>> Hi Yongji,
>>>
>>> Looks like most pci drivers call pci_request_regions().  From there the
>>> call path is:
>>>
>>> pci_request_selected_regions
>>>     __pci_request_selected_regions
>>>       __pci_request_region
>>>         __request_mem_region
>>>           __request_region
>>>             __request_resource
>>>
>>> We see this driver ordering issue sometimes with users attempting to
>>> blacklist native pci drivers, trying to leave a device free for use by
>>> vfio-pci.  If the device is a graphics card, the generic vesa or uefi
>>> driver can request device resources causing a failure when vfio-pci
>>> tries to request those same resources.  I expect that unless it's a
>>> boot device, like vga in my example, the resources are not enabled
>>> until the driver opens the device, therefore the request_resource() call
>>> doesn't occur until that point.
>>>
>>> For another trivial example, look at /proc/iomem as you load and unload
>>> a driver, on my laptop with e1000e unloaded I see:
>>>
>>>     e1200000-e121ffff : 0000:00:19.0
>>>     e123e000-e123efff : 0000:00:19.0
>>>
>>> When e1000e is loaded, each of these becomes claimed by the e1000e
>>> driver:
>>>
>>>     e1200000-e121ffff : 0000:00:19.0
>>>       e1200000-e121ffff : e1000e
>>>     e123e000-e123efff : 0000:00:19.0
>>>       e123e000-e123efff : e1000e
>>>
>>> Clearly pci core knows the resource is associated with the device, but
>>> I don't think we're tapping into that with request_resource(), we're
>>> just potentially stealing resources that another driver might have
>>> claimed otherwise as I described above.  That's my suspicion at
>>> least, feel free to show otherwise if it's incorrect.  Thanks,
>>>
>>> Alex
>>>   
>> Thanks for your explanation. But I still have one question.
>> Shouldn't PCI core have claimed all PCI device's resources
>> after probing those devices. If so, request_resource() will fail
>> when vfio-pci try to steal resources that another driver might
>> request later. Anything I missed here?  Some device resources
>> would not be claimed in PCI core?
> Hi Yongji,
>
> I don't know what to say, this is not how the interface currently
> works.  request_resource() is a driver level interface that tries to
> prevent drivers from claiming conflicting resources.  In this patch
> you're trying to use it to probe whether a resource maps to another
> device.  This is not what it does.  request_resource() will happily let
> you claim any resource you want, so long as nobody else claimed it
> first.  So the only case where the assumptions in this patch are valid
> is if we can guarantee that any potentially conflicting device has a
> driver loaded that has claimed those resources.  As it is here,
> vfio-pci will happily attempt to request resources that might overlap
> with another device and might break drivers that haven't yet had a
> chance to probe their devices.  I don't think that's acceptable.
> Thanks,
>
> Alex
>

I'm trying to get your point. Let me give an example here.
I'm not sure whether my understanding is right. Please
point it out if I'm wrong.

We assume that there are two PCI devices like this:

240000000000-24feffffffff : /pciex@3fffe40400000
   240000000000-2400ffffffff : PCI Bus 0002:01
     240000000000-240000007fff : 0002:01:00.0
       240000000000-240000007fff : vfio-pci
     240000008000-24000000ffff : 0002:01:01.0
       240000008000-24000000ffff : lpfc

Do you mean vfio-pci driver will succeed in requesting
dummy_res: [240000008000-24000000ffff] (PAGE_SIZE is 64K)
if it is loaded before lpfc driver? Like this:

240000000000-24feffffffff : /pciex@3fffe40400000
   240000000000-2400ffffffff : PCI Bus 0002:01
     240000000000-240000007fff : 0002:01:00.0
       240000000000-240000007fff : vfio-pci
     240000008000-24000000ffff : 0002:01:01.0
       240000008000-24000000ffff : <BAD>    --> vfio-pci call 
request_resource()

Then lpfc driver will fail when it attempts to call
pci_request_regions() later.

But is it possible that the dummy_res become the child of
the res: 0002:01:01.0? Wouldn't request_resource() fail when
it found parent res: PCI Bus 0002:01 already have conflict
child res: 0002:01:01.0.

And I think the case that request_resource() will succeed
should like this:

240000000000-24feffffffff : /pciex@3fffe40400000
   240000000000-2400ffffffff : PCI Bus 0002:01
     240000000000-240000007fff : 0002:01:00.0
     240000010000-240000017fff : 0002:01:01.0

There is a mem hole: [240000008000-24000000ffff] after
PCI probing.  After loading drivers, the resources tree
will be:

240000000000-24feffffffff : /pciex@3fffe40400000
   240000000000-2400ffffffff : PCI Bus 0002:01
     240000000000-240000007fff : 0002:01:00.0
       240000000000-240000007fff : vfio-pci
     240000008000-24000000ffff : <BAD>    ---> vfio-pci call 
request_resource()
     240000010000-240000017fff : 0002:01:01.0
       240000010000-240000017fff : lpfc

Thanks,
Yongji

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-28 10:09           ` Yongji Xie
@ 2016-06-28 19:47             ` Alex Williamson
  2016-06-29 20:03               ` Alex Williamson
  0 siblings, 1 reply; 17+ messages in thread
From: Alex Williamson @ 2016-06-28 19:47 UTC (permalink / raw)
  To: Yongji Xie
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

On Tue, 28 Jun 2016 18:09:46 +0800
Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:

> Hi, Alex
> 
> On 2016/6/25 0:43, Alex Williamson wrote:
> 
> > On Fri, 24 Jun 2016 23:37:02 +0800
> > Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
> >  
> >> Hi, Alex
> >>
> >> On 2016/6/24 11:37, Alex Williamson wrote:
> >>  
> >>> On Fri, 24 Jun 2016 10:52:58 +0800
> >>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:  
> >>>> On 2016/6/24 0:12, Alex Williamson wrote:  
> >>>>> On Mon, 30 May 2016 21:06:37 +0800
> >>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:  
> >>>>>> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> >>>>>> +{
> >>>>>> +	struct resource *res;
> >>>>>> +	int bar;
> >>>>>> +	struct vfio_pci_dummy_resource *dummy_res;
> >>>>>> +
> >>>>>> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
> >>>>>> +
> >>>>>> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
> >>>>>> +		res = vdev->pdev->resource + bar;
> >>>>>> +
> >>>>>> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
> >>>>>> +			goto no_mmap;
> >>>>>> +
> >>>>>> +		if (!(res->flags & IORESOURCE_MEM))
> >>>>>> +			goto no_mmap;
> >>>>>> +
> >>>>>> +		/*
> >>>>>> +		 * The PCI core shouldn't set up a resource with a
> >>>>>> +		 * type but zero size. But there may be bugs that
> >>>>>> +		 * cause us to do that.
> >>>>>> +		 */
> >>>>>> +		if (!resource_size(res))
> >>>>>> +			goto no_mmap;
> >>>>>> +
> >>>>>> +		if (resource_size(res) >= PAGE_SIZE) {
> >>>>>> +			vdev->bar_mmap_supported[bar] = true;
> >>>>>> +			continue;
> >>>>>> +		}
> >>>>>> +
> >>>>>> +		if (!(res->start & ~PAGE_MASK)) {
> >>>>>> +			/*
> >>>>>> +			 * Add a dummy resource to reserve the remainder
> >>>>>> +			 * of the exclusive page in case that hot-add
> >>>>>> +			 * device's bar is assigned into it.
> >>>>>> +			 */
> >>>>>> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
> >>>>>> +			if (dummy_res == NULL)
> >>>>>> +				goto no_mmap;
> >>>>>> +
> >>>>>> +			dummy_res->resource.start = res->end + 1;
> >>>>>> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> >>>>>> +			dummy_res->resource.flags = res->flags;
> >>>>>> +			if (request_resource(res->parent,
> >>>>>> +						&dummy_res->resource)) {
> >>>>>> +				kfree(dummy_res);
> >>>>>> +				goto no_mmap;
> >>>>>> +			}  
> >>>>> Isn't it true that request_resource() only tells us that at a given
> >>>>> point in time, no other drivers have reserved that resource?  It seems
> >>>>> like it does not guarantee that the resource isn't routed to another
> >>>>> device or that another driver won't at some point attempt to request
> >>>>> that same resource.  So for example if a user constructs their initrd
> >>>>> to bind vfio-pci to devices before other modules load, this
> >>>>> request_resource() may succeed, at the expense of drivers loaded later
> >>>>> now failing.  The behavior will depend on driver load order and we're
> >>>>> not actually insuring that the overflow resource is unused, just that
> >>>>> we got it first.  Can we do better?  Am I missing something that
> >>>>> prevents this?  Thanks,
> >>>>>
> >>>>> Alex  
> >>>> Couldn't PCI resources allocator prevent this, which will find a
> >>>> empty slot in the resource tree firstly, then try to request that
> >>>> resource in allocate_resource() when a PCI device is probed.
> >>>> And I'd like to know why a PCI device driver would attempt to
> >>>> call request_resource()? Should this be done in PCI enumeration?  
> >>> Hi Yongji,
> >>>
> >>> Looks like most pci drivers call pci_request_regions().  From there the
> >>> call path is:
> >>>
> >>> pci_request_selected_regions
> >>>     __pci_request_selected_regions
> >>>       __pci_request_region
> >>>         __request_mem_region
> >>>           __request_region
> >>>             __request_resource
> >>>
> >>> We see this driver ordering issue sometimes with users attempting to
> >>> blacklist native pci drivers, trying to leave a device free for use by
> >>> vfio-pci.  If the device is a graphics card, the generic vesa or uefi
> >>> driver can request device resources causing a failure when vfio-pci
> >>> tries to request those same resources.  I expect that unless it's a
> >>> boot device, like vga in my example, the resources are not enabled
> >>> until the driver opens the device, therefore the request_resource() call
> >>> doesn't occur until that point.
> >>>
> >>> For another trivial example, look at /proc/iomem as you load and unload
> >>> a driver, on my laptop with e1000e unloaded I see:
> >>>
> >>>     e1200000-e121ffff : 0000:00:19.0
> >>>     e123e000-e123efff : 0000:00:19.0
> >>>
> >>> When e1000e is loaded, each of these becomes claimed by the e1000e
> >>> driver:
> >>>
> >>>     e1200000-e121ffff : 0000:00:19.0
> >>>       e1200000-e121ffff : e1000e
> >>>     e123e000-e123efff : 0000:00:19.0
> >>>       e123e000-e123efff : e1000e
> >>>
> >>> Clearly pci core knows the resource is associated with the device, but
> >>> I don't think we're tapping into that with request_resource(), we're
> >>> just potentially stealing resources that another driver might have
> >>> claimed otherwise as I described above.  That's my suspicion at
> >>> least, feel free to show otherwise if it's incorrect.  Thanks,
> >>>
> >>> Alex
> >>>     
> >> Thanks for your explanation. But I still have one question.
> >> Shouldn't PCI core have claimed all PCI device's resources
> >> after probing those devices. If so, request_resource() will fail
> >> when vfio-pci try to steal resources that another driver might
> >> request later. Anything I missed here?  Some device resources
> >> would not be claimed in PCI core?  
> > Hi Yongji,
> >
> > I don't know what to say, this is not how the interface currently
> > works.  request_resource() is a driver level interface that tries to
> > prevent drivers from claiming conflicting resources.  In this patch
> > you're trying to use it to probe whether a resource maps to another
> > device.  This is not what it does.  request_resource() will happily let
> > you claim any resource you want, so long as nobody else claimed it
> > first.  So the only case where the assumptions in this patch are valid
> > is if we can guarantee that any potentially conflicting device has a
> > driver loaded that has claimed those resources.  As it is here,
> > vfio-pci will happily attempt to request resources that might overlap
> > with another device and might break drivers that haven't yet had a
> > chance to probe their devices.  I don't think that's acceptable.
> > Thanks,
> >
> > Alex
> >  
> 
> I'm trying to get your point. Let me give an example here.
> I'm not sure whether my understanding is right. Please
> point it out if I'm wrong.
> 
> We assume that there are two PCI devices like this:
> 
> 240000000000-24feffffffff : /pciex@3fffe40400000
>    240000000000-2400ffffffff : PCI Bus 0002:01
>      240000000000-240000007fff : 0002:01:00.0
>        240000000000-240000007fff : vfio-pci
>      240000008000-24000000ffff : 0002:01:01.0
>        240000008000-24000000ffff : lpfc
> 
> Do you mean vfio-pci driver will succeed in requesting
> dummy_res: [240000008000-24000000ffff] (PAGE_SIZE is 64K)
> if it is loaded before lpfc driver? Like this:
> 
> 240000000000-24feffffffff : /pciex@3fffe40400000
>    240000000000-2400ffffffff : PCI Bus 0002:01
>      240000000000-240000007fff : 0002:01:00.0
>        240000000000-240000007fff : vfio-pci
>      240000008000-24000000ffff : 0002:01:01.0
>        240000008000-24000000ffff : <BAD>    --> vfio-pci call 
> request_resource()
> 
> Then lpfc driver will fail when it attempts to call
> pci_request_regions() later.

Yes, that is my supposition.
 
> But is it possible that the dummy_res become the child of
> the res: 0002:01:01.0? Wouldn't request_resource() fail when
> it found parent res: PCI Bus 0002:01 already have conflict
> child res: 0002:01:01.0.
> 
> And I think the case that request_resource() will succeed
> should like this:
> 
> 240000000000-24feffffffff : /pciex@3fffe40400000
>    240000000000-2400ffffffff : PCI Bus 0002:01
>      240000000000-240000007fff : 0002:01:00.0
>      240000010000-240000017fff : 0002:01:01.0
> 
> There is a mem hole: [240000008000-24000000ffff] after
> PCI probing.  After loading drivers, the resources tree
> will be:
> 
> 240000000000-24feffffffff : /pciex@3fffe40400000
>    240000000000-2400ffffffff : PCI Bus 0002:01
>      240000000000-240000007fff : 0002:01:00.0
>        240000000000-240000007fff : vfio-pci
>      240000008000-24000000ffff : <BAD>    ---> vfio-pci call 
> request_resource()
>      240000010000-240000017fff : 0002:01:01.0
>        240000010000-240000017fff : lpfc

Ok, let's stop guessing about this.  I modified your patch as follows
so I could easily test it on a 4k host:

--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -110,6 +110,9 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
 	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
 }
 
+#define VFIO_64K_PAGE_SIZE (64*1024)
+#define VFIO_64K_PAGE_MASK (~(VFIO_64K_PAGE_SIZE-1))
+
 static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
 {
 	struct resource *res;
@@ -135,12 +138,13 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
 		if (!resource_size(res))
 			goto no_mmap;
 
-		if (resource_size(res) >= PAGE_SIZE) {
+		if (resource_size(res) >= VFIO_64K_PAGE_SIZE) {
 			vdev->bar_mmap_supported[bar] = true;
 			continue;
 		}
 
-		if (!(res->start & ~PAGE_MASK)) {
+		if (!(res->start & ~VFIO_64K_PAGE_MASK)) {
+			int ret;
 			/*
 			 * Add a dummy resource to reserve the remainder
 			 * of the exclusive page in case that hot-add
@@ -151,10 +155,12 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
 				goto no_mmap;
 
 			dummy_res->resource.start = res->end + 1;
-			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
+			dummy_res->resource.end = res->start + VFIO_64K_PAGE_SIZE - 1;
 			dummy_res->resource.flags = res->flags;
-			if (request_resource(res->parent,
-						&dummy_res->resource)) {
+			ret = request_resource(res->parent,
+						&dummy_res->resource);
+			if (ret) {
+dev_info(&vdev->pdev->dev, "Failed to request_resource %lx-%lx (%d)\n", dummy_res->resource.start, dummy_res->resource.end, ret);
 				kfree(dummy_res);
 				goto no_mmap;
 			}

IOW, enforce 64k for mmap regardless of PAGE_SIZE.  Then I find a
system configuration to test it:

  ee400000-ef4fffff : PCI Bus 0000:07
    ef480000-ef49ffff : 0000:07:00.0
      ef480000-ef483fff : 0000:08:10.0
      ef484000-ef487fff : 0000:08:10.2
      ef488000-ef48bfff : 0000:08:10.4
      ef48c000-ef48ffff : 0000:08:10.6
      ef490000-ef493fff : 0000:08:11.0
      ef494000-ef497fff : 0000:08:11.2
      ef498000-ef49bfff : 0000:08:11.4
    ef4a0000-ef4bffff : 0000:07:00.0
      ef4a0000-ef4a3fff : 0000:08:10.0
      ef4a4000-ef4a7fff : 0000:08:10.2
      ef4a8000-ef4abfff : 0000:08:10.4
      ef4ac000-ef4affff : 0000:08:10.6
      ef4b0000-ef4b3fff : 0000:08:11.0
      ef4b4000-ef4b7fff : 0000:08:11.2
      ef4b8000-ef4bbfff : 0000:08:11.4

This is an 82576 NIC where each VF has two 16k BARs (0 & 3), where all
the VF BAR0s are in a contiguous range and all the VF BAR3s are in a
separate contiguous range.  The igbvf driver is not loaded, so the
other resources are free to be claimed, what happens?

It looks like you're right, the request_resource() fails with:

vfio-pci 0000:08:10.0: Failed to request_resource ef4a4000-ef4affff (-16)
vfio-pci 0000:08:10.0: Failed to request_resource ef484000-ef48ffff (-16)

So we get back -EBUSY which means we hit a conflict.  I would have
expected that this means our res->parent that we're using for
request_resource() is only, for instance, ef480000-ef483fff (ie. the
BAR itself) thus our request for ef484000-ef48ffff exceeds the end of
the parent, but adding the parent resource range to the dev_info(), it
actually shows the range being ef480000-ef49ffff, so the parent is
actually the 07:00.0 resource.  In fact, we can't even use
request_resource() like this to claim the BAR itself, which is why we
use pci_request_selected_regions(), which allows conflicts, putting the
requested resource at the leaf of the tree.

So I guess I retract this concern about the use of request_resource(),
it does seem to behave as intended.  Unless I can spot anything else or
other reviewers have comments, I'll queue this into my next branch for
v4.8.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-28 19:47             ` Alex Williamson
@ 2016-06-29 20:03               ` Alex Williamson
  2016-06-30  2:40                 ` Yongji Xie
  0 siblings, 1 reply; 17+ messages in thread
From: Alex Williamson @ 2016-06-29 20:03 UTC (permalink / raw)
  To: Yongji Xie
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

On Tue, 28 Jun 2016 13:47:23 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Tue, 28 Jun 2016 18:09:46 +0800
> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
> 
> > Hi, Alex
> > 
> > On 2016/6/25 0:43, Alex Williamson wrote:
> >   
> > > On Fri, 24 Jun 2016 23:37:02 +0800
> > > Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
> > >    
> > >> Hi, Alex
> > >>
> > >> On 2016/6/24 11:37, Alex Williamson wrote:
> > >>    
> > >>> On Fri, 24 Jun 2016 10:52:58 +0800
> > >>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:    
> > >>>> On 2016/6/24 0:12, Alex Williamson wrote:    
> > >>>>> On Mon, 30 May 2016 21:06:37 +0800
> > >>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:    
> > >>>>>> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> > >>>>>> +{
> > >>>>>> +	struct resource *res;
> > >>>>>> +	int bar;
> > >>>>>> +	struct vfio_pci_dummy_resource *dummy_res;
> > >>>>>> +
> > >>>>>> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
> > >>>>>> +
> > >>>>>> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
> > >>>>>> +		res = vdev->pdev->resource + bar;
> > >>>>>> +
> > >>>>>> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
> > >>>>>> +			goto no_mmap;
> > >>>>>> +
> > >>>>>> +		if (!(res->flags & IORESOURCE_MEM))
> > >>>>>> +			goto no_mmap;
> > >>>>>> +
> > >>>>>> +		/*
> > >>>>>> +		 * The PCI core shouldn't set up a resource with a
> > >>>>>> +		 * type but zero size. But there may be bugs that
> > >>>>>> +		 * cause us to do that.
> > >>>>>> +		 */
> > >>>>>> +		if (!resource_size(res))
> > >>>>>> +			goto no_mmap;
> > >>>>>> +
> > >>>>>> +		if (resource_size(res) >= PAGE_SIZE) {
> > >>>>>> +			vdev->bar_mmap_supported[bar] = true;
> > >>>>>> +			continue;
> > >>>>>> +		}
> > >>>>>> +
> > >>>>>> +		if (!(res->start & ~PAGE_MASK)) {
> > >>>>>> +			/*
> > >>>>>> +			 * Add a dummy resource to reserve the remainder
> > >>>>>> +			 * of the exclusive page in case that hot-add
> > >>>>>> +			 * device's bar is assigned into it.
> > >>>>>> +			 */
> > >>>>>> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
> > >>>>>> +			if (dummy_res == NULL)
> > >>>>>> +				goto no_mmap;
> > >>>>>> +
> > >>>>>> +			dummy_res->resource.start = res->end + 1;
> > >>>>>> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> > >>>>>> +			dummy_res->resource.flags = res->flags;
> > >>>>>> +			if (request_resource(res->parent,
> > >>>>>> +						&dummy_res->resource)) {
> > >>>>>> +				kfree(dummy_res);
> > >>>>>> +				goto no_mmap;
> > >>>>>> +			}    
> > >>>>> Isn't it true that request_resource() only tells us that at a given
> > >>>>> point in time, no other drivers have reserved that resource?  It seems
> > >>>>> like it does not guarantee that the resource isn't routed to another
> > >>>>> device or that another driver won't at some point attempt to request
> > >>>>> that same resource.  So for example if a user constructs their initrd
> > >>>>> to bind vfio-pci to devices before other modules load, this
> > >>>>> request_resource() may succeed, at the expense of drivers loaded later
> > >>>>> now failing.  The behavior will depend on driver load order and we're
> > >>>>> not actually insuring that the overflow resource is unused, just that
> > >>>>> we got it first.  Can we do better?  Am I missing something that
> > >>>>> prevents this?  Thanks,
> > >>>>>
> > >>>>> Alex    
> > >>>> Couldn't PCI resources allocator prevent this, which will find a
> > >>>> empty slot in the resource tree firstly, then try to request that
> > >>>> resource in allocate_resource() when a PCI device is probed.
> > >>>> And I'd like to know why a PCI device driver would attempt to
> > >>>> call request_resource()? Should this be done in PCI enumeration?    
> > >>> Hi Yongji,
> > >>>
> > >>> Looks like most pci drivers call pci_request_regions().  From there the
> > >>> call path is:
> > >>>
> > >>> pci_request_selected_regions
> > >>>     __pci_request_selected_regions
> > >>>       __pci_request_region
> > >>>         __request_mem_region
> > >>>           __request_region
> > >>>             __request_resource
> > >>>
> > >>> We see this driver ordering issue sometimes with users attempting to
> > >>> blacklist native pci drivers, trying to leave a device free for use by
> > >>> vfio-pci.  If the device is a graphics card, the generic vesa or uefi
> > >>> driver can request device resources causing a failure when vfio-pci
> > >>> tries to request those same resources.  I expect that unless it's a
> > >>> boot device, like vga in my example, the resources are not enabled
> > >>> until the driver opens the device, therefore the request_resource() call
> > >>> doesn't occur until that point.
> > >>>
> > >>> For another trivial example, look at /proc/iomem as you load and unload
> > >>> a driver, on my laptop with e1000e unloaded I see:
> > >>>
> > >>>     e1200000-e121ffff : 0000:00:19.0
> > >>>     e123e000-e123efff : 0000:00:19.0
> > >>>
> > >>> When e1000e is loaded, each of these becomes claimed by the e1000e
> > >>> driver:
> > >>>
> > >>>     e1200000-e121ffff : 0000:00:19.0
> > >>>       e1200000-e121ffff : e1000e
> > >>>     e123e000-e123efff : 0000:00:19.0
> > >>>       e123e000-e123efff : e1000e
> > >>>
> > >>> Clearly pci core knows the resource is associated with the device, but
> > >>> I don't think we're tapping into that with request_resource(), we're
> > >>> just potentially stealing resources that another driver might have
> > >>> claimed otherwise as I described above.  That's my suspicion at
> > >>> least, feel free to show otherwise if it's incorrect.  Thanks,
> > >>>
> > >>> Alex
> > >>>       
> > >> Thanks for your explanation. But I still have one question.
> > >> Shouldn't PCI core have claimed all PCI device's resources
> > >> after probing those devices. If so, request_resource() will fail
> > >> when vfio-pci try to steal resources that another driver might
> > >> request later. Anything I missed here?  Some device resources
> > >> would not be claimed in PCI core?    
> > > Hi Yongji,
> > >
> > > I don't know what to say, this is not how the interface currently
> > > works.  request_resource() is a driver level interface that tries to
> > > prevent drivers from claiming conflicting resources.  In this patch
> > > you're trying to use it to probe whether a resource maps to another
> > > device.  This is not what it does.  request_resource() will happily let
> > > you claim any resource you want, so long as nobody else claimed it
> > > first.  So the only case where the assumptions in this patch are valid
> > > is if we can guarantee that any potentially conflicting device has a
> > > driver loaded that has claimed those resources.  As it is here,
> > > vfio-pci will happily attempt to request resources that might overlap
> > > with another device and might break drivers that haven't yet had a
> > > chance to probe their devices.  I don't think that's acceptable.
> > > Thanks,
> > >
> > > Alex
> > >    
> > 
> > I'm trying to get your point. Let me give an example here.
> > I'm not sure whether my understanding is right. Please
> > point it out if I'm wrong.
> > 
> > We assume that there are two PCI devices like this:
> > 
> > 240000000000-24feffffffff : /pciex@3fffe40400000
> >    240000000000-2400ffffffff : PCI Bus 0002:01
> >      240000000000-240000007fff : 0002:01:00.0
> >        240000000000-240000007fff : vfio-pci
> >      240000008000-24000000ffff : 0002:01:01.0
> >        240000008000-24000000ffff : lpfc
> > 
> > Do you mean vfio-pci driver will succeed in requesting
> > dummy_res: [240000008000-24000000ffff] (PAGE_SIZE is 64K)
> > if it is loaded before lpfc driver? Like this:
> > 
> > 240000000000-24feffffffff : /pciex@3fffe40400000
> >    240000000000-2400ffffffff : PCI Bus 0002:01
> >      240000000000-240000007fff : 0002:01:00.0
> >        240000000000-240000007fff : vfio-pci
> >      240000008000-24000000ffff : 0002:01:01.0
> >        240000008000-24000000ffff : <BAD>    --> vfio-pci call 
> > request_resource()
> > 
> > Then lpfc driver will fail when it attempts to call
> > pci_request_regions() later.  
> 
> Yes, that is my supposition.
>  
> > But is it possible that the dummy_res become the child of
> > the res: 0002:01:01.0? Wouldn't request_resource() fail when
> > it found parent res: PCI Bus 0002:01 already have conflict
> > child res: 0002:01:01.0.
> > 
> > And I think the case that request_resource() will succeed
> > should like this:
> > 
> > 240000000000-24feffffffff : /pciex@3fffe40400000
> >    240000000000-2400ffffffff : PCI Bus 0002:01
> >      240000000000-240000007fff : 0002:01:00.0
> >      240000010000-240000017fff : 0002:01:01.0
> > 
> > There is a mem hole: [240000008000-24000000ffff] after
> > PCI probing.  After loading drivers, the resources tree
> > will be:
> > 
> > 240000000000-24feffffffff : /pciex@3fffe40400000
> >    240000000000-2400ffffffff : PCI Bus 0002:01
> >      240000000000-240000007fff : 0002:01:00.0
> >        240000000000-240000007fff : vfio-pci
> >      240000008000-24000000ffff : <BAD>    ---> vfio-pci call 
> > request_resource()
> >      240000010000-240000017fff : 0002:01:01.0
> >        240000010000-240000017fff : lpfc  
> 
> Ok, let's stop guessing about this.  I modified your patch as follows
> so I could easily test it on a 4k host:
> 
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -110,6 +110,9 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
>  	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
>  }
>  
> +#define VFIO_64K_PAGE_SIZE (64*1024)
> +#define VFIO_64K_PAGE_MASK (~(VFIO_64K_PAGE_SIZE-1))
> +
>  static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>  {
>  	struct resource *res;
> @@ -135,12 +138,13 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>  		if (!resource_size(res))
>  			goto no_mmap;
>  
> -		if (resource_size(res) >= PAGE_SIZE) {
> +		if (resource_size(res) >= VFIO_64K_PAGE_SIZE) {
>  			vdev->bar_mmap_supported[bar] = true;
>  			continue;
>  		}
>  
> -		if (!(res->start & ~PAGE_MASK)) {
> +		if (!(res->start & ~VFIO_64K_PAGE_MASK)) {
> +			int ret;
>  			/*
>  			 * Add a dummy resource to reserve the remainder
>  			 * of the exclusive page in case that hot-add
> @@ -151,10 +155,12 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>  				goto no_mmap;
>  
>  			dummy_res->resource.start = res->end + 1;
> -			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> +			dummy_res->resource.end = res->start + VFIO_64K_PAGE_SIZE - 1;
>  			dummy_res->resource.flags = res->flags;
> -			if (request_resource(res->parent,
> -						&dummy_res->resource)) {
> +			ret = request_resource(res->parent,
> +						&dummy_res->resource);
> +			if (ret) {
> +dev_info(&vdev->pdev->dev, "Failed to request_resource %lx-%lx (%d)\n", dummy_res->resource.start, dummy_res->resource.end, ret);
>  				kfree(dummy_res);
>  				goto no_mmap;
>  			}
> 
> IOW, enforce 64k for mmap regardless of PAGE_SIZE.  Then I find a
> system configuration to test it:
> 
>   ee400000-ef4fffff : PCI Bus 0000:07
>     ef480000-ef49ffff : 0000:07:00.0
>       ef480000-ef483fff : 0000:08:10.0
>       ef484000-ef487fff : 0000:08:10.2
>       ef488000-ef48bfff : 0000:08:10.4
>       ef48c000-ef48ffff : 0000:08:10.6
>       ef490000-ef493fff : 0000:08:11.0
>       ef494000-ef497fff : 0000:08:11.2
>       ef498000-ef49bfff : 0000:08:11.4
>     ef4a0000-ef4bffff : 0000:07:00.0
>       ef4a0000-ef4a3fff : 0000:08:10.0
>       ef4a4000-ef4a7fff : 0000:08:10.2
>       ef4a8000-ef4abfff : 0000:08:10.4
>       ef4ac000-ef4affff : 0000:08:10.6
>       ef4b0000-ef4b3fff : 0000:08:11.0
>       ef4b4000-ef4b7fff : 0000:08:11.2
>       ef4b8000-ef4bbfff : 0000:08:11.4
> 
> This is an 82576 NIC where each VF has two 16k BARs (0 & 3), where all
> the VF BAR0s are in a contiguous range and all the VF BAR3s are in a
> separate contiguous range.  The igbvf driver is not loaded, so the
> other resources are free to be claimed, what happens?
> 
> It looks like you're right, the request_resource() fails with:
> 
> vfio-pci 0000:08:10.0: Failed to request_resource ef4a4000-ef4affff (-16)
> vfio-pci 0000:08:10.0: Failed to request_resource ef484000-ef48ffff (-16)
> 
> So we get back -EBUSY which means we hit a conflict.  I would have
> expected that this means our res->parent that we're using for
> request_resource() is only, for instance, ef480000-ef483fff (ie. the
> BAR itself) thus our request for ef484000-ef48ffff exceeds the end of
> the parent, but adding the parent resource range to the dev_info(), it
> actually shows the range being ef480000-ef49ffff, so the parent is
> actually the 07:00.0 resource.  In fact, we can't even use
> request_resource() like this to claim the BAR itself, which is why we
> use pci_request_selected_regions(), which allows conflicts, putting the
> requested resource at the leaf of the tree.
> 
> So I guess I retract this concern about the use of request_resource(),
> it does seem to behave as intended.  Unless I can spot anything else or
> other reviewers have comments, I'll queue this into my next branch for
> v4.8.  Thanks,


Ok, one more test, I found that I have access to the following USB
devices:

00:1a.0 USB controller: Intel Corporation 6 Series/C200 Series Chipset Family USB Enhanced Host Controller #2 (rev 05) (prog-if 20 [EHCI])
	Region 0: Memory at f7a08000 (32-bit, non-prefetchable) [size=1K]

00:1d.0 USB controller: Intel Corporation 6 Series/C200 Series Chipset Family USB Enhanced Host Controller #1 (rev 05) (prog-if 20 [EHCI])
	Region 0: Memory at f7a07000 (32-bit, non-prefetchable) [size=1K]

These are nicely mapped such that vfio-pci can claim the residual space
from the page, which results in the following in /proc/iomem:

  f7a07000-f7a073ff : 0000:00:1d.0
    f7a07000-f7a073ff : vfio
  f7a07400-f7a07fff : <BAD>
  f7a08000-f7a083ff : 0000:00:1a.0
    f7a08000-f7a083ff : vfio
  f7a08400-f7a08fff : <BAD>

I should have looked more closely at your previous reply, I didn't
think that "<BAD>" was literally the owner of these dummy resources.
Please fix this to report something that isn't going frighten users
and make small children cry.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-29 20:03               ` Alex Williamson
@ 2016-06-30  2:40                 ` Yongji Xie
  2016-06-30  2:53                   ` Alex Williamson
  0 siblings, 1 reply; 17+ messages in thread
From: Yongji Xie @ 2016-06-30  2:40 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

Hi Alex,

On 2016/6/30 4:03, Alex Williamson wrote:

> On Tue, 28 Jun 2016 13:47:23 -0600
> Alex Williamson <alex.williamson@redhat.com> wrote:
>
>> On Tue, 28 Jun 2016 18:09:46 +0800
>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>
>>> Hi, Alex
>>>
>>> On 2016/6/25 0:43, Alex Williamson wrote:
>>>    
>>>> On Fri, 24 Jun 2016 23:37:02 +0800
>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>>>     
>>>>> Hi, Alex
>>>>>
>>>>> On 2016/6/24 11:37, Alex Williamson wrote:
>>>>>     
>>>>>> On Fri, 24 Jun 2016 10:52:58 +0800
>>>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>>>>>> On 2016/6/24 0:12, Alex Williamson wrote:
>>>>>>>> On Mon, 30 May 2016 21:06:37 +0800
>>>>>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>>>>>>>> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>>>>>>>>> +{
>>>>>>>>> +	struct resource *res;
>>>>>>>>> +	int bar;
>>>>>>>>> +	struct vfio_pci_dummy_resource *dummy_res;
>>>>>>>>> +
>>>>>>>>> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
>>>>>>>>> +
>>>>>>>>> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
>>>>>>>>> +		res = vdev->pdev->resource + bar;
>>>>>>>>> +
>>>>>>>>> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
>>>>>>>>> +			goto no_mmap;
>>>>>>>>> +
>>>>>>>>> +		if (!(res->flags & IORESOURCE_MEM))
>>>>>>>>> +			goto no_mmap;
>>>>>>>>> +
>>>>>>>>> +		/*
>>>>>>>>> +		 * The PCI core shouldn't set up a resource with a
>>>>>>>>> +		 * type but zero size. But there may be bugs that
>>>>>>>>> +		 * cause us to do that.
>>>>>>>>> +		 */
>>>>>>>>> +		if (!resource_size(res))
>>>>>>>>> +			goto no_mmap;
>>>>>>>>> +
>>>>>>>>> +		if (resource_size(res) >= PAGE_SIZE) {
>>>>>>>>> +			vdev->bar_mmap_supported[bar] = true;
>>>>>>>>> +			continue;
>>>>>>>>> +		}
>>>>>>>>> +
>>>>>>>>> +		if (!(res->start & ~PAGE_MASK)) {
>>>>>>>>> +			/*
>>>>>>>>> +			 * Add a dummy resource to reserve the remainder
>>>>>>>>> +			 * of the exclusive page in case that hot-add
>>>>>>>>> +			 * device's bar is assigned into it.
>>>>>>>>> +			 */
>>>>>>>>> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
>>>>>>>>> +			if (dummy_res == NULL)
>>>>>>>>> +				goto no_mmap;
>>>>>>>>> +
>>>>>>>>> +			dummy_res->resource.start = res->end + 1;
>>>>>>>>> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
>>>>>>>>> +			dummy_res->resource.flags = res->flags;
>>>>>>>>> +			if (request_resource(res->parent,
>>>>>>>>> +						&dummy_res->resource)) {
>>>>>>>>> +				kfree(dummy_res);
>>>>>>>>> +				goto no_mmap;
>>>>>>>>> +			}
>>>>>>>> Isn't it true that request_resource() only tells us that at a given
>>>>>>>> point in time, no other drivers have reserved that resource?  It seems
>>>>>>>> like it does not guarantee that the resource isn't routed to another
>>>>>>>> device or that another driver won't at some point attempt to request
>>>>>>>> that same resource.  So for example if a user constructs their initrd
>>>>>>>> to bind vfio-pci to devices before other modules load, this
>>>>>>>> request_resource() may succeed, at the expense of drivers loaded later
>>>>>>>> now failing.  The behavior will depend on driver load order and we're
>>>>>>>> not actually insuring that the overflow resource is unused, just that
>>>>>>>> we got it first.  Can we do better?  Am I missing something that
>>>>>>>> prevents this?  Thanks,
>>>>>>>>
>>>>>>>> Alex
>>>>>>> Couldn't PCI resources allocator prevent this, which will find a
>>>>>>> empty slot in the resource tree firstly, then try to request that
>>>>>>> resource in allocate_resource() when a PCI device is probed.
>>>>>>> And I'd like to know why a PCI device driver would attempt to
>>>>>>> call request_resource()? Should this be done in PCI enumeration?
>>>>>> Hi Yongji,
>>>>>>
>>>>>> Looks like most pci drivers call pci_request_regions().  From there the
>>>>>> call path is:
>>>>>>
>>>>>> pci_request_selected_regions
>>>>>>      __pci_request_selected_regions
>>>>>>        __pci_request_region
>>>>>>          __request_mem_region
>>>>>>            __request_region
>>>>>>              __request_resource
>>>>>>
>>>>>> We see this driver ordering issue sometimes with users attempting to
>>>>>> blacklist native pci drivers, trying to leave a device free for use by
>>>>>> vfio-pci.  If the device is a graphics card, the generic vesa or uefi
>>>>>> driver can request device resources causing a failure when vfio-pci
>>>>>> tries to request those same resources.  I expect that unless it's a
>>>>>> boot device, like vga in my example, the resources are not enabled
>>>>>> until the driver opens the device, therefore the request_resource() call
>>>>>> doesn't occur until that point.
>>>>>>
>>>>>> For another trivial example, look at /proc/iomem as you load and unload
>>>>>> a driver, on my laptop with e1000e unloaded I see:
>>>>>>
>>>>>>      e1200000-e121ffff : 0000:00:19.0
>>>>>>      e123e000-e123efff : 0000:00:19.0
>>>>>>
>>>>>> When e1000e is loaded, each of these becomes claimed by the e1000e
>>>>>> driver:
>>>>>>
>>>>>>      e1200000-e121ffff : 0000:00:19.0
>>>>>>        e1200000-e121ffff : e1000e
>>>>>>      e123e000-e123efff : 0000:00:19.0
>>>>>>        e123e000-e123efff : e1000e
>>>>>>
>>>>>> Clearly pci core knows the resource is associated with the device, but
>>>>>> I don't think we're tapping into that with request_resource(), we're
>>>>>> just potentially stealing resources that another driver might have
>>>>>> claimed otherwise as I described above.  That's my suspicion at
>>>>>> least, feel free to show otherwise if it's incorrect.  Thanks,
>>>>>>
>>>>>> Alex
>>>>>>        
>>>>> Thanks for your explanation. But I still have one question.
>>>>> Shouldn't PCI core have claimed all PCI device's resources
>>>>> after probing those devices. If so, request_resource() will fail
>>>>> when vfio-pci try to steal resources that another driver might
>>>>> request later. Anything I missed here?  Some device resources
>>>>> would not be claimed in PCI core?
>>>> Hi Yongji,
>>>>
>>>> I don't know what to say, this is not how the interface currently
>>>> works.  request_resource() is a driver level interface that tries to
>>>> prevent drivers from claiming conflicting resources.  In this patch
>>>> you're trying to use it to probe whether a resource maps to another
>>>> device.  This is not what it does.  request_resource() will happily let
>>>> you claim any resource you want, so long as nobody else claimed it
>>>> first.  So the only case where the assumptions in this patch are valid
>>>> is if we can guarantee that any potentially conflicting device has a
>>>> driver loaded that has claimed those resources.  As it is here,
>>>> vfio-pci will happily attempt to request resources that might overlap
>>>> with another device and might break drivers that haven't yet had a
>>>> chance to probe their devices.  I don't think that's acceptable.
>>>> Thanks,
>>>>
>>>> Alex
>>>>     
>>> I'm trying to get your point. Let me give an example here.
>>> I'm not sure whether my understanding is right. Please
>>> point it out if I'm wrong.
>>>
>>> We assume that there are two PCI devices like this:
>>>
>>> 240000000000-24feffffffff : /pciex@3fffe40400000
>>>     240000000000-2400ffffffff : PCI Bus 0002:01
>>>       240000000000-240000007fff : 0002:01:00.0
>>>         240000000000-240000007fff : vfio-pci
>>>       240000008000-24000000ffff : 0002:01:01.0
>>>         240000008000-24000000ffff : lpfc
>>>
>>> Do you mean vfio-pci driver will succeed in requesting
>>> dummy_res: [240000008000-24000000ffff] (PAGE_SIZE is 64K)
>>> if it is loaded before lpfc driver? Like this:
>>>
>>> 240000000000-24feffffffff : /pciex@3fffe40400000
>>>     240000000000-2400ffffffff : PCI Bus 0002:01
>>>       240000000000-240000007fff : 0002:01:00.0
>>>         240000000000-240000007fff : vfio-pci
>>>       240000008000-24000000ffff : 0002:01:01.0
>>>         240000008000-24000000ffff : <BAD>    --> vfio-pci call
>>> request_resource()
>>>
>>> Then lpfc driver will fail when it attempts to call
>>> pci_request_regions() later.
>> Yes, that is my supposition.
>>   
>>> But is it possible that the dummy_res become the child of
>>> the res: 0002:01:01.0? Wouldn't request_resource() fail when
>>> it found parent res: PCI Bus 0002:01 already have conflict
>>> child res: 0002:01:01.0.
>>>
>>> And I think the case that request_resource() will succeed
>>> should like this:
>>>
>>> 240000000000-24feffffffff : /pciex@3fffe40400000
>>>     240000000000-2400ffffffff : PCI Bus 0002:01
>>>       240000000000-240000007fff : 0002:01:00.0
>>>       240000010000-240000017fff : 0002:01:01.0
>>>
>>> There is a mem hole: [240000008000-24000000ffff] after
>>> PCI probing.  After loading drivers, the resources tree
>>> will be:
>>>
>>> 240000000000-24feffffffff : /pciex@3fffe40400000
>>>     240000000000-2400ffffffff : PCI Bus 0002:01
>>>       240000000000-240000007fff : 0002:01:00.0
>>>         240000000000-240000007fff : vfio-pci
>>>       240000008000-24000000ffff : <BAD>    ---> vfio-pci call
>>> request_resource()
>>>       240000010000-240000017fff : 0002:01:01.0
>>>         240000010000-240000017fff : lpfc
>> Ok, let's stop guessing about this.  I modified your patch as follows
>> so I could easily test it on a 4k host:
>>
>> --- a/drivers/vfio/pci/vfio_pci.c
>> +++ b/drivers/vfio/pci/vfio_pci.c
>> @@ -110,6 +110,9 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
>>   	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
>>   }
>>   
>> +#define VFIO_64K_PAGE_SIZE (64*1024)
>> +#define VFIO_64K_PAGE_MASK (~(VFIO_64K_PAGE_SIZE-1))
>> +
>>   static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>>   {
>>   	struct resource *res;
>> @@ -135,12 +138,13 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>>   		if (!resource_size(res))
>>   			goto no_mmap;
>>   
>> -		if (resource_size(res) >= PAGE_SIZE) {
>> +		if (resource_size(res) >= VFIO_64K_PAGE_SIZE) {
>>   			vdev->bar_mmap_supported[bar] = true;
>>   			continue;
>>   		}
>>   
>> -		if (!(res->start & ~PAGE_MASK)) {
>> +		if (!(res->start & ~VFIO_64K_PAGE_MASK)) {
>> +			int ret;
>>   			/*
>>   			 * Add a dummy resource to reserve the remainder
>>   			 * of the exclusive page in case that hot-add
>> @@ -151,10 +155,12 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>>   				goto no_mmap;
>>   
>>   			dummy_res->resource.start = res->end + 1;
>> -			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
>> +			dummy_res->resource.end = res->start + VFIO_64K_PAGE_SIZE - 1;
>>   			dummy_res->resource.flags = res->flags;
>> -			if (request_resource(res->parent,
>> -						&dummy_res->resource)) {
>> +			ret = request_resource(res->parent,
>> +						&dummy_res->resource);
>> +			if (ret) {
>> +dev_info(&vdev->pdev->dev, "Failed to request_resource %lx-%lx (%d)\n", dummy_res->resource.start, dummy_res->resource.end, ret);
>>   				kfree(dummy_res);
>>   				goto no_mmap;
>>   			}
>>
>> IOW, enforce 64k for mmap regardless of PAGE_SIZE.  Then I find a
>> system configuration to test it:
>>
>>    ee400000-ef4fffff : PCI Bus 0000:07
>>      ef480000-ef49ffff : 0000:07:00.0
>>        ef480000-ef483fff : 0000:08:10.0
>>        ef484000-ef487fff : 0000:08:10.2
>>        ef488000-ef48bfff : 0000:08:10.4
>>        ef48c000-ef48ffff : 0000:08:10.6
>>        ef490000-ef493fff : 0000:08:11.0
>>        ef494000-ef497fff : 0000:08:11.2
>>        ef498000-ef49bfff : 0000:08:11.4
>>      ef4a0000-ef4bffff : 0000:07:00.0
>>        ef4a0000-ef4a3fff : 0000:08:10.0
>>        ef4a4000-ef4a7fff : 0000:08:10.2
>>        ef4a8000-ef4abfff : 0000:08:10.4
>>        ef4ac000-ef4affff : 0000:08:10.6
>>        ef4b0000-ef4b3fff : 0000:08:11.0
>>        ef4b4000-ef4b7fff : 0000:08:11.2
>>        ef4b8000-ef4bbfff : 0000:08:11.4
>>
>> This is an 82576 NIC where each VF has two 16k BARs (0 & 3), where all
>> the VF BAR0s are in a contiguous range and all the VF BAR3s are in a
>> separate contiguous range.  The igbvf driver is not loaded, so the
>> other resources are free to be claimed, what happens?
>>
>> It looks like you're right, the request_resource() fails with:
>>
>> vfio-pci 0000:08:10.0: Failed to request_resource ef4a4000-ef4affff (-16)
>> vfio-pci 0000:08:10.0: Failed to request_resource ef484000-ef48ffff (-16)
>>
>> So we get back -EBUSY which means we hit a conflict.  I would have
>> expected that this means our res->parent that we're using for
>> request_resource() is only, for instance, ef480000-ef483fff (ie. the
>> BAR itself) thus our request for ef484000-ef48ffff exceeds the end of
>> the parent, but adding the parent resource range to the dev_info(), it
>> actually shows the range being ef480000-ef49ffff, so the parent is
>> actually the 07:00.0 resource.  In fact, we can't even use
>> request_resource() like this to claim the BAR itself, which is why we
>> use pci_request_selected_regions(), which allows conflicts, putting the
>> requested resource at the leaf of the tree.
>>
>> So I guess I retract this concern about the use of request_resource(),
>> it does seem to behave as intended.  Unless I can spot anything else or
>> other reviewers have comments, I'll queue this into my next branch for
>> v4.8.  Thanks,
>
> Ok, one more test, I found that I have access to the following USB
> devices:
>
> 00:1a.0 USB controller: Intel Corporation 6 Series/C200 Series Chipset Family USB Enhanced Host Controller #2 (rev 05) (prog-if 20 [EHCI])
> 	Region 0: Memory at f7a08000 (32-bit, non-prefetchable) [size=1K]
>
> 00:1d.0 USB controller: Intel Corporation 6 Series/C200 Series Chipset Family USB Enhanced Host Controller #1 (rev 05) (prog-if 20 [EHCI])
> 	Region 0: Memory at f7a07000 (32-bit, non-prefetchable) [size=1K]
>
> These are nicely mapped such that vfio-pci can claim the residual space
> from the page, which results in the following in /proc/iomem:
>
>    f7a07000-f7a073ff : 0000:00:1d.0
>      f7a07000-f7a073ff : vfio
>    f7a07400-f7a07fff : <BAD>
>    f7a08000-f7a083ff : 0000:00:1a.0
>      f7a08000-f7a083ff : vfio
>    f7a08400-f7a08fff : <BAD>
>
> I should have looked more closely at your previous reply, I didn't
> think that "<BAD>" was literally the owner of these dummy resources.
> Please fix this to report something that isn't going frighten users
> and make small children cry.  Thanks,

Yeah, I also noticed that:-). Now I'm trying to find a proper
name. What do you think about "vfio-pci (dummy)"?

Thanks,
Yongji

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-30  2:40                 ` Yongji Xie
@ 2016-06-30  2:53                   ` Alex Williamson
  2016-06-30  3:29                     ` Yongji Xie
  0 siblings, 1 reply; 17+ messages in thread
From: Alex Williamson @ 2016-06-30  2:53 UTC (permalink / raw)
  To: Yongji Xie
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

On Thu, 30 Jun 2016 10:40:23 +0800
Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:

> Hi Alex,
> 
> On 2016/6/30 4:03, Alex Williamson wrote:
> 
> > On Tue, 28 Jun 2016 13:47:23 -0600
> > Alex Williamson <alex.williamson@redhat.com> wrote:
> >  
> >> On Tue, 28 Jun 2016 18:09:46 +0800
> >> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
> >>  
> >>> Hi, Alex
> >>>
> >>> On 2016/6/25 0:43, Alex Williamson wrote:
> >>>      
> >>>> On Fri, 24 Jun 2016 23:37:02 +0800
> >>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
> >>>>       
> >>>>> Hi, Alex
> >>>>>
> >>>>> On 2016/6/24 11:37, Alex Williamson wrote:
> >>>>>       
> >>>>>> On Fri, 24 Jun 2016 10:52:58 +0800
> >>>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:  
> >>>>>>> On 2016/6/24 0:12, Alex Williamson wrote:  
> >>>>>>>> On Mon, 30 May 2016 21:06:37 +0800
> >>>>>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:  
> >>>>>>>>> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> >>>>>>>>> +{
> >>>>>>>>> +	struct resource *res;
> >>>>>>>>> +	int bar;
> >>>>>>>>> +	struct vfio_pci_dummy_resource *dummy_res;
> >>>>>>>>> +
> >>>>>>>>> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
> >>>>>>>>> +
> >>>>>>>>> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
> >>>>>>>>> +		res = vdev->pdev->resource + bar;
> >>>>>>>>> +
> >>>>>>>>> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
> >>>>>>>>> +			goto no_mmap;
> >>>>>>>>> +
> >>>>>>>>> +		if (!(res->flags & IORESOURCE_MEM))
> >>>>>>>>> +			goto no_mmap;
> >>>>>>>>> +
> >>>>>>>>> +		/*
> >>>>>>>>> +		 * The PCI core shouldn't set up a resource with a
> >>>>>>>>> +		 * type but zero size. But there may be bugs that
> >>>>>>>>> +		 * cause us to do that.
> >>>>>>>>> +		 */
> >>>>>>>>> +		if (!resource_size(res))
> >>>>>>>>> +			goto no_mmap;
> >>>>>>>>> +
> >>>>>>>>> +		if (resource_size(res) >= PAGE_SIZE) {
> >>>>>>>>> +			vdev->bar_mmap_supported[bar] = true;
> >>>>>>>>> +			continue;
> >>>>>>>>> +		}
> >>>>>>>>> +
> >>>>>>>>> +		if (!(res->start & ~PAGE_MASK)) {
> >>>>>>>>> +			/*
> >>>>>>>>> +			 * Add a dummy resource to reserve the remainder
> >>>>>>>>> +			 * of the exclusive page in case that hot-add
> >>>>>>>>> +			 * device's bar is assigned into it.
> >>>>>>>>> +			 */
> >>>>>>>>> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
> >>>>>>>>> +			if (dummy_res == NULL)
> >>>>>>>>> +				goto no_mmap;
> >>>>>>>>> +
> >>>>>>>>> +			dummy_res->resource.start = res->end + 1;
> >>>>>>>>> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> >>>>>>>>> +			dummy_res->resource.flags = res->flags;
> >>>>>>>>> +			if (request_resource(res->parent,
> >>>>>>>>> +						&dummy_res->resource)) {
> >>>>>>>>> +				kfree(dummy_res);
> >>>>>>>>> +				goto no_mmap;
> >>>>>>>>> +			}  
> >>>>>>>> Isn't it true that request_resource() only tells us that at a given
> >>>>>>>> point in time, no other drivers have reserved that resource?  It seems
> >>>>>>>> like it does not guarantee that the resource isn't routed to another
> >>>>>>>> device or that another driver won't at some point attempt to request
> >>>>>>>> that same resource.  So for example if a user constructs their initrd
> >>>>>>>> to bind vfio-pci to devices before other modules load, this
> >>>>>>>> request_resource() may succeed, at the expense of drivers loaded later
> >>>>>>>> now failing.  The behavior will depend on driver load order and we're
> >>>>>>>> not actually insuring that the overflow resource is unused, just that
> >>>>>>>> we got it first.  Can we do better?  Am I missing something that
> >>>>>>>> prevents this?  Thanks,
> >>>>>>>>
> >>>>>>>> Alex  
> >>>>>>> Couldn't PCI resources allocator prevent this, which will find a
> >>>>>>> empty slot in the resource tree firstly, then try to request that
> >>>>>>> resource in allocate_resource() when a PCI device is probed.
> >>>>>>> And I'd like to know why a PCI device driver would attempt to
> >>>>>>> call request_resource()? Should this be done in PCI enumeration?  
> >>>>>> Hi Yongji,
> >>>>>>
> >>>>>> Looks like most pci drivers call pci_request_regions().  From there the
> >>>>>> call path is:
> >>>>>>
> >>>>>> pci_request_selected_regions
> >>>>>>      __pci_request_selected_regions
> >>>>>>        __pci_request_region
> >>>>>>          __request_mem_region
> >>>>>>            __request_region
> >>>>>>              __request_resource
> >>>>>>
> >>>>>> We see this driver ordering issue sometimes with users attempting to
> >>>>>> blacklist native pci drivers, trying to leave a device free for use by
> >>>>>> vfio-pci.  If the device is a graphics card, the generic vesa or uefi
> >>>>>> driver can request device resources causing a failure when vfio-pci
> >>>>>> tries to request those same resources.  I expect that unless it's a
> >>>>>> boot device, like vga in my example, the resources are not enabled
> >>>>>> until the driver opens the device, therefore the request_resource() call
> >>>>>> doesn't occur until that point.
> >>>>>>
> >>>>>> For another trivial example, look at /proc/iomem as you load and unload
> >>>>>> a driver, on my laptop with e1000e unloaded I see:
> >>>>>>
> >>>>>>      e1200000-e121ffff : 0000:00:19.0
> >>>>>>      e123e000-e123efff : 0000:00:19.0
> >>>>>>
> >>>>>> When e1000e is loaded, each of these becomes claimed by the e1000e
> >>>>>> driver:
> >>>>>>
> >>>>>>      e1200000-e121ffff : 0000:00:19.0
> >>>>>>        e1200000-e121ffff : e1000e
> >>>>>>      e123e000-e123efff : 0000:00:19.0
> >>>>>>        e123e000-e123efff : e1000e
> >>>>>>
> >>>>>> Clearly pci core knows the resource is associated with the device, but
> >>>>>> I don't think we're tapping into that with request_resource(), we're
> >>>>>> just potentially stealing resources that another driver might have
> >>>>>> claimed otherwise as I described above.  That's my suspicion at
> >>>>>> least, feel free to show otherwise if it's incorrect.  Thanks,
> >>>>>>
> >>>>>> Alex
> >>>>>>          
> >>>>> Thanks for your explanation. But I still have one question.
> >>>>> Shouldn't PCI core have claimed all PCI device's resources
> >>>>> after probing those devices. If so, request_resource() will fail
> >>>>> when vfio-pci try to steal resources that another driver might
> >>>>> request later. Anything I missed here?  Some device resources
> >>>>> would not be claimed in PCI core?  
> >>>> Hi Yongji,
> >>>>
> >>>> I don't know what to say, this is not how the interface currently
> >>>> works.  request_resource() is a driver level interface that tries to
> >>>> prevent drivers from claiming conflicting resources.  In this patch
> >>>> you're trying to use it to probe whether a resource maps to another
> >>>> device.  This is not what it does.  request_resource() will happily let
> >>>> you claim any resource you want, so long as nobody else claimed it
> >>>> first.  So the only case where the assumptions in this patch are valid
> >>>> is if we can guarantee that any potentially conflicting device has a
> >>>> driver loaded that has claimed those resources.  As it is here,
> >>>> vfio-pci will happily attempt to request resources that might overlap
> >>>> with another device and might break drivers that haven't yet had a
> >>>> chance to probe their devices.  I don't think that's acceptable.
> >>>> Thanks,
> >>>>
> >>>> Alex
> >>>>       
> >>> I'm trying to get your point. Let me give an example here.
> >>> I'm not sure whether my understanding is right. Please
> >>> point it out if I'm wrong.
> >>>
> >>> We assume that there are two PCI devices like this:
> >>>
> >>> 240000000000-24feffffffff : /pciex@3fffe40400000
> >>>     240000000000-2400ffffffff : PCI Bus 0002:01
> >>>       240000000000-240000007fff : 0002:01:00.0
> >>>         240000000000-240000007fff : vfio-pci
> >>>       240000008000-24000000ffff : 0002:01:01.0
> >>>         240000008000-24000000ffff : lpfc
> >>>
> >>> Do you mean vfio-pci driver will succeed in requesting
> >>> dummy_res: [240000008000-24000000ffff] (PAGE_SIZE is 64K)
> >>> if it is loaded before lpfc driver? Like this:
> >>>
> >>> 240000000000-24feffffffff : /pciex@3fffe40400000
> >>>     240000000000-2400ffffffff : PCI Bus 0002:01
> >>>       240000000000-240000007fff : 0002:01:00.0
> >>>         240000000000-240000007fff : vfio-pci
> >>>       240000008000-24000000ffff : 0002:01:01.0
> >>>         240000008000-24000000ffff : <BAD>    --> vfio-pci call
> >>> request_resource()
> >>>
> >>> Then lpfc driver will fail when it attempts to call
> >>> pci_request_regions() later.  
> >> Yes, that is my supposition.
> >>     
> >>> But is it possible that the dummy_res become the child of
> >>> the res: 0002:01:01.0? Wouldn't request_resource() fail when
> >>> it found parent res: PCI Bus 0002:01 already have conflict
> >>> child res: 0002:01:01.0.
> >>>
> >>> And I think the case that request_resource() will succeed
> >>> should like this:
> >>>
> >>> 240000000000-24feffffffff : /pciex@3fffe40400000
> >>>     240000000000-2400ffffffff : PCI Bus 0002:01
> >>>       240000000000-240000007fff : 0002:01:00.0
> >>>       240000010000-240000017fff : 0002:01:01.0
> >>>
> >>> There is a mem hole: [240000008000-24000000ffff] after
> >>> PCI probing.  After loading drivers, the resources tree
> >>> will be:
> >>>
> >>> 240000000000-24feffffffff : /pciex@3fffe40400000
> >>>     240000000000-2400ffffffff : PCI Bus 0002:01
> >>>       240000000000-240000007fff : 0002:01:00.0
> >>>         240000000000-240000007fff : vfio-pci
> >>>       240000008000-24000000ffff : <BAD>    ---> vfio-pci call
> >>> request_resource()
> >>>       240000010000-240000017fff : 0002:01:01.0
> >>>         240000010000-240000017fff : lpfc  
> >> Ok, let's stop guessing about this.  I modified your patch as follows
> >> so I could easily test it on a 4k host:
> >>
> >> --- a/drivers/vfio/pci/vfio_pci.c
> >> +++ b/drivers/vfio/pci/vfio_pci.c
> >> @@ -110,6 +110,9 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
> >>   	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
> >>   }
> >>   
> >> +#define VFIO_64K_PAGE_SIZE (64*1024)
> >> +#define VFIO_64K_PAGE_MASK (~(VFIO_64K_PAGE_SIZE-1))
> >> +
> >>   static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> >>   {
> >>   	struct resource *res;
> >> @@ -135,12 +138,13 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> >>   		if (!resource_size(res))
> >>   			goto no_mmap;
> >>   
> >> -		if (resource_size(res) >= PAGE_SIZE) {
> >> +		if (resource_size(res) >= VFIO_64K_PAGE_SIZE) {
> >>   			vdev->bar_mmap_supported[bar] = true;
> >>   			continue;
> >>   		}
> >>   
> >> -		if (!(res->start & ~PAGE_MASK)) {
> >> +		if (!(res->start & ~VFIO_64K_PAGE_MASK)) {
> >> +			int ret;
> >>   			/*
> >>   			 * Add a dummy resource to reserve the remainder
> >>   			 * of the exclusive page in case that hot-add
> >> @@ -151,10 +155,12 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> >>   				goto no_mmap;
> >>   
> >>   			dummy_res->resource.start = res->end + 1;
> >> -			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> >> +			dummy_res->resource.end = res->start + VFIO_64K_PAGE_SIZE - 1;
> >>   			dummy_res->resource.flags = res->flags;
> >> -			if (request_resource(res->parent,
> >> -						&dummy_res->resource)) {
> >> +			ret = request_resource(res->parent,
> >> +						&dummy_res->resource);
> >> +			if (ret) {
> >> +dev_info(&vdev->pdev->dev, "Failed to request_resource %lx-%lx (%d)\n", dummy_res->resource.start, dummy_res->resource.end, ret);
> >>   				kfree(dummy_res);
> >>   				goto no_mmap;
> >>   			}
> >>
> >> IOW, enforce 64k for mmap regardless of PAGE_SIZE.  Then I find a
> >> system configuration to test it:
> >>
> >>    ee400000-ef4fffff : PCI Bus 0000:07
> >>      ef480000-ef49ffff : 0000:07:00.0
> >>        ef480000-ef483fff : 0000:08:10.0
> >>        ef484000-ef487fff : 0000:08:10.2
> >>        ef488000-ef48bfff : 0000:08:10.4
> >>        ef48c000-ef48ffff : 0000:08:10.6
> >>        ef490000-ef493fff : 0000:08:11.0
> >>        ef494000-ef497fff : 0000:08:11.2
> >>        ef498000-ef49bfff : 0000:08:11.4
> >>      ef4a0000-ef4bffff : 0000:07:00.0
> >>        ef4a0000-ef4a3fff : 0000:08:10.0
> >>        ef4a4000-ef4a7fff : 0000:08:10.2
> >>        ef4a8000-ef4abfff : 0000:08:10.4
> >>        ef4ac000-ef4affff : 0000:08:10.6
> >>        ef4b0000-ef4b3fff : 0000:08:11.0
> >>        ef4b4000-ef4b7fff : 0000:08:11.2
> >>        ef4b8000-ef4bbfff : 0000:08:11.4
> >>
> >> This is an 82576 NIC where each VF has two 16k BARs (0 & 3), where all
> >> the VF BAR0s are in a contiguous range and all the VF BAR3s are in a
> >> separate contiguous range.  The igbvf driver is not loaded, so the
> >> other resources are free to be claimed, what happens?
> >>
> >> It looks like you're right, the request_resource() fails with:
> >>
> >> vfio-pci 0000:08:10.0: Failed to request_resource ef4a4000-ef4affff (-16)
> >> vfio-pci 0000:08:10.0: Failed to request_resource ef484000-ef48ffff (-16)
> >>
> >> So we get back -EBUSY which means we hit a conflict.  I would have
> >> expected that this means our res->parent that we're using for
> >> request_resource() is only, for instance, ef480000-ef483fff (ie. the
> >> BAR itself) thus our request for ef484000-ef48ffff exceeds the end of
> >> the parent, but adding the parent resource range to the dev_info(), it
> >> actually shows the range being ef480000-ef49ffff, so the parent is
> >> actually the 07:00.0 resource.  In fact, we can't even use
> >> request_resource() like this to claim the BAR itself, which is why we
> >> use pci_request_selected_regions(), which allows conflicts, putting the
> >> requested resource at the leaf of the tree.
> >>
> >> So I guess I retract this concern about the use of request_resource(),
> >> it does seem to behave as intended.  Unless I can spot anything else or
> >> other reviewers have comments, I'll queue this into my next branch for
> >> v4.8.  Thanks,  
> >
> > Ok, one more test, I found that I have access to the following USB
> > devices:
> >
> > 00:1a.0 USB controller: Intel Corporation 6 Series/C200 Series Chipset Family USB Enhanced Host Controller #2 (rev 05) (prog-if 20 [EHCI])
> > 	Region 0: Memory at f7a08000 (32-bit, non-prefetchable) [size=1K]
> >
> > 00:1d.0 USB controller: Intel Corporation 6 Series/C200 Series Chipset Family USB Enhanced Host Controller #1 (rev 05) (prog-if 20 [EHCI])
> > 	Region 0: Memory at f7a07000 (32-bit, non-prefetchable) [size=1K]
> >
> > These are nicely mapped such that vfio-pci can claim the residual space
> > from the page, which results in the following in /proc/iomem:
> >
> >    f7a07000-f7a073ff : 0000:00:1d.0
> >      f7a07000-f7a073ff : vfio
> >    f7a07400-f7a07fff : <BAD>
> >    f7a08000-f7a083ff : 0000:00:1a.0
> >      f7a08000-f7a083ff : vfio
> >    f7a08400-f7a08fff : <BAD>
> >
> > I should have looked more closely at your previous reply, I didn't
> > think that "<BAD>" was literally the owner of these dummy resources.
> > Please fix this to report something that isn't going frighten users
> > and make small children cry.  Thanks,  
> 
> Yeah, I also noticed that:-). Now I'm trying to find a proper
> name. What do you think about "vfio-pci (dummy)"?

How about "vfio sub-page reserved"?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
  2016-06-30  2:53                   ` Alex Williamson
@ 2016-06-30  3:29                     ` Yongji Xie
  0 siblings, 0 replies; 17+ messages in thread
From: Yongji Xie @ 2016-06-30  3:29 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, linux-kernel, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

On 2016/6/30 10:53, Alex Williamson wrote:

> On Thu, 30 Jun 2016 10:40:23 +0800
> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>
>> Hi Alex,
>>
>> On 2016/6/30 4:03, Alex Williamson wrote:
>>
>>> On Tue, 28 Jun 2016 13:47:23 -0600
>>> Alex Williamson <alex.williamson@redhat.com> wrote:
>>>   
>>>> On Tue, 28 Jun 2016 18:09:46 +0800
>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>>>   
>>>>> Hi, Alex
>>>>>
>>>>> On 2016/6/25 0:43, Alex Williamson wrote:
>>>>>       
>>>>>> On Fri, 24 Jun 2016 23:37:02 +0800
>>>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>>>>>        
>>>>>>> Hi, Alex
>>>>>>>
>>>>>>> On 2016/6/24 11:37, Alex Williamson wrote:
>>>>>>>        
>>>>>>>> On Fri, 24 Jun 2016 10:52:58 +0800
>>>>>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>>>>>>>> On 2016/6/24 0:12, Alex Williamson wrote:
>>>>>>>>>> On Mon, 30 May 2016 21:06:37 +0800
>>>>>>>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>>>>>>>>>> +static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>>>>>>>>>>> +{
>>>>>>>>>>> +	struct resource *res;
>>>>>>>>>>> +	int bar;
>>>>>>>>>>> +	struct vfio_pci_dummy_resource *dummy_res;
>>>>>>>>>>> +
>>>>>>>>>>> +	INIT_LIST_HEAD(&vdev->dummy_resources_list);
>>>>>>>>>>> +
>>>>>>>>>>> +	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
>>>>>>>>>>> +		res = vdev->pdev->resource + bar;
>>>>>>>>>>> +
>>>>>>>>>>> +		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
>>>>>>>>>>> +			goto no_mmap;
>>>>>>>>>>> +
>>>>>>>>>>> +		if (!(res->flags & IORESOURCE_MEM))
>>>>>>>>>>> +			goto no_mmap;
>>>>>>>>>>> +
>>>>>>>>>>> +		/*
>>>>>>>>>>> +		 * The PCI core shouldn't set up a resource with a
>>>>>>>>>>> +		 * type but zero size. But there may be bugs that
>>>>>>>>>>> +		 * cause us to do that.
>>>>>>>>>>> +		 */
>>>>>>>>>>> +		if (!resource_size(res))
>>>>>>>>>>> +			goto no_mmap;
>>>>>>>>>>> +
>>>>>>>>>>> +		if (resource_size(res) >= PAGE_SIZE) {
>>>>>>>>>>> +			vdev->bar_mmap_supported[bar] = true;
>>>>>>>>>>> +			continue;
>>>>>>>>>>> +		}
>>>>>>>>>>> +
>>>>>>>>>>> +		if (!(res->start & ~PAGE_MASK)) {
>>>>>>>>>>> +			/*
>>>>>>>>>>> +			 * Add a dummy resource to reserve the remainder
>>>>>>>>>>> +			 * of the exclusive page in case that hot-add
>>>>>>>>>>> +			 * device's bar is assigned into it.
>>>>>>>>>>> +			 */
>>>>>>>>>>> +			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
>>>>>>>>>>> +			if (dummy_res == NULL)
>>>>>>>>>>> +				goto no_mmap;
>>>>>>>>>>> +
>>>>>>>>>>> +			dummy_res->resource.start = res->end + 1;
>>>>>>>>>>> +			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
>>>>>>>>>>> +			dummy_res->resource.flags = res->flags;
>>>>>>>>>>> +			if (request_resource(res->parent,
>>>>>>>>>>> +						&dummy_res->resource)) {
>>>>>>>>>>> +				kfree(dummy_res);
>>>>>>>>>>> +				goto no_mmap;
>>>>>>>>>>> +			}
>>>>>>>>>> Isn't it true that request_resource() only tells us that at a given
>>>>>>>>>> point in time, no other drivers have reserved that resource?  It seems
>>>>>>>>>> like it does not guarantee that the resource isn't routed to another
>>>>>>>>>> device or that another driver won't at some point attempt to request
>>>>>>>>>> that same resource.  So for example if a user constructs their initrd
>>>>>>>>>> to bind vfio-pci to devices before other modules load, this
>>>>>>>>>> request_resource() may succeed, at the expense of drivers loaded later
>>>>>>>>>> now failing.  The behavior will depend on driver load order and we're
>>>>>>>>>> not actually insuring that the overflow resource is unused, just that
>>>>>>>>>> we got it first.  Can we do better?  Am I missing something that
>>>>>>>>>> prevents this?  Thanks,
>>>>>>>>>>
>>>>>>>>>> Alex
>>>>>>>>> Couldn't PCI resources allocator prevent this, which will find a
>>>>>>>>> empty slot in the resource tree firstly, then try to request that
>>>>>>>>> resource in allocate_resource() when a PCI device is probed.
>>>>>>>>> And I'd like to know why a PCI device driver would attempt to
>>>>>>>>> call request_resource()? Should this be done in PCI enumeration?
>>>>>>>> Hi Yongji,
>>>>>>>>
>>>>>>>> Looks like most pci drivers call pci_request_regions().  From there the
>>>>>>>> call path is:
>>>>>>>>
>>>>>>>> pci_request_selected_regions
>>>>>>>>       __pci_request_selected_regions
>>>>>>>>         __pci_request_region
>>>>>>>>           __request_mem_region
>>>>>>>>             __request_region
>>>>>>>>               __request_resource
>>>>>>>>
>>>>>>>> We see this driver ordering issue sometimes with users attempting to
>>>>>>>> blacklist native pci drivers, trying to leave a device free for use by
>>>>>>>> vfio-pci.  If the device is a graphics card, the generic vesa or uefi
>>>>>>>> driver can request device resources causing a failure when vfio-pci
>>>>>>>> tries to request those same resources.  I expect that unless it's a
>>>>>>>> boot device, like vga in my example, the resources are not enabled
>>>>>>>> until the driver opens the device, therefore the request_resource() call
>>>>>>>> doesn't occur until that point.
>>>>>>>>
>>>>>>>> For another trivial example, look at /proc/iomem as you load and unload
>>>>>>>> a driver, on my laptop with e1000e unloaded I see:
>>>>>>>>
>>>>>>>>       e1200000-e121ffff : 0000:00:19.0
>>>>>>>>       e123e000-e123efff : 0000:00:19.0
>>>>>>>>
>>>>>>>> When e1000e is loaded, each of these becomes claimed by the e1000e
>>>>>>>> driver:
>>>>>>>>
>>>>>>>>       e1200000-e121ffff : 0000:00:19.0
>>>>>>>>         e1200000-e121ffff : e1000e
>>>>>>>>       e123e000-e123efff : 0000:00:19.0
>>>>>>>>         e123e000-e123efff : e1000e
>>>>>>>>
>>>>>>>> Clearly pci core knows the resource is associated with the device, but
>>>>>>>> I don't think we're tapping into that with request_resource(), we're
>>>>>>>> just potentially stealing resources that another driver might have
>>>>>>>> claimed otherwise as I described above.  That's my suspicion at
>>>>>>>> least, feel free to show otherwise if it's incorrect.  Thanks,
>>>>>>>>
>>>>>>>> Alex
>>>>>>>>           
>>>>>>> Thanks for your explanation. But I still have one question.
>>>>>>> Shouldn't PCI core have claimed all PCI device's resources
>>>>>>> after probing those devices. If so, request_resource() will fail
>>>>>>> when vfio-pci try to steal resources that another driver might
>>>>>>> request later. Anything I missed here?  Some device resources
>>>>>>> would not be claimed in PCI core?
>>>>>> Hi Yongji,
>>>>>>
>>>>>> I don't know what to say, this is not how the interface currently
>>>>>> works.  request_resource() is a driver level interface that tries to
>>>>>> prevent drivers from claiming conflicting resources.  In this patch
>>>>>> you're trying to use it to probe whether a resource maps to another
>>>>>> device.  This is not what it does.  request_resource() will happily let
>>>>>> you claim any resource you want, so long as nobody else claimed it
>>>>>> first.  So the only case where the assumptions in this patch are valid
>>>>>> is if we can guarantee that any potentially conflicting device has a
>>>>>> driver loaded that has claimed those resources.  As it is here,
>>>>>> vfio-pci will happily attempt to request resources that might overlap
>>>>>> with another device and might break drivers that haven't yet had a
>>>>>> chance to probe their devices.  I don't think that's acceptable.
>>>>>> Thanks,
>>>>>>
>>>>>> Alex
>>>>>>        
>>>>> I'm trying to get your point. Let me give an example here.
>>>>> I'm not sure whether my understanding is right. Please
>>>>> point it out if I'm wrong.
>>>>>
>>>>> We assume that there are two PCI devices like this:
>>>>>
>>>>> 240000000000-24feffffffff : /pciex@3fffe40400000
>>>>>      240000000000-2400ffffffff : PCI Bus 0002:01
>>>>>        240000000000-240000007fff : 0002:01:00.0
>>>>>          240000000000-240000007fff : vfio-pci
>>>>>        240000008000-24000000ffff : 0002:01:01.0
>>>>>          240000008000-24000000ffff : lpfc
>>>>>
>>>>> Do you mean vfio-pci driver will succeed in requesting
>>>>> dummy_res: [240000008000-24000000ffff] (PAGE_SIZE is 64K)
>>>>> if it is loaded before lpfc driver? Like this:
>>>>>
>>>>> 240000000000-24feffffffff : /pciex@3fffe40400000
>>>>>      240000000000-2400ffffffff : PCI Bus 0002:01
>>>>>        240000000000-240000007fff : 0002:01:00.0
>>>>>          240000000000-240000007fff : vfio-pci
>>>>>        240000008000-24000000ffff : 0002:01:01.0
>>>>>          240000008000-24000000ffff : <BAD>    --> vfio-pci call
>>>>> request_resource()
>>>>>
>>>>> Then lpfc driver will fail when it attempts to call
>>>>> pci_request_regions() later.
>>>> Yes, that is my supposition.
>>>>      
>>>>> But is it possible that the dummy_res become the child of
>>>>> the res: 0002:01:01.0? Wouldn't request_resource() fail when
>>>>> it found parent res: PCI Bus 0002:01 already have conflict
>>>>> child res: 0002:01:01.0.
>>>>>
>>>>> And I think the case that request_resource() will succeed
>>>>> should like this:
>>>>>
>>>>> 240000000000-24feffffffff : /pciex@3fffe40400000
>>>>>      240000000000-2400ffffffff : PCI Bus 0002:01
>>>>>        240000000000-240000007fff : 0002:01:00.0
>>>>>        240000010000-240000017fff : 0002:01:01.0
>>>>>
>>>>> There is a mem hole: [240000008000-24000000ffff] after
>>>>> PCI probing.  After loading drivers, the resources tree
>>>>> will be:
>>>>>
>>>>> 240000000000-24feffffffff : /pciex@3fffe40400000
>>>>>      240000000000-2400ffffffff : PCI Bus 0002:01
>>>>>        240000000000-240000007fff : 0002:01:00.0
>>>>>          240000000000-240000007fff : vfio-pci
>>>>>        240000008000-24000000ffff : <BAD>    ---> vfio-pci call
>>>>> request_resource()
>>>>>        240000010000-240000017fff : 0002:01:01.0
>>>>>          240000010000-240000017fff : lpfc
>>>> Ok, let's stop guessing about this.  I modified your patch as follows
>>>> so I could easily test it on a 4k host:
>>>>
>>>> --- a/drivers/vfio/pci/vfio_pci.c
>>>> +++ b/drivers/vfio/pci/vfio_pci.c
>>>> @@ -110,6 +110,9 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
>>>>    	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
>>>>    }
>>>>    
>>>> +#define VFIO_64K_PAGE_SIZE (64*1024)
>>>> +#define VFIO_64K_PAGE_MASK (~(VFIO_64K_PAGE_SIZE-1))
>>>> +
>>>>    static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>>>>    {
>>>>    	struct resource *res;
>>>> @@ -135,12 +138,13 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>>>>    		if (!resource_size(res))
>>>>    			goto no_mmap;
>>>>    
>>>> -		if (resource_size(res) >= PAGE_SIZE) {
>>>> +		if (resource_size(res) >= VFIO_64K_PAGE_SIZE) {
>>>>    			vdev->bar_mmap_supported[bar] = true;
>>>>    			continue;
>>>>    		}
>>>>    
>>>> -		if (!(res->start & ~PAGE_MASK)) {
>>>> +		if (!(res->start & ~VFIO_64K_PAGE_MASK)) {
>>>> +			int ret;
>>>>    			/*
>>>>    			 * Add a dummy resource to reserve the remainder
>>>>    			 * of the exclusive page in case that hot-add
>>>> @@ -151,10 +155,12 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
>>>>    				goto no_mmap;
>>>>    
>>>>    			dummy_res->resource.start = res->end + 1;
>>>> -			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
>>>> +			dummy_res->resource.end = res->start + VFIO_64K_PAGE_SIZE - 1;
>>>>    			dummy_res->resource.flags = res->flags;
>>>> -			if (request_resource(res->parent,
>>>> -						&dummy_res->resource)) {
>>>> +			ret = request_resource(res->parent,
>>>> +						&dummy_res->resource);
>>>> +			if (ret) {
>>>> +dev_info(&vdev->pdev->dev, "Failed to request_resource %lx-%lx (%d)\n", dummy_res->resource.start, dummy_res->resource.end, ret);
>>>>    				kfree(dummy_res);
>>>>    				goto no_mmap;
>>>>    			}
>>>>
>>>> IOW, enforce 64k for mmap regardless of PAGE_SIZE.  Then I find a
>>>> system configuration to test it:
>>>>
>>>>     ee400000-ef4fffff : PCI Bus 0000:07
>>>>       ef480000-ef49ffff : 0000:07:00.0
>>>>         ef480000-ef483fff : 0000:08:10.0
>>>>         ef484000-ef487fff : 0000:08:10.2
>>>>         ef488000-ef48bfff : 0000:08:10.4
>>>>         ef48c000-ef48ffff : 0000:08:10.6
>>>>         ef490000-ef493fff : 0000:08:11.0
>>>>         ef494000-ef497fff : 0000:08:11.2
>>>>         ef498000-ef49bfff : 0000:08:11.4
>>>>       ef4a0000-ef4bffff : 0000:07:00.0
>>>>         ef4a0000-ef4a3fff : 0000:08:10.0
>>>>         ef4a4000-ef4a7fff : 0000:08:10.2
>>>>         ef4a8000-ef4abfff : 0000:08:10.4
>>>>         ef4ac000-ef4affff : 0000:08:10.6
>>>>         ef4b0000-ef4b3fff : 0000:08:11.0
>>>>         ef4b4000-ef4b7fff : 0000:08:11.2
>>>>         ef4b8000-ef4bbfff : 0000:08:11.4
>>>>
>>>> This is an 82576 NIC where each VF has two 16k BARs (0 & 3), where all
>>>> the VF BAR0s are in a contiguous range and all the VF BAR3s are in a
>>>> separate contiguous range.  The igbvf driver is not loaded, so the
>>>> other resources are free to be claimed, what happens?
>>>>
>>>> It looks like you're right, the request_resource() fails with:
>>>>
>>>> vfio-pci 0000:08:10.0: Failed to request_resource ef4a4000-ef4affff (-16)
>>>> vfio-pci 0000:08:10.0: Failed to request_resource ef484000-ef48ffff (-16)
>>>>
>>>> So we get back -EBUSY which means we hit a conflict.  I would have
>>>> expected that this means our res->parent that we're using for
>>>> request_resource() is only, for instance, ef480000-ef483fff (ie. the
>>>> BAR itself) thus our request for ef484000-ef48ffff exceeds the end of
>>>> the parent, but adding the parent resource range to the dev_info(), it
>>>> actually shows the range being ef480000-ef49ffff, so the parent is
>>>> actually the 07:00.0 resource.  In fact, we can't even use
>>>> request_resource() like this to claim the BAR itself, which is why we
>>>> use pci_request_selected_regions(), which allows conflicts, putting the
>>>> requested resource at the leaf of the tree.
>>>>
>>>> So I guess I retract this concern about the use of request_resource(),
>>>> it does seem to behave as intended.  Unless I can spot anything else or
>>>> other reviewers have comments, I'll queue this into my next branch for
>>>> v4.8.  Thanks,
>>> Ok, one more test, I found that I have access to the following USB
>>> devices:
>>>
>>> 00:1a.0 USB controller: Intel Corporation 6 Series/C200 Series Chipset Family USB Enhanced Host Controller #2 (rev 05) (prog-if 20 [EHCI])
>>> 	Region 0: Memory at f7a08000 (32-bit, non-prefetchable) [size=1K]
>>>
>>> 00:1d.0 USB controller: Intel Corporation 6 Series/C200 Series Chipset Family USB Enhanced Host Controller #1 (rev 05) (prog-if 20 [EHCI])
>>> 	Region 0: Memory at f7a07000 (32-bit, non-prefetchable) [size=1K]
>>>
>>> These are nicely mapped such that vfio-pci can claim the residual space
>>> from the page, which results in the following in /proc/iomem:
>>>
>>>     f7a07000-f7a073ff : 0000:00:1d.0
>>>       f7a07000-f7a073ff : vfio
>>>     f7a07400-f7a07fff : <BAD>
>>>     f7a08000-f7a083ff : 0000:00:1a.0
>>>       f7a08000-f7a083ff : vfio
>>>     f7a08400-f7a08fff : <BAD>
>>>
>>> I should have looked more closely at your previous reply, I didn't
>>> think that "<BAD>" was literally the owner of these dummy resources.
>>> Please fix this to report something that isn't going frighten users
>>> and make small children cry.  Thanks,
>> Yeah, I also noticed that:-). Now I'm trying to find a proper
>> name. What do you think about "vfio-pci (dummy)"?
> How about "vfio sub-page reserved"?  Thanks,

Sounds good. I'll send a new version soon.

Thanks,
Yongji

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
@ 2016-05-30 13:06 Yongji Xie
  0 siblings, 0 replies; 17+ messages in thread
From: Yongji Xie @ 2016-05-30 13:06 UTC (permalink / raw)
  To: kvm, linux-kernel
  Cc: alex.williamson, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

Current vfio-pci implementation disallows to mmap
sub-page(size < PAGE_SIZE) MMIO BARs because these BARs' mmio
page may be shared with other BARs. This will cause some
performance issues when we passthrough a PCI device with
this kind of BARs. Guest will be not able to handle the mmio
accesses to the BARs which leads to mmio emulations in host.

However, not all sub-page BARs will share page with other BARs.
We should allow to mmap the sub-page MMIO BARs which we can
make sure will not share page with other BARs.

This patch adds support for this case. And we try to add a
dummy resource to reserve the remainder of the page which
hot-add device's BAR might be assigned into. But it's not
necessary to handle the case when the BAR is not page aligned.
Because we can't expect the BAR will be assigned into the same
location in a page in guest when we passthrough the BAR. And
it's hard to access this BAR in userspace because we have
no way to get the BAR's location in a page.

Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
---
 drivers/vfio/pci/vfio_pci.c         |   87 ++++++++++++++++++++++++++++++++---
 drivers/vfio/pci/vfio_pci_private.h |    8 ++++
 2 files changed, 89 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 188b1ff..3cca2a7 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -110,6 +110,73 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
 	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
 }
 
+static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
+{
+	struct resource *res;
+	int bar;
+	struct vfio_pci_dummy_resource *dummy_res;
+
+	INIT_LIST_HEAD(&vdev->dummy_resources_list);
+
+	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
+		res = vdev->pdev->resource + bar;
+
+		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
+			goto no_mmap;
+
+		if (!(res->flags & IORESOURCE_MEM))
+			goto no_mmap;
+
+		/*
+		 * The PCI core shouldn't set up a resource with a
+		 * type but zero size. But there may be bugs that
+		 * cause us to do that.
+		 */
+		if (!resource_size(res))
+			goto no_mmap;
+
+		if (resource_size(res) >= PAGE_SIZE) {
+			vdev->bar_mmap_supported[bar] = true;
+			continue;
+		}
+
+		if (!(res->start & ~PAGE_MASK)) {
+			/*
+			 * Add a dummy resource to reserve the remainder
+			 * of the exclusive page in case that hot-add
+			 * device's bar is assigned into it.
+			 */
+			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
+			if (dummy_res == NULL)
+				goto no_mmap;
+
+			dummy_res->resource.start = res->end + 1;
+			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
+			dummy_res->resource.flags = res->flags;
+			if (request_resource(res->parent,
+						&dummy_res->resource)) {
+				kfree(dummy_res);
+				goto no_mmap;
+			}
+			dummy_res->index = bar;
+			list_add(&dummy_res->res_next,
+					&vdev->dummy_resources_list);
+			vdev->bar_mmap_supported[bar] = true;
+			continue;
+		}
+		/*
+		 * Here we don't handle the case when the BAR is not page
+		 * aligned because we can't expect the BAR will be
+		 * assigned into the same location in a page in guest
+		 * when we passthrough the BAR. And it's hard to access
+		 * this BAR in userspace because we have no way to get
+		 * the BAR's location in a page.
+		 */
+no_mmap:
+		vdev->bar_mmap_supported[bar] = false;
+	}
+}
+
 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
 static void vfio_pci_disable(struct vfio_pci_device *vdev);
 
@@ -218,12 +285,15 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
 		}
 	}
 
+	vfio_pci_probe_mmaps(vdev);
+
 	return 0;
 }
 
 static void vfio_pci_disable(struct vfio_pci_device *vdev)
 {
 	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_dummy_resource *dummy_res, *tmp;
 	int i, bar;
 
 	/* Stop the device from further DMA */
@@ -252,6 +322,13 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
 		vdev->barmap[bar] = NULL;
 	}
 
+	list_for_each_entry_safe(dummy_res, tmp,
+				 &vdev->dummy_resources_list, res_next) {
+		list_del(&dummy_res->res_next);
+		release_resource(&dummy_res->resource);
+		kfree(dummy_res);
+	}
+
 	vdev->needs_reset = true;
 
 	/*
@@ -623,9 +700,7 @@ static long vfio_pci_ioctl(void *device_data,
 
 			info.flags = VFIO_REGION_INFO_FLAG_READ |
 				     VFIO_REGION_INFO_FLAG_WRITE;
-			if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) &&
-			    pci_resource_flags(pdev, info.index) &
-			    IORESOURCE_MEM && info.size >= PAGE_SIZE) {
+			if (vdev->bar_mmap_supported[info.index]) {
 				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
 				if (info.index == vdev->msix_bar) {
 					ret = msix_sparse_mmap_cap(vdev, &caps);
@@ -1049,16 +1124,16 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 		return -EINVAL;
 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
 		return -EINVAL;
-	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
+	if (!vdev->bar_mmap_supported[index])
 		return -EINVAL;
 
-	phys_len = pci_resource_len(pdev, index);
+	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
 	req_len = vma->vm_end - vma->vm_start;
 	pgoff = vma->vm_pgoff &
 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
 	req_start = pgoff << PAGE_SHIFT;
 
-	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
+	if (req_start + req_len > phys_len)
 		return -EINVAL;
 
 	if (index == vdev->msix_bar) {
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 016c14a..2128de8 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -57,9 +57,16 @@ struct vfio_pci_region {
 	u32				flags;
 };
 
+struct vfio_pci_dummy_resource {
+	struct resource		resource;
+	int			index;
+	struct list_head	res_next;
+};
+
 struct vfio_pci_device {
 	struct pci_dev		*pdev;
 	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
+	bool			bar_mmap_supported[PCI_STD_RESOURCE_END + 1];
 	u8			*pci_config_map;
 	u8			*vconfig;
 	struct perm_bits	*msi_perm;
@@ -88,6 +95,7 @@ struct vfio_pci_device {
 	int			refcnt;
 	struct eventfd_ctx	*err_trigger;
 	struct eventfd_ctx	*req_trigger;
+	struct list_head	dummy_resources_list;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive
@ 2016-05-30 13:06 Yongji Xie
  0 siblings, 0 replies; 17+ messages in thread
From: Yongji Xie @ 2016-05-30 13:06 UTC (permalink / raw)
  To: kvm, linux-kernel
  Cc: alex.williamson, bhelgaas, aik, benh, paulus, mpe, warrier,
	zhong, nikunj, gwshan, kevin.tian

Current vfio-pci implementation disallows to mmap
sub-page(size < PAGE_SIZE) MMIO BARs because these BARs' mmio
page may be shared with other BARs. This will cause some
performance issues when we passthrough a PCI device with
this kind of BARs. Guest will be not able to handle the mmio
accesses to the BARs which leads to mmio emulations in host.

However, not all sub-page BARs will share page with other BARs.
We should allow to mmap the sub-page MMIO BARs which we can
make sure will not share page with other BARs.

This patch adds support for this case. And we try to add a
dummy resource to reserve the remainder of the page which
hot-add device's BAR might be assigned into. But it's not
necessary to handle the case when the BAR is not page aligned.
Because we can't expect the BAR will be assigned into the same
location in a page in guest when we passthrough the BAR. And
it's hard to access this BAR in userspace because we have
no way to get the BAR's location in a page.

Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
---
 drivers/vfio/pci/vfio_pci.c         |   87 ++++++++++++++++++++++++++++++++---
 drivers/vfio/pci/vfio_pci_private.h |    8 ++++
 2 files changed, 89 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 188b1ff..3cca2a7 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -110,6 +110,73 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
 	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
 }
 
+static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
+{
+	struct resource *res;
+	int bar;
+	struct vfio_pci_dummy_resource *dummy_res;
+
+	INIT_LIST_HEAD(&vdev->dummy_resources_list);
+
+	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
+		res = vdev->pdev->resource + bar;
+
+		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
+			goto no_mmap;
+
+		if (!(res->flags & IORESOURCE_MEM))
+			goto no_mmap;
+
+		/*
+		 * The PCI core shouldn't set up a resource with a
+		 * type but zero size. But there may be bugs that
+		 * cause us to do that.
+		 */
+		if (!resource_size(res))
+			goto no_mmap;
+
+		if (resource_size(res) >= PAGE_SIZE) {
+			vdev->bar_mmap_supported[bar] = true;
+			continue;
+		}
+
+		if (!(res->start & ~PAGE_MASK)) {
+			/*
+			 * Add a dummy resource to reserve the remainder
+			 * of the exclusive page in case that hot-add
+			 * device's bar is assigned into it.
+			 */
+			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
+			if (dummy_res == NULL)
+				goto no_mmap;
+
+			dummy_res->resource.start = res->end + 1;
+			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
+			dummy_res->resource.flags = res->flags;
+			if (request_resource(res->parent,
+						&dummy_res->resource)) {
+				kfree(dummy_res);
+				goto no_mmap;
+			}
+			dummy_res->index = bar;
+			list_add(&dummy_res->res_next,
+					&vdev->dummy_resources_list);
+			vdev->bar_mmap_supported[bar] = true;
+			continue;
+		}
+		/*
+		 * Here we don't handle the case when the BAR is not page
+		 * aligned because we can't expect the BAR will be
+		 * assigned into the same location in a page in guest
+		 * when we passthrough the BAR. And it's hard to access
+		 * this BAR in userspace because we have no way to get
+		 * the BAR's location in a page.
+		 */
+no_mmap:
+		vdev->bar_mmap_supported[bar] = false;
+	}
+}
+
 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
 static void vfio_pci_disable(struct vfio_pci_device *vdev);
 
@@ -218,12 +285,15 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
 		}
 	}
 
+	vfio_pci_probe_mmaps(vdev);
+
 	return 0;
 }
 
 static void vfio_pci_disable(struct vfio_pci_device *vdev)
 {
 	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_dummy_resource *dummy_res, *tmp;
 	int i, bar;
 
 	/* Stop the device from further DMA */
@@ -252,6 +322,13 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
 		vdev->barmap[bar] = NULL;
 	}
 
+	list_for_each_entry_safe(dummy_res, tmp,
+				 &vdev->dummy_resources_list, res_next) {
+		list_del(&dummy_res->res_next);
+		release_resource(&dummy_res->resource);
+		kfree(dummy_res);
+	}
+
 	vdev->needs_reset = true;
 
 	/*
@@ -623,9 +700,7 @@ static long vfio_pci_ioctl(void *device_data,
 
 			info.flags = VFIO_REGION_INFO_FLAG_READ |
 				     VFIO_REGION_INFO_FLAG_WRITE;
-			if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) &&
-			    pci_resource_flags(pdev, info.index) &
-			    IORESOURCE_MEM && info.size >= PAGE_SIZE) {
+			if (vdev->bar_mmap_supported[info.index]) {
 				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
 				if (info.index == vdev->msix_bar) {
 					ret = msix_sparse_mmap_cap(vdev, &caps);
@@ -1049,16 +1124,16 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 		return -EINVAL;
 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
 		return -EINVAL;
-	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
+	if (!vdev->bar_mmap_supported[index])
 		return -EINVAL;
 
-	phys_len = pci_resource_len(pdev, index);
+	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
 	req_len = vma->vm_end - vma->vm_start;
 	pgoff = vma->vm_pgoff &
 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
 	req_start = pgoff << PAGE_SHIFT;
 
-	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
+	if (req_start + req_len > phys_len)
 		return -EINVAL;
 
 	if (index == vdev->msix_bar) {
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 016c14a..2128de8 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -57,9 +57,16 @@ struct vfio_pci_region {
 	u32				flags;
 };
 
+struct vfio_pci_dummy_resource {
+	struct resource		resource;
+	int			index;
+	struct list_head	res_next;
+};
+
 struct vfio_pci_device {
 	struct pci_dev		*pdev;
 	void __iomem		*barmap[PCI_STD_RESOURCE_END + 1];
+	bool			bar_mmap_supported[PCI_STD_RESOURCE_END + 1];
 	u8			*pci_config_map;
 	u8			*vconfig;
 	struct perm_bits	*msi_perm;
@@ -88,6 +95,7 @@ struct vfio_pci_device {
 	int			refcnt;
 	struct eventfd_ctx	*err_trigger;
 	struct eventfd_ctx	*req_trigger;
+	struct list_head	dummy_resources_list;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2016-06-30  3:30 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <201605301311.u4UD99he028186@mx0a-001b2d01.pphosted.com>
2016-06-22 22:04 ` [PATCH v4] vfio-pci: Allow to mmap sub-page MMIO BARs if the mmio page is exclusive Alex Williamson
2016-06-23  2:39   ` Yongji Xie
2016-06-23  2:54     ` Alex Williamson
2016-06-23 16:12 ` Alex Williamson
2016-06-24  2:52   ` Yongji Xie
2016-06-24  3:37     ` Alex Williamson
2016-06-24  4:06       ` Tian, Kevin
2016-06-24 15:37       ` Yongji Xie
2016-06-24 16:43         ` Alex Williamson
2016-06-28 10:09           ` Yongji Xie
2016-06-28 19:47             ` Alex Williamson
2016-06-29 20:03               ` Alex Williamson
2016-06-30  2:40                 ` Yongji Xie
2016-06-30  2:53                   ` Alex Williamson
2016-06-30  3:29                     ` Yongji Xie
2016-05-30 13:06 Yongji Xie
  -- strict thread matches above, loose matches on Subject: below --
2016-05-30 13:06 Yongji Xie

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.