* [PATCH kvmtool 1/1] vfio/pci: Support NVM Express device passthrough
From: Dongli Si @ 2023-01-28  7:35 UTC
  To: kvm

From: Dongli Si <sidongli1997@gmail.com>

When passing through an NVMe SSD, the guest kernel reports this error:

[   18.339460] nvme nvme0: failed to register the CMB

This happens because the MMIO data of region 0 of the NVMe device is
not mapped, causing the NVMe driver to read a wrong CMB size.

NVMe devices have only one region, so we need to set up both the MMIO
data and the MSI-X table in this region and keep them from overlapping.
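
For illustration, a typical layout of such a shared BAR0 (the offsets
here are examples, not taken from a specific device):

  0x0000  NVMe controller registers (CAP, VS, CC, CSTS, doorbells, ...)
  0x2000  MSI-X table  <- trapped and emulated by kvmtool
  0x3000  MSI-X PBA
  0x4000  CMB, if the controller exposes one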

Signed-off-by: Dongli Si <sidongli1997@gmail.com>
---
 include/kvm/vfio.h |  1 +
 vfio/pci.c         | 33 +++++++++++++++++++++++++++++++--
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
index 764ab9b..c30a0d3 100644
--- a/include/kvm/vfio.h
+++ b/include/kvm/vfio.h
@@ -43,6 +43,7 @@ struct vfio_pci_msi_entry {
 struct vfio_pci_msix_table {
 	size_t				size;
 	unsigned int			bar;
+	u32				bar_offset; /* in the shared BAR */
 	u32				guest_phys_addr;
 };
 
diff --git a/vfio/pci.c b/vfio/pci.c
index 78f5ca5..f38c0b5 100644
--- a/vfio/pci.c
+++ b/vfio/pci.c
@@ -497,10 +497,31 @@ static int vfio_pci_bar_activate(struct kvm *kvm,
 		region->guest_phys_addr = bar_addr;
 
 	if (has_msix && (u32)bar_num == table->bar) {
-		table->guest_phys_addr = region->guest_phys_addr;
+		table->guest_phys_addr = region->guest_phys_addr + table->bar_offset;
 		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
 					 table->size, false,
 					 vfio_pci_msix_table_access, pdev);
+
+		/*
+		 * This is to support NVMe devices: because the MSI-X table
+		 * shares a region with the MMIO data, we must keep
+		 * vfio_map_region() from mapping memory over the MSI-X table.
+		 *
+		 * Truncate the mapping so that it ends at the start address
+		 * of the MSI-X table. In theory we should also map the
+		 * memory between the end of the MSI-X table and the end of
+		 * the region, but the Linux NVMe driver does not use that
+		 * part of the BAR.
+		 *
+		 * The Linux NVMe driver does not use the PBA either, so
+		 * skip the PBA emulation entirely.
+		 */
+		if (pdev->hdr.class[0] == 2 && pdev->hdr.class[1] == 8
+		    && pdev->hdr.class[2] == 1) {
+			region->info.size = table->bar_offset;
+			goto map;
+		}
+
 		/*
 		 * The MSIX table and the PBA structure can share the same BAR,
 		 * but for convenience we register different regions for mmio
@@ -522,6 +543,7 @@ static int vfio_pci_bar_activate(struct kvm *kvm,
 		goto out;
 	}
 
+map:
 	ret = vfio_map_region(kvm, vdev, region);
 out:
 	return ret;
@@ -548,6 +570,12 @@ static int vfio_pci_bar_deactivate(struct kvm *kvm,
 		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
 		/* kvm__deregister_mmio fails when the region is not found. */
 		ret = (success ? 0 : -ENOENT);
+
+		/* NVMe: PBA emulation was skipped, see vfio_pci_bar_activate(). */
+		if (pdev->hdr.class[0] == 2 && pdev->hdr.class[1] == 8
+		    && pdev->hdr.class[2] == 1)
+			goto unmap;
+
 		/* See vfio_pci_bar_activate(). */
 		if (ret < 0 || table->bar != pba->bar)
 			goto out;
@@ -559,6 +587,7 @@ static int vfio_pci_bar_deactivate(struct kvm *kvm,
 		goto out;
 	}
 
+unmap:
 	vfio_unmap_region(kvm, region);
 	ret = 0;
 
@@ -832,7 +861,6 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
 					   pba_bar_offset;
 
 		/* Tidy up the capability */
-		msix->table_offset &= PCI_MSIX_TABLE_BIR;
 		if (pdev->msix_table.bar == pdev->msix_pba.bar) {
 			/* Keep the same offset as the MSIX cap. */
 			pdev->msix_pba.bar_offset = pba_bar_offset;
@@ -907,6 +935,7 @@ static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
 	struct vfio_region_info info;
 
 	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
+	table->bar_offset = msix->table_offset & PCI_MSIX_TABLE_OFFSET;
 	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
 
 	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
-- 
2.37.3



* Re: [PATCH kvmtool 1/1] vfio/pci: Support NVM Express device passthrough
From: Alexandru Elisei @ 2023-01-30 11:01 UTC
  To: Dongli Si; +Cc: kvm, will, julien.thierry.kdev

Hi,

(Adding the maintainers)

On Sat, Jan 28, 2023 at 03:35:51PM +0800, Dongli Si wrote:
> From: Dongli Si <sidongli1997@gmail.com>
> 
> When passing through an NVMe SSD, the guest kernel reports this error:
> 
> [   18.339460] nvme nvme0: failed to register the CMB
> 
> This happens because the MMIO data of region 0 of the NVMe device is
> not mapped, causing the NVMe driver to read a wrong CMB size.
> 
> NVMe devices have only one region, so we need to set up both the MMIO
> data and the MSI-X table in this region and keep them from overlapping.

Thank you for doing this; it has been a known issue for years.

This is actually about more than the CMB (controller memory buffer):
the root problem is that the NVMe controller registers are not mapped,
because kvmtool doesn't support mapping anything else alongside the
MSI-X table/PBA in the same BAR.

> 
> Signed-off-by: Dongli Si <sidongli1997@gmail.com>
> ---
>  include/kvm/vfio.h |  1 +
>  vfio/pci.c         | 33 +++++++++++++++++++++++++++++++--
>  2 files changed, 32 insertions(+), 2 deletions(-)
> 
> diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
> index 764ab9b..c30a0d3 100644
> --- a/include/kvm/vfio.h
> +++ b/include/kvm/vfio.h
> @@ -43,6 +43,7 @@ struct vfio_pci_msi_entry {
>  struct vfio_pci_msix_table {
>  	size_t				size;
>  	unsigned int			bar;
> +	u32				bar_offset; /* in the shared BAR */
>  	u32				guest_phys_addr;
>  };
>  
> diff --git a/vfio/pci.c b/vfio/pci.c
> index 78f5ca5..f38c0b5 100644
> --- a/vfio/pci.c
> +++ b/vfio/pci.c
> @@ -497,10 +497,31 @@ static int vfio_pci_bar_activate(struct kvm *kvm,
>  		region->guest_phys_addr = bar_addr;
>  
>  	if (has_msix && (u32)bar_num == table->bar) {
> -		table->guest_phys_addr = region->guest_phys_addr;
> +		table->guest_phys_addr = region->guest_phys_addr + table->bar_offset;
>  		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
>  					 table->size, false,
>  					 vfio_pci_msix_table_access, pdev);
> +
> +		/*
> +		 * This is to support NVMe devices: because the MSI-X table
> +		 * shares a region with the MMIO data, we must keep
> +		 * vfio_map_region() from mapping memory over the MSI-X table.
> +		 *
> +		 * Truncate the mapping so that it ends at the start address
> +		 * of the MSI-X table. In theory we should also map the
> +		 * memory between the end of the MSI-X table and the end of
> +		 * the region, but the Linux NVMe driver does not use that
> +		 * part of the BAR.
> +		 *
> +		 * The Linux NVMe driver does not use the PBA either, so
> +		 * skip the PBA emulation entirely.

There is no need to remove the PBA emulation. This patch adds the MSI-X
table offset, and kvmtool already tracks the PBA offset in the BAR, so
kvmtool has everything it needs (guest physical address and size) to
trap and emulate accesses.
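
Keeping it would mirror the MSI-X table path. A minimal sketch
(untested; it assumes a pba->bar_offset field analogous to the
table->bar_offset added by this patch):

	if (has_msix && (u32)bar_num == pba->bar) {
		pba->guest_phys_addr = region->guest_phys_addr +
				       pba->bar_offset;
		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
					 pba->size, false,
					 vfio_pci_msix_pba_access, pdev);
		if (ret < 0)
			goto out;
	}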

> +		 */
> +		if (pdev->hdr.class[0] == 2 && pdev->hdr.class[1] == 8
> +		    && pdev->hdr.class[2] == 1) {
> +			region->info.size = table->bar_offset;
> +			goto map;
> +		}
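
The three bytes tested above are the PCI class code stored low byte
first: prog-if 0x02 (NVMe), subclass 0x08 (non-volatile memory), base
class 0x01 (mass storage), i.e. class code 0x010802. A named helper
would make the intent clearer; a minimal sketch, assuming kvmtool's
struct pci_device_header with its u8 class[3] field:

	static bool vfio_pci_is_nvme(struct pci_device_header *hdr)
	{
		/* PCI class code 0x010802: mass storage, NVM, NVMe */
		return hdr->class[2] == 0x01 && hdr->class[1] == 0x08 &&
		       hdr->class[0] == 0x02;
	}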

I would prefer this to be more generic, so that any device can put an
MMIO region in the same BAR as the MSI-X table/PBA. Do you have
concerns about this?

kvmtool can check that region.info.size is larger than the size of the
MSI-X table + PBA (if they share the same BAR), and call
vfio_map_region() for the rest of the BAR.
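
A rough sketch of that approach (untested; vfio_map_region_part() is a
hypothetical variant of vfio_map_region() that maps only the given
offset/size window of the BAR, pba->bar_offset is as in the sketch
above, and the usual table-before-PBA layout is assumed):

	u64 lo_size = table->bar_offset;           /* MMIO below the table */
	u64 hi_off  = pba->bar_offset + pba->size; /* first byte after PBA */

	if (lo_size)
		ret = vfio_map_region_part(kvm, vdev, region, 0, lo_size);
	if (!ret && region->info.size > hi_off)
		ret = vfio_map_region_part(kvm, vdev, region, hi_off,
					   region->info.size - hi_off);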

Thanks,
Alex

> +
>  		/*
>  		 * The MSIX table and the PBA structure can share the same BAR,
>  		 * but for convenience we register different regions for mmio
> @@ -522,6 +543,7 @@ static int vfio_pci_bar_activate(struct kvm *kvm,
>  		goto out;
>  	}
>  
> +map:
>  	ret = vfio_map_region(kvm, vdev, region);
>  out:
>  	return ret;
> @@ -548,6 +570,12 @@ static int vfio_pci_bar_deactivate(struct kvm *kvm,
>  		success = kvm__deregister_mmio(kvm, table->guest_phys_addr);
>  		/* kvm__deregister_mmio fails when the region is not found. */
>  		ret = (success ? 0 : -ENOENT);
> +
> +		/* NVMe: PBA emulation was skipped, see vfio_pci_bar_activate(). */
> +		if (pdev->hdr.class[0] == 2 && pdev->hdr.class[1] == 8
> +		    && pdev->hdr.class[2] == 1)
> +			goto unmap;
> +
>  		/* See vfio_pci_bar_activate(). */
>  		if (ret < 0 || table->bar != pba->bar)
>  			goto out;
> @@ -559,6 +587,7 @@ static int vfio_pci_bar_deactivate(struct kvm *kvm,
>  		goto out;
>  	}
>  
> +unmap:
>  	vfio_unmap_region(kvm, region);
>  	ret = 0;
>  
> @@ -832,7 +861,6 @@ static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev)
>  					   pba_bar_offset;
>  
>  		/* Tidy up the capability */
> -		msix->table_offset &= PCI_MSIX_TABLE_BIR;
>  		if (pdev->msix_table.bar == pdev->msix_pba.bar) {
>  			/* Keep the same offset as the MSIX cap. */
>  			pdev->msix_pba.bar_offset = pba_bar_offset;
> @@ -907,6 +935,7 @@ static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev)
>  	struct vfio_region_info info;
>  
>  	table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR;
> +	table->bar_offset = msix->table_offset & PCI_MSIX_TABLE_OFFSET;
>  	pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR;
>  
>  	nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
> -- 
> 2.37.3
> 

