All of lore.kernel.org
 help / color / mirror / Atom feed
From: Brett Creeley <brett.creeley@amd.com>
To: <jgg@ziepe.ca>, <yishaih@nvidia.com>,
	<shameerali.kolothum.thodi@huawei.com>, <kevin.tian@intel.com>,
	<alex.williamson@redhat.com>, <kvm@vger.kernel.org>,
	<linux-kernel@vger.kernel.org>
Cc: <shannon.nelson@amd.com>, <brett.creeley@amd.com>
Subject: [PATCH v2 vfio 6/6] vfio/pds: Add multi-region support
Date: Thu, 16 Nov 2023 16:12:07 -0800	[thread overview]
Message-ID: <20231117001207.2793-7-brett.creeley@amd.com> (raw)
In-Reply-To: <20231117001207.2793-1-brett.creeley@amd.com>

Only supporting a single region/range is limiting,
wasteful, and in some cases broken (i.e. when there
are large gaps in the iova memory ranges). Fix this
by adding support for multiple regions based on
what the device tells the driver it can support.

Signed-off-by: Brett Creeley <brett.creeley@amd.com>
Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
---
 drivers/vfio/pci/pds/dirty.c | 220 ++++++++++++++++++++++++-----------
 drivers/vfio/pci/pds/dirty.h |   4 +-
 2 files changed, 156 insertions(+), 68 deletions(-)

diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c
index 4824cdfe01ed..8ddf4346fcd5 100644
--- a/drivers/vfio/pci/pds/dirty.c
+++ b/drivers/vfio/pci/pds/dirty.c
@@ -70,7 +70,7 @@ pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio,
 	kfree(region_info);
 }
 
-static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty,
+static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_region *region,
 					unsigned long bytes)
 {
 	unsigned long *host_seq_bmp, *host_ack_bmp;
@@ -85,20 +85,27 @@ static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty,
 		return -ENOMEM;
 	}
 
-	dirty->region.host_seq = host_seq_bmp;
-	dirty->region.host_ack = host_ack_bmp;
-	dirty->region.bmp_bytes = bytes;
+	region->host_seq = host_seq_bmp;
+	region->host_ack = host_ack_bmp;
+	region->bmp_bytes = bytes;
 
 	return 0;
 }
 
 static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty)
 {
-	vfree(dirty->region.host_seq);
-	vfree(dirty->region.host_ack);
-	dirty->region.host_seq = NULL;
-	dirty->region.host_ack = NULL;
-	dirty->region.bmp_bytes = 0;
+	if (!dirty->regions)
+		return;
+
+	for (int i = 0; i < dirty->num_regions; i++) {
+		struct pds_vfio_region *region = &dirty->regions[i];
+
+		vfree(region->host_seq);
+		vfree(region->host_ack);
+		region->host_seq = NULL;
+		region->host_ack = NULL;
+		region->bmp_bytes = 0;
+	}
 }
 
 static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
@@ -119,10 +126,17 @@ static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
 
 static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio)
 {
-	struct pds_vfio_region *region = &pds_vfio->dirty.region;
+	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
 
-	if (region->sgl)
-		__pds_vfio_dirty_free_sgl(pds_vfio, region);
+	if (!dirty->regions)
+		return;
+
+	for (int i = 0; i < dirty->num_regions; i++) {
+		struct pds_vfio_region *region = &dirty->regions[i];
+
+		if (region->sgl)
+			__pds_vfio_dirty_free_sgl(pds_vfio, region);
+	}
 }
 
 static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
@@ -156,22 +170,90 @@ static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
 	return 0;
 }
 
+static void pds_vfio_dirty_free_regions(struct pds_vfio_dirty *dirty)
+{
+	vfree(dirty->regions);
+	dirty->regions = NULL;
+	dirty->num_regions = 0;
+}
+
+static int pds_vfio_dirty_alloc_regions(struct pds_vfio_pci_device *pds_vfio,
+					struct pds_lm_dirty_region_info *region_info,
+					u64 region_page_size, u8 num_regions)
+{
+	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+	u32 dev_bmp_offset_byte = 0;
+	int err;
+
+	dirty->regions = vcalloc(num_regions, sizeof(struct pds_vfio_region));
+	if (!dirty->regions)
+		return -ENOMEM;
+	dirty->num_regions = num_regions;
+
+	for (int i = 0; i < num_regions; i++) {
+		struct pds_lm_dirty_region_info *ri = &region_info[i];
+		struct pds_vfio_region *region = &dirty->regions[i];
+		u64 region_size, region_start;
+		u32 page_count;
+
+		/* page_count might be adjusted by the device */
+		page_count = le32_to_cpu(ri->page_count);
+		region_start = le64_to_cpu(ri->dma_base);
+		region_size = page_count * region_page_size;
+
+		err = pds_vfio_dirty_alloc_bitmaps(region,
+						   page_count / BITS_PER_BYTE);
+		if (err) {
+			dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
+				ERR_PTR(err));
+			goto out_free_regions;
+		}
+
+		err = pds_vfio_dirty_alloc_sgl(pds_vfio, region, page_count);
+		if (err) {
+			dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
+				ERR_PTR(err));
+			goto out_free_regions;
+		}
+
+		region->size = region_size;
+		region->start = region_start;
+		region->page_size = region_page_size;
+		region->dev_bmp_offset_start_byte = dev_bmp_offset_byte;
+
+		dev_bmp_offset_byte += page_count / BITS_PER_BYTE;
+		if (dev_bmp_offset_byte % BITS_PER_BYTE) {
+			dev_err(&pdev->dev, "Device bitmap offset is mis-aligned\n");
+			err = -EINVAL;
+			goto out_free_regions;
+		}
+	}
+
+	return 0;
+
+out_free_regions:
+	pds_vfio_dirty_free_bitmaps(dirty);
+	pds_vfio_dirty_free_sgl(pds_vfio);
+	pds_vfio_dirty_free_regions(dirty);
+
+	return err;
+}
+
 static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
 				 struct rb_root_cached *ranges, u32 nnodes,
 				 u64 *page_size)
 {
 	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
 	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
-	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
-	u64 region_start, region_size, region_page_size;
 	struct pds_lm_dirty_region_info *region_info;
 	struct interval_tree_node *node = NULL;
+	u64 region_page_size = *page_size;
 	u8 max_regions = 0, num_regions;
 	dma_addr_t regions_dma = 0;
 	u32 num_ranges = nnodes;
-	u32 page_count;
-	u16 len;
 	int err;
+	u16 len;
 
 	dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
 		pds_vfio->vf_id);
@@ -198,39 +280,38 @@ static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
 		return -EOPNOTSUPP;
 	}
 
-	/*
-	 * Only support 1 region for now. If there are any large gaps in the
-	 * VM's address regions, then this would be a waste of memory as we are
-	 * generating 2 bitmaps (ack/seq) from the min address to the max
-	 * address of the VM's address regions. In the future, if we support
-	 * more than one region in the device/driver we can split the bitmaps
-	 * on the largest address region gaps. We can do this split up to the
-	 * max_regions times returned from the dirty_status command.
-	 */
-	max_regions = 1;
 	if (num_ranges > max_regions) {
 		vfio_combine_iova_ranges(ranges, nnodes, max_regions);
 		num_ranges = max_regions;
 	}
 
+	region_info = kcalloc(num_ranges, sizeof(*region_info), GFP_KERNEL);
+	if (!region_info)
+		return -ENOMEM;
+	len = num_ranges * sizeof(*region_info);
+
 	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
 	if (!node)
 		return -EINVAL;
+	for (int i = 0; i < num_ranges; i++) {
+		struct pds_lm_dirty_region_info *ri = &region_info[i];
+		u64 region_size = node->last - node->start + 1;
+		u64 region_start = node->start;
+		u32 page_count;
 
-	region_size = node->last - node->start + 1;
-	region_start = node->start;
-	region_page_size = *page_size;
+		page_count = DIV_ROUND_UP(region_size, region_page_size);
 
-	len = sizeof(*region_info);
-	region_info = kzalloc(len, GFP_KERNEL);
-	if (!region_info)
-		return -ENOMEM;
+		ri->dma_base = cpu_to_le64(region_start);
+		ri->page_count = cpu_to_le32(page_count);
+		ri->page_size_log2 = ilog2(region_page_size);
 
-	page_count = DIV_ROUND_UP(region_size, region_page_size);
+		dev_dbg(&pdev->dev,
+			"region_info[%d]: region_start 0x%llx region_end 0x%lx region_size 0x%llx page_count %u page_size %llu\n",
+			i, region_start, node->last, region_size, page_count,
+			region_page_size);
 
-	region_info->dma_base = cpu_to_le64(region_start);
-	region_info->page_count = cpu_to_le32(page_count);
-	region_info->page_size_log2 = ilog2(region_page_size);
+		node = interval_tree_iter_next(node, 0, ULONG_MAX);
+	}
 
 	regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
 				     DMA_BIDIRECTIONAL);
@@ -239,39 +320,20 @@ static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
 		goto out_free_region_info;
 	}
 
-	err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
+	err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, num_ranges);
 	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
 	if (err)
 		goto out_free_region_info;
 
-	/*
-	 * page_count might be adjusted by the device,
-	 * update it before freeing region_info DMA
-	 */
-	page_count = le32_to_cpu(region_info->page_count);
-
-	dev_dbg(&pdev->dev,
-		"region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n",
-		regions_dma, region_start, page_count,
-		(u8)ilog2(region_page_size));
-
-	err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
-	if (err) {
-		dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
-			ERR_PTR(err));
-		goto out_free_region_info;
-	}
-
-	err = pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->region, page_count);
+	err = pds_vfio_dirty_alloc_regions(pds_vfio, region_info,
+					   region_page_size, num_ranges);
 	if (err) {
-		dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
-			ERR_PTR(err));
-		goto out_free_bitmaps;
+		dev_err(&pdev->dev,
+			"Failed to allocate %d regions for tracking dirty regions: %pe\n",
+			num_regions, ERR_PTR(err));
+		goto out_dirty_disable;
 	}
 
-	dirty->region.start = region_start;
-	dirty->region.size = region_size;
-	dirty->region.page_size = region_page_size;
 	pds_vfio_dirty_set_enabled(pds_vfio);
 
 	pds_vfio_print_guest_region_info(pds_vfio, max_regions);
@@ -280,8 +342,8 @@ static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
 
 	return 0;
 
-out_free_bitmaps:
-	pds_vfio_dirty_free_bitmaps(dirty);
+out_dirty_disable:
+	pds_vfio_dirty_disable_cmd(pds_vfio);
 out_free_region_info:
 	kfree(region_info);
 	return err;
@@ -295,6 +357,7 @@ void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
 			pds_vfio_dirty_disable_cmd(pds_vfio);
 		pds_vfio_dirty_free_sgl(pds_vfio);
 		pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
+		pds_vfio_dirty_free_regions(&pds_vfio->dirty);
 	}
 
 	if (send_cmd)
@@ -365,6 +428,7 @@ static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
 
 	num_sge = sg_table.nents;
 	size = num_sge * sizeof(struct pds_lm_sg_elem);
+	offset += region->dev_bmp_offset_start_byte;
 	dma_sync_single_for_device(pdsc_dev, region->sgl_addr, size, dma_dir);
 	err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, region->sgl_addr, num_sge,
 					 offset, bmp_bytes, read_seq);
@@ -437,13 +501,28 @@ static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
 	return 0;
 }
 
+static struct pds_vfio_region *
+pds_vfio_get_region(struct pds_vfio_pci_device *pds_vfio, unsigned long iova)
+{
+	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+
+	for (int i = 0; i < dirty->num_regions; i++) {
+		struct pds_vfio_region *region = &dirty->regions[i];
+
+		if (iova >= region->start &&
+		    iova < (region->start + region->size))
+			return region;
+	}
+
+	return NULL;
+}
+
 static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
 			       struct iova_bitmap *dirty_bitmap,
 			       unsigned long iova, unsigned long length)
 {
 	struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
-	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
-	struct pds_vfio_region *region = &dirty->region;
+	struct pds_vfio_region *region;
 	u64 bmp_offset, bmp_bytes;
 	u64 bitmap_size, pages;
 	int err;
@@ -456,6 +535,13 @@ static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
 		return -EINVAL;
 	}
 
+	region = pds_vfio_get_region(pds_vfio, iova);
+	if (!region) {
+		dev_err(dev, "vf%u: Failed to find region that contains iova 0x%lx length 0x%lx\n",
+			pds_vfio->vf_id, iova, length);
+		return -EINVAL;
+	}
+
 	pages = DIV_ROUND_UP(length, region->page_size);
 	bitmap_size =
 		round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h
index a1f6d894f913..c8e23018b801 100644
--- a/drivers/vfio/pci/pds/dirty.h
+++ b/drivers/vfio/pci/pds/dirty.h
@@ -13,11 +13,13 @@ struct pds_vfio_region {
 	u64 page_size;
 	struct pds_lm_sg_elem *sgl;
 	dma_addr_t sgl_addr;
+	u32 dev_bmp_offset_start_byte;
 	u16 num_sge;
 };
 
 struct pds_vfio_dirty {
-	struct pds_vfio_region region;
+	struct pds_vfio_region *regions;
+	u8 num_regions;
 	bool is_enabled;
 };
 
-- 
2.17.1


  parent reply	other threads:[~2023-11-17  0:13 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-17  0:12 [PATCH v2 vfio 0/6] vfio/pds: Clean-ups and multi-region support Brett Creeley
2023-11-17  0:12 ` [PATCH v2 vfio 1/6] vfio/pds: Fix calculations in pds_vfio_dirty_sync Brett Creeley
2023-11-17  0:12 ` [PATCH v2 vfio 2/6] vfio/pds: Only use a single SGL for both seq and ack Brett Creeley
2023-11-17  0:12 ` [PATCH v2 vfio 3/6] vfio/pds: Move and rename region specific info Brett Creeley
2023-11-17  0:12 ` [PATCH v2 vfio 4/6] vfio/pds: Pass region info to relevant functions Brett Creeley
2023-11-17  0:12 ` [PATCH v2 vfio 5/6] vfio/pds: Move seq/ack bitmaps into region struct Brett Creeley
2023-11-17  0:12 ` Brett Creeley [this message]
2023-11-29 20:18 ` [PATCH v2 vfio 0/6] vfio/pds: Clean-ups and multi-region support Jason Gunthorpe
2023-12-05  0:00 ` Alex Williamson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231117001207.2793-7-brett.creeley@amd.com \
    --to=brett.creeley@amd.com \
    --cc=alex.williamson@redhat.com \
    --cc=jgg@ziepe.ca \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=shameerali.kolothum.thodi@huawei.com \
    --cc=shannon.nelson@amd.com \
    --cc=yishaih@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.