All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chenbo Xia <chenbo.xia@intel.com>
To: dev@dpdk.org, thomas@monjalon.net, cunming.liang@intel.com,
	jingjing.wu@intel.com
Cc: anatoly.burakov@intel.com, ferruh.yigit@intel.com, mdr@ashroe.eu,
	nhorman@tuxdriver.com, bruce.richardson@intel.com,
	david.marchand@redhat.com, stephen@networkplumber.org,
	konstantin.ananyev@intel.com
Subject: [dpdk-dev] [RFC v3 6/6] bus/pci: add sparse mmap support for mediated PCI devices
Date: Tue,  1 Jun 2021 11:06:44 +0800	[thread overview]
Message-ID: <20210601030644.3318-7-chenbo.xia@intel.com> (raw)
In-Reply-To: <20210601030644.3318-1-chenbo.xia@intel.com>

This patch adds sparse mmap support to the PCI bus. Sparse mmap is a
capability defined in VFIO which allows multiple mmap areas in one
VFIO region. Mediated PCI devices can use this capability to let the
mdev parent driver keep control over accesses to the non-mmapable
parts of a region.

Signed-off-by: Chenbo Xia <chenbo.xia@intel.com>
---
 drivers/bus/pci/linux/pci_vfio.c | 229 +++++++++++++++++++++++++++----
 drivers/bus/pci/private.h        |   2 +
 drivers/bus/pci/rte_bus_pci.h    |  18 ++-
 3 files changed, 218 insertions(+), 31 deletions(-)

diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 00ba5db03a..e68eccb63f 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -654,6 +654,82 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
 	return 0;
 }
 
+/*
+ * Map the nr_areas sparse mmap areas in vfio_areas for BAR bar_index and
+ * record them in the BAR's pci_map. Returns 0 on success or when there is
+ * nothing to map; on failure unmaps its areas, frees them and returns -1.
+ */
+static int
+pci_vfio_sparse_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
+		struct vfio_region_sparse_mmap_area *vfio_areas,
+		uint32_t nr_areas, int bar_index, int additional_flags,
+		int numa_node)
+{
+	struct pci_map *map = &vfio_res->maps[bar_index];
+	struct rte_mem_map_area *area;
+	struct vfio_region_sparse_mmap_area *sparse;
+	void *bar_addr;
+	uint32_t i, j;
+
+	map->nr_areas = nr_areas;
+	if (map->size == 0) {
+		RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d\n", bar_index);
+		return 0;
+	}
+
+	if (!map->nr_areas) {
+		RTE_LOG(DEBUG, EAL, "Skip bar %d with no sparse mmap areas\n",
+			bar_index);
+		map->areas = NULL;
+		return 0;
+	}
+
+	if (map->areas == NULL) {
+		map->areas = rte_zmalloc_socket(NULL,
+				sizeof(*map->areas) * nr_areas,
+				RTE_CACHE_LINE_SIZE, numa_node);
+		if (map->areas == NULL) {
+			RTE_LOG(ERR, EAL,
+				"Cannot alloc memory for sparse map areas\n");
+			map->nr_areas = 0; /* keep the count in sync with areas */
+			return -1;
+		}
+	}
+
+	for (i = 0; i < map->nr_areas; i++) {
+		area = &map->areas[i];
+		sparse = &vfio_areas[i];
+		/* Reserve an inaccessible range, then map the area at that address. */
+		/* NOTE(review): every area uses map->addr as its hint - confirm. */
+		bar_addr = mmap(map->addr, sparse->size, 0, MAP_PRIVATE |
+				MAP_ANONYMOUS | additional_flags, -1, 0);
+		if (bar_addr == MAP_FAILED) {
+			RTE_LOG(ERR, EAL, "Failed to create inaccessible mapping for BAR%d\n",
+				bar_index);
+			goto err_map;
+		}
+		area->addr = pci_map_resource(bar_addr, vfio_dev_fd,
+			map->offset + sparse->offset, sparse->size,
+			RTE_MAP_FORCE_ADDRESS);
+		if (area->addr == NULL) {
+			munmap(bar_addr, sparse->size);
+			RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n", bar_index);
+			goto err_map;
+		}
+		area->offset = sparse->offset;
+		area->size = sparse->size;
+	}
+	return 0;
+
+err_map:
+	for (j = 0; j < i; j++)
+		pci_unmap_resource(map->areas[j].addr, map->areas[j].size);
+	rte_free(map->areas);
+	map->areas = NULL; /* was left dangling: later cleanup could re-free it */
+	map->nr_areas = 0;
+	return -1;
+}
+
 /*
  * region info may contain capability headers, so we need to keep reallocating
  * the memory until we match allocated memory size with argsz.
@@ -770,6 +846,31 @@ pci_vfio_fill_regions(struct rte_pci_device *dev, int vfio_dev_fd,
 	return 0;
 }
 
+/*
+ * Unmap every BAR in vfio_res->maps and free each BAR's sparse area array.
+ * NOTE(review): confirm every caller owns map->areas before it is freed.
+ */
+static void
+clean_up_pci_resource(struct mapped_pci_resource *vfio_res)
+{
+	struct pci_map *map;
+	uint32_t i, j;
+
+	for (i = 0; i < PCI_MAX_RESOURCE; i++) {
+		map = &vfio_res->maps[i];
+		if (map->nr_areas > 1) {
+			for (j = 0; j < map->nr_areas; j++)
+				pci_unmap_resource(map->areas[j].addr, map->areas[j].size);
+		} else if (map->addr) {
+			/* MSI-X BAR mappings need no special handling here. */
+			pci_unmap_resource(map->addr, map->size);
+		}
+		rte_free(map->areas); /* per BAR, not just the last one */
+		map->areas = NULL;
+		map->nr_areas = 0;
+	}
+}
+
 static int
 pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 {
@@ -866,6 +967,8 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 
 	for (i = 0; i < vfio_res->nb_maps; i++) {
 		void *bar_addr;
+		struct vfio_info_cap_header *hdr;
+		struct vfio_region_info_cap_sparse_mmap *sparse;
 
 		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
 		if (ret < 0) {
@@ -911,15 +1014,59 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 		maps[i].size = reg->size;
 		maps[i].path = NULL; /* vfio doesn't have per-resource paths */
 
-		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
-		if (ret < 0) {
-			RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
-					pci_addr, i, strerror(errno));
-			free(reg);
-			goto err_vfio_res;
-		}
+		hdr = pci_vfio_info_cap(reg, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
+
+		if (dev->is_mdev && hdr != NULL) {
+			sparse = container_of(hdr,
+				struct vfio_region_info_cap_sparse_mmap,
+				header);
+
+			ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res,
+				sparse->areas, sparse->nr_areas, i, 0,
+				dev->device.numa_node);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "%s sparse mapping BAR%i failed: %s\n",
+						pci_addr, i, strerror(errno));
+				free(reg);
+				goto err_vfio_res;
+			}
 
-		dev->mem_resource[i].addr = maps[i].addr;
+			dev->sparse_mem[i].size = reg->size;
+			dev->sparse_mem[i].nr_maps = vfio_res->maps[i].nr_areas;
+			dev->sparse_mem[i].areas = vfio_res->maps[i].areas;
+		} else {
+			ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
+						pci_addr, i, strerror(errno));
+				free(reg);
+				goto err_vfio_res;
+			}
+
+			if (dev->is_mdev) {
+				struct pci_map *mdev_map = &maps[i];
+				mdev_map->nr_areas = 1;
+				mdev_map->areas = rte_zmalloc_socket(NULL,
+					sizeof(*mdev_map->areas),
+					RTE_CACHE_LINE_SIZE,
+					dev->device.numa_node);
+				if (maps[i].areas == NULL) {
+					RTE_LOG(ERR, EAL,
+						"Cannot allocate memory for sparse map areas\n");
+					goto err_vfio_res;
+				}
+				mdev_map->areas[0].addr = maps[i].addr;
+				mdev_map->areas[0].offset = 0;
+				mdev_map->areas[0].size = reg->size;
+				dev->sparse_mem[i].size = reg->size;
+				dev->sparse_mem[i].nr_maps = 1;
+				dev->sparse_mem[i].areas = mdev_map->areas;
+			} else {
+				maps[i].nr_areas = 0;
+				maps[i].areas = NULL;
+				dev->mem_resource[i].addr = maps[i].addr;
+			}
+		}
 
 		free(reg);
 	}
@@ -940,6 +1087,7 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 
 	return 0;
 err_vfio_res:
+	clean_up_pci_resource(vfio_res);
 	rte_free(vfio_res);
 err_vfio_dev_fd:
 	rte_vfio_release_device(rte_pci_get_sysfs_path(),
@@ -960,7 +1108,7 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 	struct mapped_pci_res_list *vfio_res_list =
 		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
 
-	struct pci_map *maps;
+	struct pci_map *maps, *cur;
 
 	dev->intr_handle.fd = -1;
 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
@@ -1012,14 +1160,49 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 	maps = vfio_res->maps;
 
 	for (i = 0; i < vfio_res->nb_maps; i++) {
-		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
-		if (ret < 0) {
-			RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
-					pci_addr, i, strerror(errno));
-			goto err_vfio_dev_fd;
+		cur = &maps[i];
+		if (cur->nr_areas > 1) {
+			struct vfio_region_sparse_mmap_area *areas;
+			uint32_t i;
+
+			areas = malloc(sizeof(*areas) * cur->nr_areas);
+			if (areas == NULL) {
+				RTE_LOG(ERR, EAL, "Failed to alloc vfio areas for %s\n",
+					pci_addr);
+				goto err_vfio_dev_fd;
+			}
+
+			for (i = 0; i < cur->nr_areas; i++) {
+				areas[i].offset = cur->areas[i].offset;
+				areas[i].size = cur->areas[i].size;
+			}
+
+			ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res,
+				areas, cur->nr_areas, i, MAP_FIXED,
+				dev->device.numa_node);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "%s sparse mapping BAR%i failed: %s\n",
+						pci_addr, i, strerror(errno));
+				free(areas);
+				goto err_vfio_dev_fd;
+			}
+
+			free(areas);
+		} else {
+			ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res,
+				i, MAP_FIXED);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
+						pci_addr, i, strerror(errno));
+				goto err_vfio_dev_fd;
+			}
+
+			if (dev->is_mdev)
+				cur->areas[0].addr = cur->addr;
+			else
+				dev->mem_resource[i].addr = cur->addr;
 		}
 
-		dev->mem_resource[i].addr = maps[i].addr;
 	}
 
 	/* we need save vfio_dev_fd, so it can be used during release */
@@ -1054,8 +1237,6 @@ find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
 			const char *pci_addr)
 {
 	struct mapped_pci_resource *vfio_res = NULL;
-	struct pci_map *maps;
-	int i;
 
 	/* Get vfio_res */
 	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
@@ -1079,19 +1260,7 @@ find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
 	RTE_LOG(INFO, EAL, "Releasing PCI mapped resource for %s\n",
 		pci_addr);
 
-	maps = vfio_res->maps;
-	for (i = 0; i < vfio_res->nb_maps; i++) {
-
-		/*
-		 * We do not need to be aware of MSI-X table BAR mappings as
-		 * when mapping. Just using current maps array is enough
-		 */
-		if (maps[i].addr) {
-			RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
-				pci_addr, maps[i].addr);
-			pci_unmap_resource(maps[i].addr, maps[i].size);
-		}
-	}
+	clean_up_pci_resource(vfio_res);
 
 	return vfio_res;
 }
diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h
index 3515c086aa..8d94d8acf8 100644
--- a/drivers/bus/pci/private.h
+++ b/drivers/bus/pci/private.h
@@ -110,6 +110,8 @@ struct pci_map {
 	uint64_t offset;
 	uint64_t size;
 	uint64_t phaddr;
+	uint32_t nr_areas;
+	struct rte_mem_map_area *areas;
 };
 
 struct pci_msix_table {
diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h
index fb7d934bd0..ddc913f121 100644
--- a/drivers/bus/pci/rte_bus_pci.h
+++ b/drivers/bus/pci/rte_bus_pci.h
@@ -70,6 +70,18 @@ enum rte_pci_kernel_driver {
 	RTE_PCI_KDRV_NET_UIO,      /* NetUIO for Windows */
 };
 
+struct rte_mem_map_area { /* One mapped area of a sparse-mapped region */
+	void *addr;      /**< Address the area is mapped at */
+	uint64_t offset; /**< Offset of the area within its region */
+	uint64_t size;   /**< Size of the area in bytes */
+};
+
+struct rte_sparse_mem_map { /* Sparse mapping of one region, as a list of areas */
+	uint64_t size;    /**< Size of the whole region */
+	uint32_t nr_maps; /**< Number of entries in areas */
+	struct rte_mem_map_area *areas; /**< Array of mapped areas */
+};
+
 /**
  * A structure describing a PCI device.
  */
@@ -82,8 +94,12 @@ struct rte_pci_device {
 	};
 	uint8_t is_mdev;                    /**< True for mediated PCI device */
 	struct rte_pci_id id;               /**< PCI ID. */
-	struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
+	union {
+		struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
 					    /**< PCI Memory Resource */
+		struct rte_sparse_mem_map sparse_mem[PCI_MAX_RESOURCE];
+					    /**< Sparse Memory Map for Mdev */
+	};
 	struct rte_intr_handle intr_handle; /**< Interrupt handle */
 	struct rte_pci_driver *driver;      /**< PCI driver used in probing */
 	uint16_t max_vfs;                   /**< sriov enable if not zero */
-- 
2.17.1


  parent reply	other threads:[~2021-06-01  3:18 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-03  7:18 [RFC 0/3] Add mdev (Mediated device) support in DPDK Tiwei Bie
2019-04-03  7:18 ` [RFC 1/3] eal: add a helper for reading string from sysfs Tiwei Bie
2019-04-03  7:18 ` [RFC 2/3] bus/mdev: add mdev bus support Tiwei Bie
2019-04-03  7:18 ` [RFC 3/3] bus/pci: add mdev support Tiwei Bie
2019-04-03 14:13   ` Wiles, Keith
2019-04-04  4:19     ` Tiwei Bie
2019-04-08  8:44 ` [dpdk-dev] [RFC 0/3] Add mdev (Mediated device) support in DPDK Alejandro Lucero
2019-04-08  9:36   ` Tiwei Bie
2019-04-10 10:02     ` Francois Ozog
2023-07-03 23:54       ` Stephen Hemminger
2019-07-15  7:52 ` [dpdk-dev] [RFC v2 0/5] " Tiwei Bie
2019-07-15  7:52   ` [dpdk-dev] [RFC v2 1/5] bus/pci: introduce an internal representation of PCI device Tiwei Bie
2019-07-15  7:52   ` [dpdk-dev] [RFC v2 2/5] bus/pci: avoid depending on private value in kernel source Tiwei Bie
2019-07-15  7:52   ` [dpdk-dev] [RFC v2 3/5] bus/pci: introduce helper for MMIO read and write Tiwei Bie
2019-07-15  7:52   ` [dpdk-dev] [RFC v2 4/5] eal: add a helper for reading string from sysfs Tiwei Bie
2019-07-15  7:52   ` [dpdk-dev] [RFC v2 5/5] bus/pci: add mdev support Tiwei Bie
2021-06-01  3:06     ` [dpdk-dev] [RFC v3 0/6] Add mdev (Mediated device) support in DPDK Chenbo Xia
2021-06-01  3:06       ` [dpdk-dev] [RFC v3 1/6] bus/pci: introduce an internal representation of PCI device Chenbo Xia
2021-06-01  3:06       ` [dpdk-dev] [RFC v3 2/6] bus/pci: avoid depending on private value in kernel source Chenbo Xia
2021-06-01  3:06       ` [dpdk-dev] [RFC v3 3/6] bus/pci: introduce helper for MMIO read and write Chenbo Xia
2021-06-01  3:06       ` [dpdk-dev] [RFC v3 4/6] eal: add a helper for reading string from sysfs Chenbo Xia
2021-06-01  5:37         ` Stephen Hemminger
2021-06-08  5:47           ` Xia, Chenbo
2021-06-01  5:39         ` Stephen Hemminger
2021-06-08  5:48           ` Xia, Chenbo
2021-06-11  7:19         ` Thomas Monjalon
2021-06-01  3:06       ` [dpdk-dev] [RFC v3 5/6] bus/pci: add mdev support Chenbo Xia
2021-06-01  3:06       ` Chenbo Xia [this message]
2021-06-11  7:15       ` [dpdk-dev] [RFC v3 0/6] Add mdev (Mediated device) support in DPDK Thomas Monjalon
2021-06-15  2:49         ` Xia, Chenbo
2021-06-15  7:48           ` Thomas Monjalon
2021-06-15 10:44             ` Xia, Chenbo
2021-06-15 11:57             ` Jason Gunthorpe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210601030644.3318-7-chenbo.xia@intel.com \
    --to=chenbo.xia@intel.com \
    --cc=anatoly.burakov@intel.com \
    --cc=bruce.richardson@intel.com \
    --cc=cunming.liang@intel.com \
    --cc=david.marchand@redhat.com \
    --cc=dev@dpdk.org \
    --cc=ferruh.yigit@intel.com \
    --cc=jingjing.wu@intel.com \
    --cc=konstantin.ananyev@intel.com \
    --cc=mdr@ashroe.eu \
    --cc=nhorman@tuxdriver.com \
    --cc=stephen@networkplumber.org \
    --cc=thomas@monjalon.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.