All of lore.kernel.org
 help / color / mirror / Atom feed
From: Anatoly Burakov <anatoly.burakov@intel.com>
To: dev@dpdk.org
Cc: jerin.jacob@caviumnetworks.com, thomas@monjalon.net,
	t.yoshimura8869@gmail.com
Subject: [PATCH 18.11 v2] pci/vfio: allow mapping MSI-X BARs if kernel allows it
Date: Tue, 31 Jul 2018 12:28:46 +0100	[thread overview]
Message-ID: <35b0d3fcf990a064a94dc93d834d86352603de98.1533036456.git.anatoly.burakov@intel.com> (raw)
In-Reply-To: <b38b240d6b11f3f7dc6551b3d2232b5074a1202f.1532949065.git.anatoly.burakov@intel.com>

Currently, DPDK will skip mapping some areas (or even an entire BAR)
if MSI-X table happens to be in them but is smaller than page size.

Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this
as a capability flag. Capability flags themselves are also only
supported since kernel 4.6 [2].

This commit will introduce support for checking VFIO capabilities,
and will use it to check if we are allowed to map BARs with MSI-X
tables in them, along with backwards compatibility for older
kernels, including a workaround for a variable rename in VFIO
region info structure [3].

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6

[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279

[3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v2->v1:
    - Fix pointer in pci_vfio_get_region_info
    - Fix commit message

 drivers/bus/pci/linux/pci_vfio.c         | 127 ++++++++++++++++++++---
 lib/librte_eal/common/include/rte_vfio.h |  26 +++++
 2 files changed, 140 insertions(+), 13 deletions(-)

diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 686386d6a..24f665c20 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
 	return 0;
 }
 
+/*
+ * region info may contain capability headers, so we need to keep reallocating
+ * the memory until we match allocated memory size with argsz.
+ */
+static int
+pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
+		int region)
+{
+	struct vfio_region_info *ri;
+	size_t argsz = sizeof(*ri);
+	int ret;
+
+	ri = malloc(sizeof(*ri));
+	if (ri == NULL) {
+		RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
+		return -1;
+	}
+again:
+	memset(ri, 0, argsz);
+	ri->argsz = argsz;
+	ri->index = region;
+
+	ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
+	if (ret) {
+		free(ri);
+		return ret;
+	}
+	if (ri->argsz != argsz) {
+		argsz = ri->argsz;
+		ri = realloc(ri, argsz);
+
+		if (ri == NULL) {
+			RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n");
+			return -1;
+		}
+		goto again;
+	}
+	*info = ri;
+
+	return 0;
+}
+
+static struct vfio_info_cap_header *
+pci_vfio_info_cap(struct vfio_region_info *info, int cap)
+{
+	struct vfio_info_cap_header *h;
+	size_t offset;
+
+	if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
+		/* VFIO info does not advertise capabilities */
+		return NULL;
+	}
+
+	offset = VFIO_CAP_OFFSET(info);
+	while (offset != 0) {
+		h = RTE_PTR_ADD(info, offset);
+		if (h->id == cap)
+			return h;
+		offset = h->next;
+	}
+	return NULL;
+}
+
+static int
+pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
+{
+	struct vfio_region_info *info;
+	int ret;
+
+	ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
+	if (ret < 0)
+		return -1;
+
+	ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
+
+	/* cleanup */
+	free(info);
+
+	return ret;
+}
+
+
 static int
 pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 {
@@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 	if (ret < 0) {
 		RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
 				pci_addr);
-		goto err_vfio_dev_fd;
+		goto err_vfio_res;
+	}
+	/* if we found our MSI-X BAR region, check if we can mmap it */
+	if (vfio_res->msix_table.bar_index != -1) {
+		int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
+				vfio_res->msix_table.bar_index);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
+			goto err_vfio_res;
+		} else if (ret != 0) {
+			/* we can map it, so we don't care where it is */
+			RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
+			vfio_res->msix_table.bar_index = -1;
+		}
 	}
 
 	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
-		struct vfio_region_info reg = { .argsz = sizeof(reg) };
+		struct vfio_region_info *reg;
 		void *bar_addr;
 
-		reg.index = i;
-
-		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
-		if (ret) {
+		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
+		if (ret < 0) {
 			RTE_LOG(ERR, EAL, "  %s cannot get device region info "
-					"error %i (%s)\n", pci_addr, errno, strerror(errno));
+				"error %i (%s)\n", pci_addr, errno,
+				strerror(errno));
 			goto err_vfio_res;
 		}
 
 		/* chk for io port region */
 		ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
-		if (ret < 0)
+		if (ret < 0) {
+			free(reg);
 			goto err_vfio_res;
-		else if (ret) {
+		} else if (ret) {
 			RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
 					i);
+			free(reg);
 			continue;
 		}
 
 		/* skip non-mmapable BARs */
-		if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
+		if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
+			free(reg);
 			continue;
+		}
 
 		/* try mapping somewhere close to the end of hugepages */
 		if (pci_map_addr == NULL)
 			pci_map_addr = pci_find_max_end_va();
 
 		bar_addr = pci_map_addr;
-		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
+		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
 
 		maps[i].addr = bar_addr;
-		maps[i].offset = reg.offset;
-		maps[i].size = reg.size;
+		maps[i].offset = reg->offset;
+		maps[i].size = reg->size;
 		maps[i].path = NULL; /* vfio doesn't have per-resource paths */
 
 		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
 		if (ret < 0) {
 			RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
 					pci_addr, i, strerror(errno));
+			free(reg);
 			goto err_vfio_res;
 		}
 
 		dev->mem_resource[i].addr = maps[i].addr;
+
+		free(reg);
 	}
 
 	if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index 5ca13fcce..f6617e004 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -14,6 +14,8 @@
 extern "C" {
 #endif
 
+#include <stdint.h>
+
 /*
  * determine if VFIO is present on the system
  */
@@ -44,6 +46,30 @@ extern "C" {
 #define RTE_VFIO_NOIOMMU 8
 #endif
 
+/*
+ * capabilities are only supported on kernel 4.6+. there were also some API
+ * changes as well, so add a macro to get cap offset.
+ */
+#ifdef VFIO_REGION_INFO_FLAG_CAPS
+#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
+#define VFIO_CAP_OFFSET(x) (x->cap_offset)
+#else
+#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
+#define VFIO_CAP_OFFSET(x) (x->resv)
+struct vfio_info_cap_header {
+	uint16_t id;
+	uint16_t version;
+	uint32_t next;
+};
+#endif
+
+/* kernels 4.16+ can map BAR containing MSI-X table */
+#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#else
+#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
+#endif
+
 #else /* not VFIO_PRESENT */
 
 /* we don't need an actual definition, only pointer is used */
-- 
2.17.1

  parent reply	other threads:[~2018-07-31 11:29 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-07-30 11:17 [PATCH 18.11] pci/vfio: allow mapping MSI-X BARs if kernel allows it Anatoly Burakov
2018-07-31  9:38 ` Takeshi Yoshimura
2018-07-31 11:24   ` Burakov, Anatoly
2018-07-31 11:28 ` Anatoly Burakov [this message]
2018-08-02  6:47   ` [PATCH 18.11 v2] " Takeshi Yoshimura
2018-08-02  8:17     ` Burakov, Anatoly
2018-09-20 13:11   ` [PATCH v3] " Anatoly Burakov
2018-10-03 22:40     ` Thomas Monjalon

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=35b0d3fcf990a064a94dc93d834d86352603de98.1533036456.git.anatoly.burakov@intel.com \
    --to=anatoly.burakov@intel.com \
    --cc=dev@dpdk.org \
    --cc=jerin.jacob@caviumnetworks.com \
    --cc=t.yoshimura8869@gmail.com \
    --cc=thomas@monjalon.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.