From mboxrd@z Thu Jan 1 00:00:00 1970 From: Takeshi Yoshimura Subject: Re: [PATCH 18.11 v2] pci/vfio: allow mapping MSI-X BARs if kernel allows it Date: Thu, 2 Aug 2018 15:47:25 +0900 Message-ID: References: <35b0d3fcf990a064a94dc93d834d86352603de98.1533036456.git.anatoly.burakov@intel.com> Mime-Version: 1.0 Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable Cc: dev@dpdk.org, Jerin Jacob , thomas@monjalon.net To: Anatoly Burakov Return-path: Received: from mail-pg1-f193.google.com (mail-pg1-f193.google.com [209.85.215.193]) by dpdk.org (Postfix) with ESMTP id 5D6781B436 for ; Thu, 2 Aug 2018 08:47:27 +0200 (CEST) Received: by mail-pg1-f193.google.com with SMTP id a11-v6so659610pgw.6 for ; Wed, 01 Aug 2018 23:47:27 -0700 (PDT) In-Reply-To: <35b0d3fcf990a064a94dc93d834d86352603de98.1533036456.git.anatoly.burakov@intel.com> List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" 2018-07-31 20:28 GMT+09:00 Anatoly Burakov : > Currently, DPDK will skip mapping some areas (or even an entire BAR) > if MSI-X table happens to be in them but is smaller than page size. > > Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this > as a capability flag. Capability flags themselves are also only > supported since kernel 4.6 [2]. > > This commit will introduce support for checking VFIO capabilities, > and will use it to check if we are allowed to map BARs with MSI-X > tables in them, along with backwards compatibility for older > kernels, including a workaround for a variable rename in VFIO > region info structure [3]. > > [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/ > linux.git/commit/?id=3Da32295c612c57990d17fb0f41e7134394b2f35f6 > > [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/ > linux.git/commit/?id=3Dc84982adb23bcf3b99b79ca33527cd2625fbe279 > > [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/ > linux.git/commit/?id=3Dff63eb638d63b95e489f976428f1df01391e15e4 > > Signed-off-by: Anatoly Burakov > --- > > Notes: > v2->v1: > - Fix pointer in pci_vfio_get_region_info > - Fix commit message > > drivers/bus/pci/linux/pci_vfio.c | 127 ++++++++++++++++++++--- > lib/librte_eal/common/include/rte_vfio.h | 26 +++++ > 2 files changed, 140 insertions(+), 13 deletions(-) > > diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci= _vfio.c > index 686386d6a..24f665c20 100644 > --- a/drivers/bus/pci/linux/pci_vfio.c > +++ b/drivers/bus/pci/linux/pci_vfio.c > @@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci= _resource *vfio_res, > return 0; > } > > +/* > + * region info may contain capability headers, so we need to keep reallo= cating > + * the memory until we match allocated memory size with argsz. > + */ > +static int > +pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info= , > + int region) > +{ > + struct vfio_region_info *ri; > + size_t argsz =3D sizeof(*ri); > + int ret; > + > + ri =3D malloc(sizeof(*ri)); > + if (ri =3D=3D NULL) { > + RTE_LOG(ERR, EAL, "Cannot allocate memory for region info= \n"); > + return -1; > + } > +again: > + memset(ri, 0, argsz); > + ri->argsz =3D argsz; > + ri->index =3D region; > + > + ret =3D ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri); > + if (ret) { > + free(ri); > + return ret; > + } > + if (ri->argsz !=3D argsz) { > + argsz =3D ri->argsz; > + ri =3D realloc(ri, argsz); > + > + if (ri =3D=3D NULL) { > + RTE_LOG(ERR, EAL, "Cannot reallocate memory for r= egion info\n"); > + return -1; > + } > + goto again; > + } > + *info =3D ri; > + > + return 0; > +} > + > +static struct vfio_info_cap_header * > +pci_vfio_info_cap(struct vfio_region_info *info, int cap) > +{ > + struct vfio_info_cap_header *h; > + size_t offset; > + > + if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) =3D=3D 0) { > + /* VFIO info does not advertise capabilities */ > + return NULL; > + } > + > + offset =3D VFIO_CAP_OFFSET(info); > + while (offset !=3D 0) { > + h =3D RTE_PTR_ADD(info, offset); > + if (h->id =3D=3D cap) > + return h; > + offset =3D h->next; > + } > + return NULL; > +} > + > +static int > +pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region) > +{ > + struct vfio_region_info *info; > + int ret; > + > + ret =3D pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region)= ; > + if (ret < 0) > + return -1; > + > + ret =3D pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) !=3D = NULL; > + > + /* cleanup */ > + free(info); > + > + return ret; > +} > + > + > static int > pci_vfio_map_resource_primary(struct rte_pci_device *dev) > { > @@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device= *dev) > if (ret < 0) { > RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", > pci_addr); > - goto err_vfio_dev_fd; > + goto err_vfio_res; > + } > + /* if we found our MSI-X BAR region, check if we can mmap it */ > + if (vfio_res->msix_table.bar_index !=3D -1) { > + int ret =3D pci_vfio_msix_is_mappable(vfio_dev_fd, > + vfio_res->msix_table.bar_index); > + if (ret < 0) { > + RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is= mappable\n"); > + goto err_vfio_res; > + } else if (ret !=3D 0) { > + /* we can map it, so we don't care where it is */ > + RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as ma= ppable\n"); > + vfio_res->msix_table.bar_index =3D -1; > + } > } > > for (i =3D 0; i < (int) vfio_res->nb_maps; i++) { > - struct vfio_region_info reg =3D { .argsz =3D sizeof(reg) = }; > + struct vfio_region_info *reg; > void *bar_addr; > > - reg.index =3D i; > - > - ret =3D ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &= reg); > - if (ret) { > + ret =3D pci_vfio_get_region_info(vfio_dev_fd, ®, i); > + if (ret < 0) { > RTE_LOG(ERR, EAL, " %s cannot get device region = info " > - "error %i (%s)\n", pci_addr, errn= o, strerror(errno)); > + "error %i (%s)\n", pci_addr, errno, > + strerror(errno)); > goto err_vfio_res; > } > > /* chk for io port region */ > ret =3D pci_vfio_is_ioport_bar(vfio_dev_fd, i); > - if (ret < 0) > + if (ret < 0) { > + free(reg); > goto err_vfio_res; > - else if (ret) { > + } else if (ret) { > RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d= )\n", > i); > + free(reg); > continue; > } > > /* skip non-mmapable BARs */ > - if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) =3D=3D 0) > + if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) =3D=3D 0) { > + free(reg); > continue; > + } > > /* try mapping somewhere close to the end of hugepages */ > if (pci_map_addr =3D=3D NULL) > pci_map_addr =3D pci_find_max_end_va(); > > bar_addr =3D pci_map_addr; > - pci_map_addr =3D RTE_PTR_ADD(bar_addr, (size_t) reg.size)= ; > + pci_map_addr =3D RTE_PTR_ADD(bar_addr, (size_t) reg->size= ); > > maps[i].addr =3D bar_addr; > - maps[i].offset =3D reg.offset; > - maps[i].size =3D reg.size; > + maps[i].offset =3D reg->offset; > + maps[i].size =3D reg->size; > maps[i].path =3D NULL; /* vfio doesn't have per-resource = paths */ > > ret =3D pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); > if (ret < 0) { > RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\= n", > pci_addr, i, strerror(errno)); > + free(reg); > goto err_vfio_res; > } > > dev->mem_resource[i].addr =3D maps[i].addr; > + > + free(reg); > } > > if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) { > diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/co= mmon/include/rte_vfio.h > index 5ca13fcce..f6617e004 100644 > --- a/lib/librte_eal/common/include/rte_vfio.h > +++ b/lib/librte_eal/common/include/rte_vfio.h > @@ -14,6 +14,8 @@ > extern "C" { > #endif > > +#include > + > /* > * determine if VFIO is present on the system > */ > @@ -44,6 +46,30 @@ extern "C" { > #define RTE_VFIO_NOIOMMU 8 > #endif > > +/* > + * capabilities are only supported on kernel 4.6+. there were also some = API > + * changes as well, so add a macro to get cap offset. > + */ > +#ifdef VFIO_REGION_INFO_FLAG_CAPS > +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS > +#define VFIO_CAP_OFFSET(x) (x->cap_offset) > +#else > +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3) > +#define VFIO_CAP_OFFSET(x) (x->resv) > +struct vfio_info_cap_header { > + uint16_t id; > + uint16_t version; > + uint32_t next; > +}; > +#endif > + > +/* kernels 4.16+ can map BAR containing MSI-X table */ > +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE > +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE > +#else > +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3 > +#endif > + > #else /* not VFIO_PRESENT */ > > /* we don't need an actual definition, only pointer is used */ > -- > 2.17.1 Hi Anatoly, Please fix the error check for ioctl in pci_vfio_region_info() from "if (ret)" to "if (ret < 0)" My environment reported compiler errors with -Werror=3Dmaybe-uninitialized)= . dpdk/drivers/bus/pci/linux/pci_vfio.c: In function =E2=80=98pci_vfio_map_resource_primary=E2=80=99: dpdk/drivers/bus/pci/linux/pci_vfio.c:612:4: error: =E2=80=98reg=E2=80=99 m= ay be used uninitialized in this function [-Werror=3Dmaybe-uninitialized] free(reg); ^~~~~~~~~ dpdk/drivers/bus/pci/linux/pci_vfio.c:495:2: error: =E2=80=98info=E2=80=99 = may be used uninitialized in this function [-Werror=3Dmaybe-uninitialized] free(info); ^~~~~~~~~~ dpdk/drivers/bus/pci/linux/pci_vfio.c:485:27: note: =E2=80=98info=E2=80=99 = was declared here struct vfio_region_info *info; Other code looks good to me. I tested the updated patch with the above change and confirmed it could mmap BAR on my ppc64le machine. Thanks, Takeshi