From: Yishai Hadas <yishaih@nvidia.com>
To: <linux-rdma@vger.kernel.org>
Cc: <jgg@nvidia.com>, <yishaih@nvidia.com>, <maorg@nvidia.com>,
	<markzhang@nvidia.com>, <edwards@nvidia.com>
Subject: [PATCH rdma-core 06/27] mlx5: Setup mlx5 vfio context
Date: Tue, 20 Jul 2021 11:16:26 +0300
Message-ID: <20210720081647.1980-7-yishaih@nvidia.com>
In-Reply-To: <20210720081647.1980-1-yishaih@nvidia.com>

Set up the mlx5 vfio context by using the VFIO ioctl API to read the
device initialization segment and the other required device information.

As part of that, the applicable IOVA ranges are set up so that they are
ready for DMA usage, and the command interface is initialized.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
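For reviewers less familiar with the VFIO user API, the sketch below condenses
the container/group/device bring-up and the DMA mapping that
mlx5_vfio_open_fds() and mlx5_vfio_register_mem() perform in this patch. It is
an illustrative, minimal example only: the group path, PCI name and IOVA are
hypothetical placeholders, and most error handling is omitted.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical placeholders; the real values come from the device
 * discovery added earlier in this series.
 */
#define GROUP_PATH "/dev/vfio/26"
#define PCI_NAME   "0000:3b:00.0"

int main(void)
{
	struct vfio_group_status gstatus = { .argsz = sizeof(gstatus) };
	struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };
	static char buf[4096] __attribute__((aligned(4096)));
	int container, group, device;

	/* 1. New container; verify API version and type1 IOMMU support */
	container = open("/dev/vfio/vfio", O_RDWR);
	if (container < 0 ||
	    ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION ||
	    !ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
		return 1;

	/* 2. Open the group, check it is viable, attach it to the container */
	group = open(GROUP_PATH, O_RDWR);
	if (group < 0 || ioctl(group, VFIO_GROUP_GET_STATUS, &gstatus) ||
	    !(gstatus.flags & VFIO_GROUP_FLAGS_VIABLE))
		return 1;
	if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container) ||
	    ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU))
		return 1;

	/* 3. Get the device fd; BARs and config space are accessed via it */
	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, PCI_NAME);
	if (device < 0)
		return 1;

	/* 4. Make one page of host memory DMA-able at a chosen IOVA */
	map.vaddr = (uintptr_t)buf;
	map.iova = 0x10000;
	map.size = sizeof(buf);
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
		return 1;

	printf("device fd %d ready, 4K DMA window at IOVA 0x%llx\n",
	       device, (unsigned long long)map.iova);
	return 0;
}

In the patch itself the IOVA is not hard coded: it comes from the interval-set
allocator, which is seeded from the ranges reported by VFIO_IOMMU_GET_INFO (or
from the legacy ranges when that capability is absent), and the mapped pages
back the command queue, mailboxes and page blocks.
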
 providers/mlx5/mlx5.h      |   2 +
 providers/mlx5/mlx5_vfio.c | 740 +++++++++++++++++++++++++++++++++++++++++++++
 providers/mlx5/mlx5_vfio.h | 134 ++++++++
 3 files changed, 876 insertions(+)

diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h
index 7436bc8..7e7d70d 100644
--- a/providers/mlx5/mlx5.h
+++ b/providers/mlx5/mlx5.h
@@ -72,6 +72,7 @@ enum {
 
 enum {
 	MLX5_ADAPTER_PAGE_SIZE		= 4096,
+	MLX5_ADAPTER_PAGE_SHIFT		= 12,
 };
 
 #define MLX5_CQ_PREFIX "MLX_CQ"
@@ -90,6 +91,7 @@ enum {
 	MLX5_DBG_CQ_CQE		= 1 << 4,
 	MLX5_DBG_CONTIG		= 1 << 5,
 	MLX5_DBG_DR		= 1 << 6,
+	MLX5_DBG_VFIO		= 1 << 7,
 };
 
 extern uint32_t mlx5_debug_mask;
diff --git a/providers/mlx5/mlx5_vfio.c b/providers/mlx5/mlx5_vfio.c
index 69c7662..86f14f1 100644
--- a/providers/mlx5/mlx5_vfio.c
+++ b/providers/mlx5/mlx5_vfio.c
@@ -15,15 +15,755 @@
 #include <sys/mman.h>
 #include <string.h>
 #include <sys/param.h>
+#include <linux/vfio.h>
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
 
 #include "mlx5dv.h"
 #include "mlx5_vfio.h"
 #include "mlx5.h"
 
+static void mlx5_vfio_free_cmd_msg(struct mlx5_vfio_context *ctx,
+				   struct mlx5_cmd_msg *msg);
+
+static int mlx5_vfio_alloc_cmd_msg(struct mlx5_vfio_context *ctx,
+				   uint32_t size, struct mlx5_cmd_msg *msg);
+
+static int mlx5_vfio_register_mem(struct mlx5_vfio_context *ctx,
+				  void *vaddr, uint64_t iova, uint64_t size)
+{
+	struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };
+
+	dma_map.vaddr = (uintptr_t)vaddr;
+	dma_map.size = size;
+	dma_map.iova = iova;
+	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+	return ioctl(ctx->container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+}
+
+static void mlx5_vfio_unregister_mem(struct mlx5_vfio_context *ctx,
+				     uint64_t iova, uint64_t size)
+{
+	struct vfio_iommu_type1_dma_unmap dma_unmap = {};
+
+	dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+	dma_unmap.size = size;
+	dma_unmap.iova = iova;
+
+	if (ioctl(ctx->container_fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap))
+		assert(false);
+}
+
+static struct page_block *mlx5_vfio_new_block(struct mlx5_vfio_context *ctx)
+{
+	struct page_block *page_block;
+	int err;
+
+	page_block = calloc(1, sizeof(*page_block));
+	if (!page_block) {
+		errno = ENOMEM;
+		return NULL;
+	}
+
+	err = posix_memalign(&page_block->page_ptr, MLX5_VFIO_BLOCK_SIZE,
+			     MLX5_VFIO_BLOCK_SIZE);
+	if (err) {
+		errno = err;
+		goto err;
+	}
+
+	err = iset_alloc_range(ctx->iova_alloc, MLX5_VFIO_BLOCK_SIZE, &page_block->iova);
+	if (err)
+		goto err_range;
+
+	bitmap_fill(page_block->free_pages, MLX5_VFIO_BLOCK_NUM_PAGES);
+	err = mlx5_vfio_register_mem(ctx, page_block->page_ptr, page_block->iova,
+				     MLX5_VFIO_BLOCK_SIZE);
+	if (err)
+		goto err_reg;
+
+	list_add(&ctx->mem_alloc.block_list, &page_block->next_block);
+	return page_block;
+
+err_reg:
+	iset_insert_range(ctx->iova_alloc, page_block->iova,
+			  MLX5_VFIO_BLOCK_SIZE);
+err_range:
+	free(page_block->page_ptr);
+err:
+	free(page_block);
+	return NULL;
+}
+
+static void mlx5_vfio_free_block(struct mlx5_vfio_context *ctx,
+				 struct page_block *page_block)
+{
+	mlx5_vfio_unregister_mem(ctx, page_block->iova, MLX5_VFIO_BLOCK_SIZE);
+	iset_insert_range(ctx->iova_alloc, page_block->iova, MLX5_VFIO_BLOCK_SIZE);
+	list_del(&page_block->next_block);
+	free(page_block->page_ptr);
+	free(page_block);
+}
+
+static int mlx5_vfio_alloc_page(struct mlx5_vfio_context *ctx, uint64_t *iova)
+{
+	struct page_block *page_block;
+	unsigned long pg;
+	int ret = 0;
+
+	pthread_mutex_lock(&ctx->mem_alloc.block_list_mutex);
+	while (true) {
+		list_for_each(&ctx->mem_alloc.block_list, page_block, next_block) {
+			pg = bitmap_ffs(page_block->free_pages, 0, MLX5_VFIO_BLOCK_NUM_PAGES);
+			if (pg != MLX5_VFIO_BLOCK_NUM_PAGES) {
+				bitmap_clear_bit(page_block->free_pages, pg);
+				*iova = page_block->iova + pg * MLX5_ADAPTER_PAGE_SIZE;
+				goto end;
+			}
+		}
+		if (!mlx5_vfio_new_block(ctx)) {
+			ret = -1;
+			goto end;
+		}
+	}
+end:
+	pthread_mutex_unlock(&ctx->mem_alloc.block_list_mutex);
+	return ret;
+}
+
+static void mlx5_vfio_free_page(struct mlx5_vfio_context *ctx, uint64_t iova)
+{
+	struct page_block *page_block;
+	unsigned long pg;
+
+	pthread_mutex_lock(&ctx->mem_alloc.block_list_mutex);
+	list_for_each(&ctx->mem_alloc.block_list, page_block, next_block) {
+		if (page_block->iova > iova ||
+		    (page_block->iova + MLX5_VFIO_BLOCK_SIZE <= iova))
+			continue;
+
+		pg = (iova - page_block->iova) / MLX5_ADAPTER_PAGE_SIZE;
+		assert(!bitmap_test_bit(page_block->free_pages, pg));
+		bitmap_set_bit(page_block->free_pages, pg);
+		if (bitmap_full(page_block->free_pages, MLX5_VFIO_BLOCK_NUM_PAGES))
+			mlx5_vfio_free_block(ctx, page_block);
+
+		goto end;
+	}
+
+	assert(false);
+end:
+	pthread_mutex_unlock(&ctx->mem_alloc.block_list_mutex);
+}
+
+static int mlx5_vfio_enable_pci_cmd(struct mlx5_vfio_context *ctx)
+{
+	struct vfio_region_info pci_config_reg = {};
+	uint16_t pci_com_buf = 0x6;
+	char buffer[4096];
+
+	pci_config_reg.argsz = sizeof(pci_config_reg);
+	pci_config_reg.index = VFIO_PCI_CONFIG_REGION_INDEX;
+
+	if (ioctl(ctx->device_fd, VFIO_DEVICE_GET_REGION_INFO, &pci_config_reg))
+		return -1;
+
+	if (pwrite(ctx->device_fd, &pci_com_buf, 2, pci_config_reg.offset + 0x4) != 2)
+		return -1;
+
+	if (pread(ctx->device_fd, buffer, pci_config_reg.size, pci_config_reg.offset)
+			!= pci_config_reg.size)
+		return -1;
+
+	return 0;
+}
+
+static void free_cmd_box(struct mlx5_vfio_context *ctx,
+			 struct mlx5_cmd_mailbox *mailbox)
+{
+	mlx5_vfio_unregister_mem(ctx, mailbox->iova, MLX5_ADAPTER_PAGE_SIZE);
+	iset_insert_range(ctx->iova_alloc, mailbox->iova, MLX5_ADAPTER_PAGE_SIZE);
+	free(mailbox->buf);
+	free(mailbox);
+}
+
+static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_vfio_context *ctx)
+{
+	struct mlx5_cmd_mailbox *mailbox;
+	int ret;
+
+	mailbox = calloc(1, sizeof(*mailbox));
+	if (!mailbox) {
+		errno = ENOMEM;
+		return NULL;
+	}
+
+	ret = posix_memalign(&mailbox->buf, MLX5_ADAPTER_PAGE_SIZE,
+			     MLX5_ADAPTER_PAGE_SIZE);
+	if (ret) {
+		errno = ret;
+		goto err_free;
+	}
+
+	memset(mailbox->buf, 0, MLX5_ADAPTER_PAGE_SIZE);
+
+	ret = iset_alloc_range(ctx->iova_alloc, MLX5_ADAPTER_PAGE_SIZE, &mailbox->iova);
+	if (ret)
+		goto err_tree;
+
+	ret = mlx5_vfio_register_mem(ctx, mailbox->buf, mailbox->iova,
+				     MLX5_ADAPTER_PAGE_SIZE);
+	if (ret)
+		goto err_reg;
+
+	return mailbox;
+
+err_reg:
+	iset_insert_range(ctx->iova_alloc, mailbox->iova,
+			  MLX5_ADAPTER_PAGE_SIZE);
+err_tree:
+	free(mailbox->buf);
+err_free:
+	free(mailbox);
+	return NULL;
+}
+
+static int mlx5_calc_cmd_blocks(uint32_t msg_len)
+{
+	int size = msg_len;
+	int blen = size - min_t(int, 16, size);
+
+	return DIV_ROUND_UP(blen, MLX5_CMD_DATA_BLOCK_SIZE);
+}
+
+static void mlx5_vfio_free_cmd_msg(struct mlx5_vfio_context *ctx,
+				   struct mlx5_cmd_msg *msg)
+{
+	struct mlx5_cmd_mailbox *head = msg->next;
+	struct mlx5_cmd_mailbox *next;
+
+	while (head) {
+		next = head->next;
+		free_cmd_box(ctx, head);
+		head = next;
+	}
+	msg->len = 0;
+}
+
+static int mlx5_vfio_alloc_cmd_msg(struct mlx5_vfio_context *ctx,
+				   uint32_t size, struct mlx5_cmd_msg *msg)
+{
+	struct mlx5_cmd_mailbox *tmp, *head = NULL;
+	struct mlx5_cmd_block *block;
+	int i, num_blocks;
+
+	msg->len = size;
+	num_blocks = mlx5_calc_cmd_blocks(size);
+
+	for (i = 0; i < num_blocks; i++) {
+		tmp = alloc_cmd_box(ctx);
+		if (!tmp)
+			goto err_alloc;
+
+		block = tmp->buf;
+		tmp->next = head;
+		block->next = htobe64(tmp->next ? tmp->next->iova : 0);
+		block->block_num = htobe32(num_blocks - i - 1);
+		head = tmp;
+	}
+	msg->next = head;
+	return 0;
+
+err_alloc:
+	while (head) {
+		tmp = head->next;
+		free_cmd_box(ctx, head);
+		head = tmp;
+	}
+	msg->len = 0;
+	return -1;
+}
+
+static void mlx5_vfio_free_cmd_slot(struct mlx5_vfio_context *ctx, int slot)
+{
+	struct mlx5_vfio_cmd_slot *cmd_slot = &ctx->cmd.cmds[slot];
+
+	mlx5_vfio_free_cmd_msg(ctx, &cmd_slot->in);
+	mlx5_vfio_free_cmd_msg(ctx, &cmd_slot->out);
+	close(cmd_slot->completion_event_fd);
+}
+
+static int mlx5_vfio_setup_cmd_slot(struct mlx5_vfio_context *ctx, int slot)
+{
+	struct mlx5_vfio_cmd *cmd = &ctx->cmd;
+	struct mlx5_vfio_cmd_slot *cmd_slot = &cmd->cmds[slot];
+	struct mlx5_cmd_layout *cmd_lay;
+	int ret;
+
+	ret = mlx5_vfio_alloc_cmd_msg(ctx, 4096, &cmd_slot->in);
+	if (ret)
+		return ret;
+
+	ret = mlx5_vfio_alloc_cmd_msg(ctx, 4096, &cmd_slot->out);
+	if (ret)
+		goto err;
+
+	cmd_lay = cmd->vaddr + (slot * (1 << cmd->log_stride));
+	cmd_lay->type = MLX5_PCI_CMD_XPORT;
+	cmd_lay->iptr = htobe64(cmd_slot->in.next->iova);
+	cmd_lay->optr = htobe64(cmd_slot->out.next->iova);
+
+	cmd_slot->lay = cmd_lay;
+	cmd_slot->completion_event_fd = eventfd(0, EFD_CLOEXEC);
+	if (cmd_slot->completion_event_fd < 0) {
+		ret = -1;
+		goto err_fd;
+	}
+
+	pthread_mutex_init(&cmd_slot->lock, NULL);
+
+	return 0;
+
+err_fd:
+	mlx5_vfio_free_cmd_msg(ctx, &cmd_slot->out);
+err:
+	mlx5_vfio_free_cmd_msg(ctx, &cmd_slot->in);
+	return ret;
+}
+
+static int mlx5_vfio_init_cmd_interface(struct mlx5_vfio_context *ctx)
+{
+	struct mlx5_init_seg *init_seg = ctx->bar_map;
+	struct mlx5_vfio_cmd *cmd = &ctx->cmd;
+	uint16_t cmdif_rev;
+	uint32_t cmd_h, cmd_l;
+	int ret;
+
+	cmdif_rev = be32toh(init_seg->cmdif_rev_fw_sub) >> 16;
+
+	if (cmdif_rev != 5) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	cmd_l = be32toh(init_seg->cmdq_addr_l_sz) & 0xff;
+	ctx->cmd.log_sz = cmd_l >> 4 & 0xf;
+	ctx->cmd.log_stride = cmd_l & 0xf;
+	if (1 << ctx->cmd.log_sz > MLX5_MAX_COMMANDS) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (ctx->cmd.log_sz + ctx->cmd.log_stride > MLX5_ADAPTER_PAGE_SHIFT) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* The initial address must be 4K aligned */
+	ret = posix_memalign(&cmd->vaddr, MLX5_ADAPTER_PAGE_SIZE,
+			     MLX5_ADAPTER_PAGE_SIZE);
+	if (ret) {
+		errno = ret;
+		return -1;
+	}
+
+	memset(cmd->vaddr, 0, MLX5_ADAPTER_PAGE_SIZE);
+
+	ret = iset_alloc_range(ctx->iova_alloc, MLX5_ADAPTER_PAGE_SIZE, &cmd->iova);
+	if (ret)
+		goto err_free;
+
+	ret = mlx5_vfio_register_mem(ctx, cmd->vaddr, cmd->iova, MLX5_ADAPTER_PAGE_SIZE);
+	if (ret)
+		goto err_reg;
+
+	cmd_h = (uint32_t)((uint64_t)(cmd->iova) >> 32);
+	cmd_l = (uint32_t)(uint64_t)(cmd->iova);
+
+	init_seg->cmdq_addr_h = htobe32(cmd_h);
+	init_seg->cmdq_addr_l_sz = htobe32(cmd_l);
+
+	/* Make sure firmware sees the complete address before we proceed */
+	udma_to_device_barrier();
+
+	ret = mlx5_vfio_setup_cmd_slot(ctx, 0);
+	if (ret)
+		goto err_slot_0;
+
+	ret = mlx5_vfio_setup_cmd_slot(ctx, MLX5_MAX_COMMANDS - 1);
+	if (ret)
+		goto err_slot_1;
+
+	ret = mlx5_vfio_enable_pci_cmd(ctx);
+	if (!ret)
+		return 0;
+
+	mlx5_vfio_free_cmd_slot(ctx, MLX5_MAX_COMMANDS - 1);
+err_slot_1:
+	mlx5_vfio_free_cmd_slot(ctx, 0);
+err_slot_0:
+	mlx5_vfio_unregister_mem(ctx, cmd->iova, MLX5_ADAPTER_PAGE_SIZE);
+err_reg:
+	iset_insert_range(ctx->iova_alloc, cmd->iova, MLX5_ADAPTER_PAGE_SIZE);
+err_free:
+	free(cmd->vaddr);
+	return ret;
+}
+
+static void mlx5_vfio_clean_cmd_interface(struct mlx5_vfio_context *ctx)
+{
+	struct mlx5_vfio_cmd *cmd = &ctx->cmd;
+
+	mlx5_vfio_free_cmd_slot(ctx, 0);
+	mlx5_vfio_free_cmd_slot(ctx, MLX5_MAX_COMMANDS - 1);
+	mlx5_vfio_unregister_mem(ctx, cmd->iova, MLX5_ADAPTER_PAGE_SIZE);
+	iset_insert_range(ctx->iova_alloc, cmd->iova, MLX5_ADAPTER_PAGE_SIZE);
+	free(cmd->vaddr);
+}
+
+static void set_iova_min_page_size(struct mlx5_vfio_context *ctx,
+				   uint64_t iova_pgsizes)
+{
+	int i;
+
+	for (i = MLX5_ADAPTER_PAGE_SHIFT; i < 64; i++) {
+		if (iova_pgsizes & (1ULL << i)) {
+			ctx->iova_min_page_size = 1ULL << i;
+			return;
+		}
+	}
+
+	assert(false);
+}
+
+/* if the kernel does not report usable IOVA regions, choose the legacy region */
+#define MLX5_VFIO_IOVA_MIN1 0x10000ULL
+#define MLX5_VFIO_IOVA_MAX1 0xFEDFFFFFULL
+#define MLX5_VFIO_IOVA_MIN2 0xFEF00000ULL
+#define MLX5_VFIO_IOVA_MAX2 ((1ULL << 39) - 1)
+
+static int mlx5_vfio_get_iommu_info(struct mlx5_vfio_context *ctx)
+{
+	struct vfio_iommu_type1_info *info;
+	int ret, i;
+	void *ptr;
+	uint32_t offset;
+
+	info = calloc(1, sizeof(*info));
+	if (!info) {
+		errno = ENOMEM;
+		return -1;
+	}
+
+	info->argsz = sizeof(*info);
+	ret = ioctl(ctx->container_fd, VFIO_IOMMU_GET_INFO, info);
+	if (ret)
+		goto end;
+
+	if (info->argsz > sizeof(*info)) {
+		ptr = realloc(info, info->argsz);
+		if (!ptr) {
+			errno = ENOMEM;
+			ret = -1;
+			goto end;
+		}
+		info = ptr;
+		ret = ioctl(ctx->container_fd, VFIO_IOMMU_GET_INFO, info);
+		if (ret)
+			goto end;
+	}
+
+	set_iova_min_page_size(ctx, (info->flags & VFIO_IOMMU_INFO_PGSIZES) ?
+			       info->iova_pgsizes : 4096);
+
+	if (!(info->flags & VFIO_IOMMU_INFO_CAPS))
+		goto set_legacy;
+
+	offset = info->cap_offset;
+	while (offset) {
+		struct vfio_iommu_type1_info_cap_iova_range *iova_range;
+		struct vfio_info_cap_header *header;
+
+		ptr = (void *)info + offset;
+		header = ptr;
+
+		if (header->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
+			offset = header->next;
+			continue;
+		}
+
+		iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)header;
+
+		for (i = 0; i < iova_range->nr_iovas; i++) {
+			mlx5_dbg(ctx->dbg_fp, MLX5_DBG_VFIO, "\t%02d: %016llx - %016llx\n", i,
+				 iova_range->iova_ranges[i].start,
+				 iova_range->iova_ranges[i].end);
+			ret = iset_insert_range(ctx->iova_alloc, iova_range->iova_ranges[i].start,
+						iova_range->iova_ranges[i].end -
+						iova_range->iova_ranges[i].start + 1);
+			if (ret)
+				goto end;
+		}
+
+		goto end;
+	}
+
+set_legacy:
+	ret = iset_insert_range(ctx->iova_alloc, MLX5_VFIO_IOVA_MIN1,
+				MLX5_VFIO_IOVA_MAX1 - MLX5_VFIO_IOVA_MIN1 + 1);
+	if (!ret)
+		ret = iset_insert_range(ctx->iova_alloc, MLX5_VFIO_IOVA_MIN2,
+					MLX5_VFIO_IOVA_MAX2 - MLX5_VFIO_IOVA_MIN2 + 1);
+
+end:
+	free(info);
+	return ret;
+}
+
+static void mlx5_vfio_clean_device_dma(struct mlx5_vfio_context *ctx)
+{
+	struct page_block *page_block, *tmp;
+
+	list_for_each_safe(&ctx->mem_alloc.block_list, page_block,
+			   tmp, next_block)
+		mlx5_vfio_free_block(ctx, page_block);
+
+	iset_destroy(ctx->iova_alloc);
+}
+
+static int mlx5_vfio_init_device_dma(struct mlx5_vfio_context *ctx)
+{
+	ctx->iova_alloc = iset_create();
+	if (!ctx->iova_alloc)
+		return -1;
+
+	list_head_init(&ctx->mem_alloc.block_list);
+	pthread_mutex_init(&ctx->mem_alloc.block_list_mutex, NULL);
+
+	if (mlx5_vfio_get_iommu_info(ctx))
+		goto err;
+
+	/* create an initial block of DMA memory ready to be used */
+	if (!mlx5_vfio_new_block(ctx))
+		goto err;
+
+	return 0;
+err:
+	iset_destroy(ctx->iova_alloc);
+	return -1;
+}
+
+static void mlx5_vfio_uninit_bar0(struct mlx5_vfio_context *ctx)
+{
+	munmap(ctx->bar_map, ctx->bar_map_size);
+}
+
+static int mlx5_vfio_init_bar0(struct mlx5_vfio_context *ctx)
+{
+	struct vfio_region_info reg = { .argsz = sizeof(reg) };
+	void *base;
+	int err;
+
+	reg.index = 0;
+	err = ioctl(ctx->device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
+	if (err)
+		return err;
+
+	base = mmap(NULL, reg.size, PROT_READ | PROT_WRITE, MAP_SHARED,
+		    ctx->device_fd, reg.offset);
+	if (base == MAP_FAILED)
+		return -1;
+
+	ctx->bar_map = (struct mlx5_init_seg *)base;
+	ctx->bar_map_size = reg.size;
+	return 0;
+}
+
+#define MLX5_VFIO_MAX_INTR_VEC_ID 1
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
+			      sizeof(int) * (MLX5_VFIO_MAX_INTR_VEC_ID))
+
+/* enable MSI-X interrupts */
+static int
+mlx5_vfio_enable_msix(struct mlx5_vfio_context *ctx)
+{
+	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
+	struct vfio_irq_set *irq_set;
+	int len;
+	int *fd_ptr;
+
+	len = sizeof(irq_set_buf);
+
+	irq_set = (struct vfio_irq_set *)irq_set_buf;
+	irq_set->argsz = len;
+	irq_set->count = 1;
+	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+	irq_set->start = 0;
+	fd_ptr = (int *)&irq_set->data;
+	fd_ptr[0] = ctx->cmd_comp_fd;
+
+	return ioctl(ctx->device_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+}
+
+static int mlx5_vfio_init_async_fd(struct mlx5_vfio_context *ctx)
+{
+	struct vfio_irq_info irq = { .argsz = sizeof(irq) };
+
+	irq.index = VFIO_PCI_MSIX_IRQ_INDEX;
+	if (ioctl(ctx->device_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq))
+		return -1;
+
+	/* fail if this vector cannot be used with eventfd */
+	if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0)
+		return -1;
+
+	/* set up an eventfd for command completion interrupts */
+	ctx->cmd_comp_fd = eventfd(0, EFD_CLOEXEC);
+	if (ctx->cmd_comp_fd < 0)
+		return -1;
+
+	if (mlx5_vfio_enable_msix(ctx))
+		goto err_msix;
+
+	return 0;
+
+err_msix:
+	close(ctx->cmd_comp_fd);
+	return -1;
+}
+
+static void mlx5_vfio_close_fds(struct mlx5_vfio_context *ctx)
+{
+	close(ctx->device_fd);
+	close(ctx->container_fd);
+	close(ctx->group_fd);
+	close(ctx->cmd_comp_fd);
+}
+
+static int mlx5_vfio_open_fds(struct mlx5_vfio_context *ctx,
+			      struct mlx5_vfio_device *mdev)
+{
+	struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
+
+	/* Create a new container */
+	ctx->container_fd = open("/dev/vfio/vfio", O_RDWR);
+
+	if (ctx->container_fd < 0)
+		return -1;
+
+	if (ioctl(ctx->container_fd, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
+		goto close_cont;
+
+	if (!ioctl(ctx->container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
+		/* Doesn't support the IOMMU driver we want. */
+		goto close_cont;
+
+	/* Open the group */
+	ctx->group_fd = open(mdev->vfio_path, O_RDWR);
+	if (ctx->group_fd < 0)
+		goto close_cont;
+
+	/* Test the group is viable and available */
+	if (ioctl(ctx->group_fd, VFIO_GROUP_GET_STATUS, &group_status))
+		goto close_group;
+
+	if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+		/* Group is not viable (ie, not all devices bound for vfio) */
+		errno = EINVAL;
+		goto close_group;
+	}
+
+	/* Add the group to the container */
+	if (ioctl(ctx->group_fd, VFIO_GROUP_SET_CONTAINER, &ctx->container_fd))
+		goto close_group;
+
+	/* Enable the IOMMU model we want */
+	if (ioctl(ctx->container_fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU))
+		goto close_group;
+
+	/* Get a file descriptor for the device */
+	ctx->device_fd = ioctl(ctx->group_fd, VFIO_GROUP_GET_DEVICE_FD,
+			       mdev->pci_name);
+	if (ctx->device_fd < 0)
+		goto close_group;
+
+	if (mlx5_vfio_init_async_fd(ctx))
+		goto close_group;
+
+	return 0;
+
+close_group:
+	close(ctx->group_fd);
+close_cont:
+	close(ctx->container_fd);
+	return -1;
+}
+
+static void mlx5_vfio_uninit_context(struct mlx5_vfio_context *ctx)
+{
+	mlx5_close_debug_file(ctx->dbg_fp);
+
+	verbs_uninit_context(&ctx->vctx);
+	free(ctx);
+}
+
+static void mlx5_vfio_free_context(struct ibv_context *ibctx)
+{
+	struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
+
+	mlx5_vfio_clean_cmd_interface(ctx);
+	mlx5_vfio_clean_device_dma(ctx);
+	mlx5_vfio_uninit_bar0(ctx);
+	mlx5_vfio_close_fds(ctx);
+	mlx5_vfio_uninit_context(ctx);
+}
+
+static const struct verbs_context_ops mlx5_vfio_common_ops = {
+	.free_context = mlx5_vfio_free_context,
+};
+
 static struct verbs_context *
 mlx5_vfio_alloc_context(struct ibv_device *ibdev,
 			int cmd_fd, void *private_data)
 {
+	struct mlx5_vfio_device *mdev = to_mvfio_dev(ibdev);
+	struct mlx5_vfio_context *mctx;
+
+	cmd_fd = -1;
+
+	mctx = verbs_init_and_alloc_context(ibdev, cmd_fd, mctx, vctx,
+					    RDMA_DRIVER_UNKNOWN);
+	if (!mctx)
+		return NULL;
+
+	mlx5_open_debug_file(&mctx->dbg_fp);
+	mlx5_set_debug_mask();
+
+	if (mlx5_vfio_open_fds(mctx, mdev))
+		goto err;
+
+	if (mlx5_vfio_init_bar0(mctx))
+		goto close_fds;
+
+	if (mlx5_vfio_init_device_dma(mctx))
+		goto err_bar;
+
+	if (mlx5_vfio_init_cmd_interface(mctx))
+		goto err_dma;
+
+	verbs_set_ops(&mctx->vctx, &mlx5_vfio_common_ops);
+	return &mctx->vctx;
+
+err_dma:
+	mlx5_vfio_clean_device_dma(mctx);
+err_bar:
+	mlx5_vfio_uninit_bar0(mctx);
+close_fds:
+	mlx5_vfio_close_fds(mctx);
+err:
+	mlx5_vfio_uninit_context(mctx);
 	return NULL;
 }
 
diff --git a/providers/mlx5/mlx5_vfio.h b/providers/mlx5/mlx5_vfio.h
index 6ba4254..392ddcb 100644
--- a/providers/mlx5/mlx5_vfio.h
+++ b/providers/mlx5/mlx5_vfio.h
@@ -8,8 +8,21 @@
 
 #include <stddef.h>
 #include <stdio.h>
+#include "mlx5.h"
 
 #include <infiniband/driver.h>
+#include <util/interval_set.h>
+
+enum {
+	MLX5_MAX_COMMANDS = 32,
+	MLX5_CMD_DATA_BLOCK_SIZE = 512,
+	MLX5_PCI_CMD_XPORT = 7,
+};
+
+enum {
+	MLX5_VFIO_BLOCK_SIZE = 2 * 1024 * 1024,
+	MLX5_VFIO_BLOCK_NUM_PAGES = MLX5_VFIO_BLOCK_SIZE / MLX5_ADAPTER_PAGE_SIZE,
+};
 
 struct mlx5_vfio_device {
 	struct verbs_device vdev;
@@ -19,9 +32,130 @@ struct mlx5_vfio_device {
 	uint32_t flags;
 };
 
+struct health_buffer {
+	__be32		assert_var[5];
+	__be32		rsvd0[3];
+	__be32		assert_exit_ptr;
+	__be32		assert_callra;
+	__be32		rsvd1[2];
+	__be32		fw_ver;
+	__be32		hw_id;
+	__be32		rfr;
+	uint8_t		irisc_index;
+	uint8_t		synd;
+	__be16		ext_synd;
+};
+
+struct mlx5_init_seg {
+	__be32			fw_rev;
+	__be32			cmdif_rev_fw_sub;
+	__be32			rsvd0[2];
+	__be32			cmdq_addr_h;
+	__be32			cmdq_addr_l_sz;
+	__be32			cmd_dbell;
+	__be32			rsvd1[120];
+	__be32			initializing;
+	struct health_buffer	health;
+	__be32			rsvd2[880];
+	__be32			internal_timer_h;
+	__be32			internal_timer_l;
+	__be32			rsvd3[2];
+	__be32			health_counter;
+	__be32			rsvd4[1019];
+	__be64			ieee1588_clk;
+	__be32			ieee1588_clk_type;
+	__be32			clr_intx;
+};
+
+struct mlx5_cmd_layout {
+	uint8_t		type;
+	uint8_t		rsvd0[3];
+	__be32		ilen;
+	__be64		iptr;
+	__be32		in[4];
+	__be32		out[4];
+	__be64		optr;
+	__be32		olen;
+	uint8_t		token;
+	uint8_t		sig;
+	uint8_t		rsvd1;
+	uint8_t		status_own;
+};
+
+struct mlx5_cmd_block {
+	uint8_t		data[MLX5_CMD_DATA_BLOCK_SIZE];
+	uint8_t		rsvd0[48];
+	__be64		next;
+	__be32		block_num;
+	uint8_t		rsvd1;
+	uint8_t		token;
+	uint8_t		ctrl_sig;
+	uint8_t		sig;
+};
+
+struct page_block {
+	void *page_ptr;
+	uint64_t iova;
+	struct list_node next_block;
+	BITMAP_DECLARE(free_pages, MLX5_VFIO_BLOCK_NUM_PAGES);
+};
+
+struct vfio_mem_allocator {
+	struct list_head block_list;
+	pthread_mutex_t block_list_mutex;
+};
+
+struct mlx5_cmd_mailbox {
+	void *buf;
+	uint64_t iova;
+	struct mlx5_cmd_mailbox *next;
+};
+
+struct mlx5_cmd_msg {
+	uint32_t len;
+	struct mlx5_cmd_mailbox *next;
+};
+
+struct mlx5_vfio_cmd_slot {
+	struct mlx5_cmd_layout *lay;
+	struct mlx5_cmd_msg in;
+	struct mlx5_cmd_msg out;
+	pthread_mutex_t lock;
+	int completion_event_fd;
+};
+
+struct mlx5_vfio_cmd {
+	void *vaddr; /* cmd page address */
+	uint64_t iova;
+	uint8_t log_sz;
+	uint8_t log_stride;
+	struct mlx5_vfio_cmd_slot cmds[MLX5_MAX_COMMANDS];
+};
+
+struct mlx5_vfio_context {
+	struct verbs_context vctx;
+	int container_fd;
+	int group_fd;
+	int device_fd;
+	int cmd_comp_fd; /* command completion FD */
+	struct iset *iova_alloc;
+	uint64_t iova_min_page_size;
+	FILE *dbg_fp;
+	struct vfio_mem_allocator mem_alloc;
+	struct mlx5_init_seg *bar_map;
+	size_t bar_map_size;
+	struct mlx5_vfio_cmd cmd;
+	bool have_eq;
+};
+
 static inline struct mlx5_vfio_device *to_mvfio_dev(struct ibv_device *ibdev)
 {
 	return container_of(ibdev, struct mlx5_vfio_device, vdev.device);
 }
 
+static inline struct mlx5_vfio_context *to_mvfio_ctx(struct ibv_context *ibctx)
+{
+	return container_of(ibctx, struct mlx5_vfio_context, vctx.context);
+}
+
 #endif
-- 
1.8.3.1


Thread overview: 36+ messages
2021-07-20  8:16 [PATCH rdma-core 00/27] Introduce mlx5 user space driver over VFIO Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 01/27] Update kernel headers Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 02/27] mlx5: Introduce mlx5dv_get_vfio_device_list() Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 03/27] mlx5: Enable debug functionality for vfio Yishai Hadas
2021-07-20  8:51   ` Leon Romanovsky
2021-07-20  9:27     ` Yishai Hadas
2021-07-20 12:27       ` Leon Romanovsky
2021-07-20 14:57         ` Yishai Hadas
2021-07-21  7:05           ` Gal Pressman
2021-07-21  7:58             ` Yishai Hadas
2021-07-21  8:51               ` Gal Pressman
2021-07-20  8:16 ` [PATCH rdma-core 04/27] util: Add interval_set support Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 05/27] verbs: Enable verbs_open_device() to work over non sysfs devices Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 06/27] mlx5: Setup mlx5 vfio context Yishai Hadas [this message]
2021-07-20  8:16 ` [PATCH rdma-core 07/27] mlx5: Add mlx5_vfio_cmd_exec() support Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 08/27] mlx5: vfio setup function support Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 09/27] mlx5: vfio setup basic caps Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 10/27] mlx5: Support fast teardown over vfio Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 11/27] mlx5: Enable interrupt command mode " Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 12/27] mlx5: Introduce vfio APIs to process events Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 13/27] mlx5: VFIO poll_health support Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 14/27] mlx5: Implement basic verbs operation for PD and MR over vfio Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 15/27] mlx5: Set DV context ops Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 16/27] mlx5: Support initial DEVX/DV APIs over vfio Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 17/27] mlx5: Implement mlx5dv devx_obj " Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 18/27] pyverbs: Support DevX UMEM registration Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 19/27] pyverbs/mlx5: Support EQN querying Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 20/27] pyverbs/mlx5: Support more DevX objects Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 21/27] pyverbs: Add auxiliary memory functions Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 22/27] pyverbs/mlx5: Add support to extract mlx5dv objects Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 23/27] pyverbs/mlx5: Wrap mlx5_cqe64 struct and add enums Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 24/27] tests: Add MAC address to the tests' args Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 25/27] tests: Add mlx5 DevX data path test Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 26/27] pyverbs/mlx5: Support mlx5 devices over VFIO Yishai Hadas
2021-07-20  8:16 ` [PATCH rdma-core 27/27] tests: Add a test for mlx5 " Yishai Hadas
2021-08-01  8:00 ` [PATCH rdma-core 00/27] Introduce mlx5 user space driver " Yishai Hadas
