[PATCH rdma-next 1/6] RDMA/mlx5: Use ib_umem_find_best_pgsz() for devx

From: Leon Romanovsky <leon@kernel.org>
To: Doug Ledford <dledford@redhat.com>, Jason Gunthorpe <jgg@nvidia.com>
Cc: linux-rdma@vger.kernel.org
Subject: [PATCH rdma-next 1/6] RDMA/mlx5: Use ib_umem_find_best_pgsz() for devx
Date: Mon, 26 Oct 2020 15:26:30 +0200	[thread overview]
Message-ID: <20201026132635.1337663-2-leon@kernel.org> (raw)
In-Reply-To: <20201026132635.1337663-1-leon@kernel.org>

From: Jason Gunthorpe <jgg@nvidia.com>

Since devx uses the new rdma_for_each_block() to fill the PAS it can also
use ib_umem_find_best_pgsz(). However devx does not compute a PAS list
with an IOVA, it is computing a PAS with a page_offset to the DMA address
that requires a slightly different calculation.

Introduce ib_umem_find_best_pgoff() to do this math and a wrapper to make
it easier to understand how the math relates directly to the mailbox
format.

This will allow umems to use large pages in a wider range of cases.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/devx.c    | 56 +++++++++++++---------------
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 26 ++++++++++++-
 include/rdma/ib_umem.h               | 42 +++++++++++++++++++++
 3 files changed, 91 insertions(+), 33 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 611ce21157de..274f04b39dce 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -93,8 +93,6 @@ struct devx_async_event_file {
 struct devx_umem {
 	struct mlx5_core_dev		*mdev;
 	struct ib_umem			*umem;
-	u32				page_offset;
-	int				page_shift;
 	u32				dinlen;
 	u32				dinbox[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)];
 };
@@ -2057,7 +2055,6 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 	size_t size;
 	u32 access;
 	int err;
-	u32 page_mask;
 
 	if (uverbs_copy_from(&addr, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR) ||
 	    uverbs_copy_from(&size, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_LEN))
@@ -2078,46 +2075,45 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 	obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access);
 	if (IS_ERR(obj->umem))
 		return PTR_ERR(obj->umem);
-
-	mlx5_ib_cont_pages(obj->umem, obj->umem->address,
-			   MLX5_MKEY_PAGE_SHIFT_MASK, &obj->page_shift);
-	page_mask = (1 << obj->page_shift) - 1;
-	obj->page_offset = obj->umem->address & page_mask;
-
 	return 0;
 }
 
-static int devx_umem_reg_cmd_alloc(struct uverbs_attr_bundle *attrs,
+static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev,
+				   struct uverbs_attr_bundle *attrs,
 				   struct devx_umem *obj,
 				   struct devx_umem_reg_cmd *cmd)
 {
-	cmd->inlen =
-		MLX5_ST_SZ_BYTES(create_umem_in) +
-		(MLX5_ST_SZ_BYTES(mtt) *
-		 ib_umem_num_dma_blocks(obj->umem, 1UL << obj->page_shift));
-	cmd->in = uverbs_zalloc(attrs, cmd->inlen);
-	return PTR_ERR_OR_ZERO(cmd->in);
-}
-
-static void devx_umem_reg_cmd_build(struct mlx5_ib_dev *dev,
-				    struct devx_umem *obj,
-				    struct devx_umem_reg_cmd *cmd)
-{
-	void *umem;
+	unsigned int page_size;
 	__be64 *mtt;
+	void *umem;
+
+	page_size =
+		mlx5_umem_find_best_pgoff(obj->umem, umem, log_page_size,
+					  MLX5_ADAPTER_PAGE_SHIFT, page_offset);
+	if (!page_size)
+		return -EINVAL;
+	cmd->inlen = MLX5_ST_SZ_BYTES(create_umem_in) +
+		     (MLX5_ST_SZ_BYTES(mtt) *
+		      ib_umem_num_dma_blocks(obj->umem, page_size));
+	cmd->in = uverbs_zalloc(attrs, cmd->inlen);
+	if (!cmd->in)
+		return PTR_ERR(cmd->in);
 
 	umem = MLX5_ADDR_OF(create_umem_in, cmd->in, umem);
 	mtt = (__be64 *)MLX5_ADDR_OF(umem, umem, mtt);
 
 	MLX5_SET(create_umem_in, cmd->in, opcode, MLX5_CMD_OP_CREATE_UMEM);
 	MLX5_SET64(umem, umem, num_of_mtt,
-		   ib_umem_num_dma_blocks(obj->umem, 1UL << obj->page_shift));
-	MLX5_SET(umem, umem, log_page_size, obj->page_shift -
-					    MLX5_ADAPTER_PAGE_SHIFT);
-	MLX5_SET(umem, umem, page_offset, obj->page_offset);
-	mlx5_ib_populate_pas(obj->umem, 1UL << obj->page_shift, mtt,
+		   ib_umem_num_dma_blocks(obj->umem, page_size));
+	MLX5_SET(umem, umem, log_page_size,
+		 order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET(umem, umem, page_offset,
+		 ib_umem_dma_offset(obj->umem, page_size));
+
+	mlx5_ib_populate_pas(obj->umem, page_size, mtt,
 			     (obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) |
 				     MLX5_IB_MTT_READ);
+	return 0;
 }
 
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
@@ -2144,12 +2140,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
 	if (err)
 		goto err_obj_free;
 
-	err = devx_umem_reg_cmd_alloc(attrs, obj, &cmd);
+	err = devx_umem_reg_cmd_alloc(dev, attrs, obj, &cmd);
 	if (err)
 		goto err_umem_release;
 
-	devx_umem_reg_cmd_build(dev, obj, &cmd);
-
 	MLX5_SET(create_umem_in, cmd.in, uid, c->devx_uid);
 	err = mlx5_cmd_exec(dev->mdev, cmd.in, cmd.inlen, cmd.out,
 			    sizeof(cmd.out));
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index bb44080170be..a31daf7253be 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -40,8 +40,6 @@
 #define MLX5_IB_DEFAULT_UIDX 0xffffff
 #define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index)
 
-#define MLX5_MKEY_PAGE_SHIFT_MASK __mlx5_mask(mkc, log_page_size)
-
 static __always_inline unsigned long
 __mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits,
 			       unsigned int pgsz_shift)
@@ -69,6 +67,30 @@ __mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits,
 				       pgsz_shift),                            \
 			       iova)
 
+static __always_inline unsigned long
+__mlx5_page_offset_to_bitmask(unsigned int page_offset_bits,
+			      unsigned int offset_shift)
+{
+	unsigned int largest_offset_shift =
+		min_t(unsigned long, page_offset_bits - 1 + offset_shift,
+		      BITS_PER_LONG - 1);
+
+	return GENMASK(largest_offset_shift, offset_shift);
+}
+
+/*
+ * This computes a the page_size and non-quantized page_offset to fit within a
+ * prm structure.
+ */
+#define mlx5_umem_find_best_pgoff(umem, typ, log_pgsz_fld, pgsz_shift,         \
+				  page_offset_fld)                             \
+	ib_umem_find_best_pgoff(                                               \
+		umem,                                                          \
+		__mlx5_log_page_size_to_bitmap(                                \
+			__mlx5_bit_sz(typ, log_pgsz_fld), pgsz_shift),         \
+		__mlx5_page_offset_to_bitmask(                                 \
+			__mlx5_bit_sz(typ, page_offset_fld), 0))
+
 enum {
 	MLX5_IB_MMAP_OFFSET_START = 9,
 	MLX5_IB_MMAP_OFFSET_END = 255,
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 70597508c765..7752211c9638 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -34,6 +34,13 @@ static inline int ib_umem_offset(struct ib_umem *umem)
 	return umem->address & ~PAGE_MASK;
 }
 
+static inline unsigned long ib_umem_dma_offset(struct ib_umem *umem,
+					       unsigned long pgsz)
+{
+	return (sg_dma_address(umem->sg_head.sgl) + ib_umem_offset(umem)) &
+	       (pgsz - 1);
+}
+
 static inline size_t ib_umem_num_dma_blocks(struct ib_umem *umem,
 					    unsigned long pgsz)
 {
@@ -79,6 +86,35 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
 				     unsigned long pgsz_bitmap,
 				     unsigned long virt);
+/**
+ * ib_umem_find_best_pgoff - Find best HW page size
+ *
+ * @umem: umem struct
+ * @pgsz_bitmap bitmap of HW supported page sizes
+ * @pgoff_bitmask: Mask of bits that can be represented with an offset
+ *
+ * This is very similar to ib_umem_find_best_pgsz() except instead of accepting
+ * an IOVA it accepts a bitmask specifying what address bits can be represented
+ * with a page offset.
+ *
+ * For instance if the HW has multiple page sizes, requires 64 byte alignemnt,
+ * and can support aligned offsets up to 4032 then pgoff_bitmask would be
+ * "111111000000".
+ *
+ * If the pgoff_bitmask requires either alignment in the low bit or an
+ * unavailable page size for the high bits, this function returns 0.
+ */
+static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
+						    unsigned long pgsz_bitmap,
+						    u64 pgoff_bitmask)
+{
+	struct scatterlist *sg = umem->sg_head.sgl;
+	dma_addr_t dma_addr;
+
+	dma_addr = sg_dma_address(sg) + (umem->address & ~PAGE_MASK);
+	return ib_umem_find_best_pgsz(umem, pgsz_bitmap,
+				      dma_addr & pgoff_bitmask);
+}
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -101,6 +137,12 @@ static inline unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
 {
 	return 0;
 }
+static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
+						    unsigned long pgsz_bitmap,
+						    u64 pgoff_bitmask)
+{
+	return 0;
+}
 
 #endif /* CONFIG_INFINIBAND_USER_MEM */
 
-- 
2.26.2