Linux-RDMA Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH rdma-next 0/6] Use ib_umem_find_best_pgsz() for all umems
@ 2020-10-26 13:26 Leon Romanovsky
  2020-10-26 13:26 ` [PATCH rdma-next 1/6] RDMA/mlx5: Use ib_umem_find_best_pgsz() for devx Leon Romanovsky
                   ` (5 more replies)
  0 siblings, 6 replies; 9+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:26 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, David S. Miller, Eli Cohen, Haggai Abramonvsky,
	Jack Morgenstein, linux-kernel, linux-rdma, majd, Matan Barak,
	Or Gerlitz, Roland Dreier, Sagi Grimberg, Yishai Hadas

From: Leon Romanovsky <leonro@nvidia.com>

From Jason:

Move the remaining cases working with umems to use versions of
ib_umem_find_best_pgsz() tailored to the calculations the devices
requires.

Unlike a MR there is no IOVA, instead a page offset from the starting page
is possible, with various restrictions.

Compute the best page size to meet the page_offset restrictions.

Thanks

Jason Gunthorpe (6):
  RDMA/mlx5: Use ib_umem_find_best_pgsz() for devx
  RDMA/mlx5: Use ib_umem_find_best_pgoff() for SRQ
  RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for WQ
  RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for QP
  RDMA/mlx5: mlx5_umem_find_best_quantized_pgoff() for CQ
  RDMA/mlx5: Lower setting the umem's PAS for SRQ

 drivers/infiniband/hw/mlx5/cq.c      |  48 ++++++++---
 drivers/infiniband/hw/mlx5/devx.c    |  56 ++++++------
 drivers/infiniband/hw/mlx5/mem.c     | 115 +++++++++----------------
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  56 ++++++++++--
 drivers/infiniband/hw/mlx5/qp.c      | 124 ++++++++++++---------------
 drivers/infiniband/hw/mlx5/srq.c     |  27 +-----
 drivers/infiniband/hw/mlx5/srq.h     |   1 +
 drivers/infiniband/hw/mlx5/srq_cmd.c |  80 ++++++++++++++++-
 include/rdma/ib_umem.h               |  42 +++++++++
 9 files changed, 326 insertions(+), 223 deletions(-)

--
2.26.2


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH rdma-next 1/6] RDMA/mlx5: Use ib_umem_find_best_pgsz() for devx
  2020-10-26 13:26 [PATCH rdma-next 0/6] Use ib_umem_find_best_pgsz() for all umems Leon Romanovsky
@ 2020-10-26 13:26 ` Leon Romanovsky
  2020-10-26 13:26 ` [PATCH rdma-next 2/6] RDMA/mlx5: Use ib_umem_find_best_pgoff() for SRQ Leon Romanovsky
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 9+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:26 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma

From: Jason Gunthorpe <jgg@nvidia.com>

Since devx uses the new rdma_for_each_block() to fill the PAS it can also
use ib_umem_find_best_pgsz(). However devx does not compute a PAS list
with an IOVA, it is computing a PAS with a page_offset to the DMA address
that requires a slightly different calculation.

Introduce ib_umem_find_best_pgoff() to do this math and a wrapper to make
it easier to understand how the math relates directly to the mailbox
format.

This will allow umems to use large pages in a wider range of cases.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/devx.c    | 56 +++++++++++++---------------
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 26 ++++++++++++-
 include/rdma/ib_umem.h               | 42 +++++++++++++++++++++
 3 files changed, 91 insertions(+), 33 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 611ce21157de..274f04b39dce 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -93,8 +93,6 @@ struct devx_async_event_file {
 struct devx_umem {
 	struct mlx5_core_dev		*mdev;
 	struct ib_umem			*umem;
-	u32				page_offset;
-	int				page_shift;
 	u32				dinlen;
 	u32				dinbox[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)];
 };
@@ -2057,7 +2055,6 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 	size_t size;
 	u32 access;
 	int err;
-	u32 page_mask;
 
 	if (uverbs_copy_from(&addr, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR) ||
 	    uverbs_copy_from(&size, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_LEN))
@@ -2078,46 +2075,45 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 	obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access);
 	if (IS_ERR(obj->umem))
 		return PTR_ERR(obj->umem);
-
-	mlx5_ib_cont_pages(obj->umem, obj->umem->address,
-			   MLX5_MKEY_PAGE_SHIFT_MASK, &obj->page_shift);
-	page_mask = (1 << obj->page_shift) - 1;
-	obj->page_offset = obj->umem->address & page_mask;
-
 	return 0;
 }
 
-static int devx_umem_reg_cmd_alloc(struct uverbs_attr_bundle *attrs,
+static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev,
+				   struct uverbs_attr_bundle *attrs,
 				   struct devx_umem *obj,
 				   struct devx_umem_reg_cmd *cmd)
 {
-	cmd->inlen =
-		MLX5_ST_SZ_BYTES(create_umem_in) +
-		(MLX5_ST_SZ_BYTES(mtt) *
-		 ib_umem_num_dma_blocks(obj->umem, 1UL << obj->page_shift));
-	cmd->in = uverbs_zalloc(attrs, cmd->inlen);
-	return PTR_ERR_OR_ZERO(cmd->in);
-}
-
-static void devx_umem_reg_cmd_build(struct mlx5_ib_dev *dev,
-				    struct devx_umem *obj,
-				    struct devx_umem_reg_cmd *cmd)
-{
-	void *umem;
+	unsigned int page_size;
 	__be64 *mtt;
+	void *umem;
+
+	page_size =
+		mlx5_umem_find_best_pgoff(obj->umem, umem, log_page_size,
+					  MLX5_ADAPTER_PAGE_SHIFT, page_offset);
+	if (!page_size)
+		return -EINVAL;
+	cmd->inlen = MLX5_ST_SZ_BYTES(create_umem_in) +
+		     (MLX5_ST_SZ_BYTES(mtt) *
+		      ib_umem_num_dma_blocks(obj->umem, page_size));
+	cmd->in = uverbs_zalloc(attrs, cmd->inlen);
+	if (!cmd->in)
+		return PTR_ERR(cmd->in);
 
 	umem = MLX5_ADDR_OF(create_umem_in, cmd->in, umem);
 	mtt = (__be64 *)MLX5_ADDR_OF(umem, umem, mtt);
 
 	MLX5_SET(create_umem_in, cmd->in, opcode, MLX5_CMD_OP_CREATE_UMEM);
 	MLX5_SET64(umem, umem, num_of_mtt,
-		   ib_umem_num_dma_blocks(obj->umem, 1UL << obj->page_shift));
-	MLX5_SET(umem, umem, log_page_size, obj->page_shift -
-					    MLX5_ADAPTER_PAGE_SHIFT);
-	MLX5_SET(umem, umem, page_offset, obj->page_offset);
-	mlx5_ib_populate_pas(obj->umem, 1UL << obj->page_shift, mtt,
+		   ib_umem_num_dma_blocks(obj->umem, page_size));
+	MLX5_SET(umem, umem, log_page_size,
+		 order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET(umem, umem, page_offset,
+		 ib_umem_dma_offset(obj->umem, page_size));
+
+	mlx5_ib_populate_pas(obj->umem, page_size, mtt,
 			     (obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) |
 				     MLX5_IB_MTT_READ);
+	return 0;
 }
 
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
@@ -2144,12 +2140,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
 	if (err)
 		goto err_obj_free;
 
-	err = devx_umem_reg_cmd_alloc(attrs, obj, &cmd);
+	err = devx_umem_reg_cmd_alloc(dev, attrs, obj, &cmd);
 	if (err)
 		goto err_umem_release;
 
-	devx_umem_reg_cmd_build(dev, obj, &cmd);
-
 	MLX5_SET(create_umem_in, cmd.in, uid, c->devx_uid);
 	err = mlx5_cmd_exec(dev->mdev, cmd.in, cmd.inlen, cmd.out,
 			    sizeof(cmd.out));
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index bb44080170be..a31daf7253be 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -40,8 +40,6 @@
 #define MLX5_IB_DEFAULT_UIDX 0xffffff
 #define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index)
 
-#define MLX5_MKEY_PAGE_SHIFT_MASK __mlx5_mask(mkc, log_page_size)
-
 static __always_inline unsigned long
 __mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits,
 			       unsigned int pgsz_shift)
@@ -69,6 +67,30 @@ __mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits,
 				       pgsz_shift),                            \
 			       iova)
 
+static __always_inline unsigned long
+__mlx5_page_offset_to_bitmask(unsigned int page_offset_bits,
+			      unsigned int offset_shift)
+{
+	unsigned int largest_offset_shift =
+		min_t(unsigned long, page_offset_bits - 1 + offset_shift,
+		      BITS_PER_LONG - 1);
+
+	return GENMASK(largest_offset_shift, offset_shift);
+}
+
+/*
+ * This computes a the page_size and non-quantized page_offset to fit within a
+ * prm structure.
+ */
+#define mlx5_umem_find_best_pgoff(umem, typ, log_pgsz_fld, pgsz_shift,         \
+				  page_offset_fld)                             \
+	ib_umem_find_best_pgoff(                                               \
+		umem,                                                          \
+		__mlx5_log_page_size_to_bitmap(                                \
+			__mlx5_bit_sz(typ, log_pgsz_fld), pgsz_shift),         \
+		__mlx5_page_offset_to_bitmask(                                 \
+			__mlx5_bit_sz(typ, page_offset_fld), 0))
+
 enum {
 	MLX5_IB_MMAP_OFFSET_START = 9,
 	MLX5_IB_MMAP_OFFSET_END = 255,
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 70597508c765..7752211c9638 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -34,6 +34,13 @@ static inline int ib_umem_offset(struct ib_umem *umem)
 	return umem->address & ~PAGE_MASK;
 }
 
+static inline unsigned long ib_umem_dma_offset(struct ib_umem *umem,
+					       unsigned long pgsz)
+{
+	return (sg_dma_address(umem->sg_head.sgl) + ib_umem_offset(umem)) &
+	       (pgsz - 1);
+}
+
 static inline size_t ib_umem_num_dma_blocks(struct ib_umem *umem,
 					    unsigned long pgsz)
 {
@@ -79,6 +86,35 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
 				     unsigned long pgsz_bitmap,
 				     unsigned long virt);
+/**
+ * ib_umem_find_best_pgoff - Find best HW page size
+ *
+ * @umem: umem struct
+ * @pgsz_bitmap bitmap of HW supported page sizes
+ * @pgoff_bitmask: Mask of bits that can be represented with an offset
+ *
+ * This is very similar to ib_umem_find_best_pgsz() except instead of accepting
+ * an IOVA it accepts a bitmask specifying what address bits can be represented
+ * with a page offset.
+ *
+ * For instance if the HW has multiple page sizes, requires 64 byte alignemnt,
+ * and can support aligned offsets up to 4032 then pgoff_bitmask would be
+ * "111111000000".
+ *
+ * If the pgoff_bitmask requires either alignment in the low bit or an
+ * unavailable page size for the high bits, this function returns 0.
+ */
+static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
+						    unsigned long pgsz_bitmap,
+						    u64 pgoff_bitmask)
+{
+	struct scatterlist *sg = umem->sg_head.sgl;
+	dma_addr_t dma_addr;
+
+	dma_addr = sg_dma_address(sg) + (umem->address & ~PAGE_MASK);
+	return ib_umem_find_best_pgsz(umem, pgsz_bitmap,
+				      dma_addr & pgoff_bitmask);
+}
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -101,6 +137,12 @@ static inline unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
 {
 	return 0;
 }
+static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
+						    unsigned long pgsz_bitmap,
+						    u64 pgoff_bitmask)
+{
+	return 0;
+}
 
 #endif /* CONFIG_INFINIBAND_USER_MEM */
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH rdma-next 2/6] RDMA/mlx5: Use ib_umem_find_best_pgoff() for SRQ
  2020-10-26 13:26 [PATCH rdma-next 0/6] Use ib_umem_find_best_pgsz() for all umems Leon Romanovsky
  2020-10-26 13:26 ` [PATCH rdma-next 1/6] RDMA/mlx5: Use ib_umem_find_best_pgsz() for devx Leon Romanovsky
@ 2020-10-26 13:26 ` Leon Romanovsky
  2020-10-26 13:26 ` [PATCH rdma-next 3/6] RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for WQ Leon Romanovsky
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 9+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:26 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma

From: Jason Gunthorpe <jgg@nvidia.com>

SRQ uses a quantized and scaled page_offset, which is another variation of
ib_umem_find_best_pgsz(). Add mlx5_umem_find_best_quantized_pgoff() to
perform this calculation for each mailbox. A macro shows how the
calculation is directly connected to the mailbox format.

This new routine replaces the limited mlx5_ib_cont_pages() and
mlx5_ib_get_buf_offset() pairing which would reject valid configurations
rather than adjust the page_size to make it work.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mem.c     | 45 ++++++++++++++++++++++++++++
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 15 ++++++++++
 drivers/infiniband/hw/mlx5/srq.c     | 20 ++++++-------
 3 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 92e7621ec858..9f8f9e55bceb 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -107,6 +107,51 @@ void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
 	}
 }
 
+/*
+ * Compute the page shift and page_offset for mailboxes that use a quantized
+ * page_offset. The granulatity of the page offset scales according to page
+ * size.
+ */
+unsigned long __mlx5_umem_find_best_quantized_pgoff(
+	struct ib_umem *umem, unsigned long pgsz_bitmap,
+	unsigned int page_offset_bits, u64 pgoff_bitmask, unsigned int scale,
+	unsigned int *page_offset_quantized)
+{
+	const u64 page_offset_mask = (1 << page_offset_bits) - 1;
+	unsigned long page_size;
+	u64 page_offset;
+
+	page_size = ib_umem_find_best_pgoff(umem, pgsz_bitmap, pgoff_bitmask);
+	if (!page_size)
+		return 0;
+
+	/*
+	 * page size is the largest possibile page size.
+	 *
+	 * Reduce the page_size, and thus the page_offset and quanta, until the
+	 * page_offset fits into the mailbox field. Once page_size < scale this
+	 * loop is guaranteed to terminate.
+	 */
+	page_offset = ib_umem_dma_offset(umem, page_size);
+	while (page_offset & ~(u64)(page_offset_mask * (page_size / scale))) {
+		page_size /= 2;
+		page_offset = ib_umem_dma_offset(umem, page_size);
+	}
+
+	/*
+	 * The address is not aligned, or otherwise cannot be represented by the
+	 * page_offset.
+	 */
+	if (!(pgsz_bitmap & page_size))
+		return 0;
+
+	*page_offset_quantized =
+		(unsigned long)page_offset / (page_size / scale);
+	if (WARN_ON(*page_offset_quantized > page_offset_mask))
+		return 0;
+	return page_size;
+}
+
 int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
 {
 	u64 page_size;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index a31daf7253be..769a0b0738cd 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -91,6 +91,21 @@ __mlx5_page_offset_to_bitmask(unsigned int page_offset_bits,
 		__mlx5_page_offset_to_bitmask(                                 \
 			__mlx5_bit_sz(typ, page_offset_fld), 0))
 
+unsigned long __mlx5_umem_find_best_quantized_pgoff(
+	struct ib_umem *umem, unsigned long pgsz_bitmap,
+	unsigned int page_offset_bits, u64 pgoff_bitmask, unsigned int scale,
+	unsigned int *page_offset_quantized);
+#define mlx5_umem_find_best_quantized_pgoff(umem, typ, log_pgsz_fld,           \
+					    pgsz_shift, page_offset_fld,       \
+					    scale, page_offset_quantized)      \
+	__mlx5_umem_find_best_quantized_pgoff(                                 \
+		umem,                                                          \
+		__mlx5_log_page_size_to_bitmap(                                \
+			__mlx5_bit_sz(typ, log_pgsz_fld), pgsz_shift),         \
+		__mlx5_bit_sz(typ, page_offset_fld),                           \
+		GENMASK(31, order_base_2(scale)), scale,                       \
+		page_offset_quantized)
+
 enum {
 	MLX5_IB_MMAP_OFFSET_START = 9,
 	MLX5_IB_MMAP_OFFSET_END = 255,
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index dd6e42d3d175..898a6cac023a 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -51,8 +51,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 		udata, struct mlx5_ib_ucontext, ibucontext);
 	size_t ucmdlen;
 	int err;
-	int page_shift;
-	u32 offset;
+	unsigned int page_offset_quantized;
+	unsigned int page_size;
 	u32 uidx = MLX5_IB_DEFAULT_UIDX;
 
 	ucmdlen = min(udata->inlen, sizeof(ucmd));
@@ -85,22 +85,22 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 		return err;
 	}
 
-	mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, 0, &page_shift);
-	err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift,
-				     &offset);
-	if (err) {
+	page_size = mlx5_umem_find_best_quantized_pgoff(
+		srq->umem, srqc, log_page_size, MLX5_ADAPTER_PAGE_SHIFT,
+		page_offset, 64, &page_offset_quantized);
+	if (!page_size) {
 		mlx5_ib_warn(dev, "bad offset\n");
 		goto err_umem;
 	}
 
-	in->pas = kvcalloc(ib_umem_num_dma_blocks(srq->umem, 1UL << page_shift),
+	in->pas = kvcalloc(ib_umem_num_dma_blocks(srq->umem, page_size),
 			   sizeof(*in->pas), GFP_KERNEL);
 	if (!in->pas) {
 		err = -ENOMEM;
 		goto err_umem;
 	}
 
-	mlx5_ib_populate_pas(srq->umem, 1UL << page_shift, in->pas, 0);
+	mlx5_ib_populate_pas(srq->umem, page_size, in->pas, 0);
 
 	err = mlx5_ib_db_map_user(ucontext, udata, ucmd.db_addr, &srq->db);
 	if (err) {
@@ -108,8 +108,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 		goto err_in;
 	}
 
-	in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
-	in->page_offset = offset;
+	in->log_page_size = order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT;
+	in->page_offset = page_offset_quantized;
 	in->uid = (in->type != IB_SRQT_XRC) ?  to_mpd(pd)->uid : 0;
 	if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
 	    in->type != IB_SRQT_BASIC)
-- 
2.26.2


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH rdma-next 3/6] RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for WQ
  2020-10-26 13:26 [PATCH rdma-next 0/6] Use ib_umem_find_best_pgsz() for all umems Leon Romanovsky
  2020-10-26 13:26 ` [PATCH rdma-next 1/6] RDMA/mlx5: Use ib_umem_find_best_pgsz() for devx Leon Romanovsky
  2020-10-26 13:26 ` [PATCH rdma-next 2/6] RDMA/mlx5: Use ib_umem_find_best_pgoff() for SRQ Leon Romanovsky
@ 2020-10-26 13:26 ` Leon Romanovsky
  2020-10-26 14:42   ` Gal Pressman
  2020-10-26 13:26 ` [PATCH rdma-next 4/6] RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for QP Leon Romanovsky
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 9+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:26 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: linux-rdma, majd, Matan Barak, Sagi Grimberg, Yishai Hadas

From: Jason Gunthorpe <jgg@nvidia.com>

This fixes a subtle bug, the WQ mailbox has only 5 bits to describe the
page_offset, while mlx5_ib_get_buf_offset() is hard wired to only work
with 6 bit page_offsets.

Thus it did not properly reject badly aligned buffers.

YISHAI: WTF? Why does this PRM command only have 5 bits? We must force 4k
alignment for WQ umems in the userspace?

Fixes: 79b20a6c3014 ("IB/mlx5: Add receive Work Queue verbs")
Fixes: 0fb2ed66a14c ("IB/mlx5: Add create and destroy functionality for Raw Packet QP")
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/qp.c | 53 +++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 042177f33252..143fd5eae304 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -833,7 +833,7 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 {
 	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
 		udata, struct mlx5_ib_ucontext, ibucontext);
-	int page_shift = 0;
+	unsigned long page_size = 0;
 	u32 offset = 0;
 	int err;
 
@@ -847,24 +847,25 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		return err;
 	}
 
-	mlx5_ib_cont_pages(rwq->umem, ucmd->buf_addr, 0, &page_shift);
-	err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift,
-				     &rwq->rq_page_offset);
-	if (err) {
+	page_size = mlx5_umem_find_best_quantized_pgoff(
+		rwq->umem, wq, log_wq_pg_sz, MLX5_ADAPTER_PAGE_SHIFT,
+		page_offset, 64, &rwq->rq_page_offset);
+	if (!page_size) {
 		mlx5_ib_warn(dev, "bad offset\n");
+		err = -EINVAL;
 		goto err_umem;
 	}
 
-	rwq->rq_num_pas = ib_umem_num_dma_blocks(rwq->umem, 1UL << page_shift);
-	rwq->page_shift = page_shift;
-	rwq->log_page_size =  page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+	rwq->rq_num_pas = ib_umem_num_dma_blocks(rwq->umem, page_size);
+	rwq->page_shift = order_base_2(page_size);
+	rwq->log_page_size =  rwq->page_shift - MLX5_ADAPTER_PAGE_SHIFT;
 	rwq->wq_sig = !!(ucmd->flags & MLX5_WQ_FLAG_SIGNATURE);
 
 	mlx5_ib_dbg(
 		dev,
-		"addr 0x%llx, size %zd, npages %zu, page_shift %d, ncont %d, offset %d\n",
+		"addr 0x%llx, size %zd, npages %zu, page_size %ld, ncont %d, offset %d\n",
 		(unsigned long long)ucmd->buf_addr, rwq->buf_size,
-		ib_umem_num_pages(rwq->umem), page_shift, rwq->rq_num_pas,
+		ib_umem_num_pages(rwq->umem), page_size, rwq->rq_num_pas,
 		offset);
 
 	err = mlx5_ib_db_map_user(ucontext, udata, ucmd->db_addr, &rwq->db);
@@ -1209,17 +1210,24 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
 	void *wq;
 	int inlen;
 	int err;
-	int page_shift = 0;
-	u32 offset = 0;
-
-	err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr, ubuffer->buf_size,
-			       &sq->ubuffer.umem, &page_shift, &offset);
-	if (err)
-		return err;
+	unsigned int page_offset_quantized;
+	unsigned long page_size;
+
+	sq->ubuffer.umem = ib_umem_get(&dev->ib_dev, ubuffer->buf_addr,
+				       ubuffer->buf_size, 0);
+	if (IS_ERR(sq->ubuffer.umem))
+		return PTR_ERR(sq->ubuffer.umem);
+	page_size = mlx5_umem_find_best_quantized_pgoff(
+		ubuffer->umem, wq, log_wq_pg_sz, MLX5_ADAPTER_PAGE_SHIFT,
+		page_offset, 64, &page_offset_quantized);
+	if (!page_size) {
+		err = -EINVAL;
+		goto err_umem;
+	}
 
 	inlen = MLX5_ST_SZ_BYTES(create_sq_in) +
-		sizeof(u64) * ib_umem_num_dma_blocks(sq->ubuffer.umem,
-						     1UL << page_shift);
+		sizeof(u64) *
+			ib_umem_num_dma_blocks(sq->ubuffer.umem, page_size);
 	in = kvzalloc(inlen, GFP_KERNEL);
 	if (!in) {
 		err = -ENOMEM;
@@ -1247,11 +1255,12 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
 	MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr));
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
 	MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size));
-	MLX5_SET(wq, wq, log_wq_pg_sz,  page_shift - MLX5_ADAPTER_PAGE_SHIFT);
-	MLX5_SET(wq, wq, page_offset, offset);
+	MLX5_SET(wq, wq, log_wq_pg_sz,
+		 order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET(wq, wq, page_offset, page_offset_quantized);
 
 	pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
-	mlx5_ib_populate_pas(sq->ubuffer.umem, 1UL << page_shift, pas, 0);
+	mlx5_ib_populate_pas(sq->ubuffer.umem, page_size, pas, 0);
 
 	err = mlx5_core_create_sq_tracked(dev, in, inlen, &sq->base.mqp);
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH rdma-next 4/6] RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for QP
  2020-10-26 13:26 [PATCH rdma-next 0/6] Use ib_umem_find_best_pgsz() for all umems Leon Romanovsky
                   ` (2 preceding siblings ...)
  2020-10-26 13:26 ` [PATCH rdma-next 3/6] RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for WQ Leon Romanovsky
@ 2020-10-26 13:26 ` Leon Romanovsky
  2020-10-26 13:26 ` [PATCH rdma-next 5/6] RDMA/mlx5: mlx5_umem_find_best_quantized_pgoff() for CQ Leon Romanovsky
  2020-10-26 13:26 ` [PATCH rdma-next 6/6] RDMA/mlx5: Lower setting the umem's PAS for SRQ Leon Romanovsky
  5 siblings, 0 replies; 9+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:26 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma

From: Jason Gunthorpe <jgg@nvidia.com>

Last user of mlx5_ib_get_buf_offset() so it can also be removed.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mem.c     | 21 --------
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  1 -
 drivers/infiniband/hw/mlx5/qp.c      | 71 +++++++++-------------------
 3 files changed, 23 insertions(+), 70 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 9f8f9e55bceb..e38f9f2bf181 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -152,27 +152,6 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff(
 	return page_size;
 }
 
-int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
-{
-	u64 page_size;
-	u64 page_mask;
-	u64 off_size;
-	u64 off_mask;
-	u64 buf_off;
-
-	page_size = (u64)1 << page_shift;
-	page_mask = page_size - 1;
-	buf_off = addr & page_mask;
-	off_size = page_size >> 6;
-	off_mask = off_size - 1;
-
-	if (buf_off & off_mask)
-		return -EINVAL;
-
-	*offset = buf_off >> ilog2(off_size);
-	return 0;
-}
-
 #define WR_ID_BF 0xBF
 #define WR_ID_END 0xBAD
 #define TEST_WC_NUM_WQES 255
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 769a0b0738cd..9777103af575 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1273,7 +1273,6 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 			size_t *out_mad_size, u16 *out_mad_pkey_index);
 int mlx5_ib_alloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata);
 int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata);
-int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset);
 int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port);
 int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev,
 					  struct ib_smp *out_mad);
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 143fd5eae304..bedba14c1bfb 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -778,39 +778,6 @@ int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
 	return bfregi->sys_pages[index_of_sys_page] + offset;
 }
 
-static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
-			    unsigned long addr, size_t size,
-			    struct ib_umem **umem, int *page_shift,
-			    u32 *offset)
-{
-	int err;
-
-	*umem = ib_umem_get(&dev->ib_dev, addr, size, 0);
-	if (IS_ERR(*umem)) {
-		mlx5_ib_dbg(dev, "umem_get failed\n");
-		return PTR_ERR(*umem);
-	}
-
-	mlx5_ib_cont_pages(*umem, addr, 0, page_shift);
-
-	err = mlx5_ib_get_buf_offset(addr, *page_shift, offset);
-	if (err) {
-		mlx5_ib_warn(dev, "bad offset\n");
-		goto err_umem;
-	}
-
-	mlx5_ib_dbg(dev, "addr 0x%lx, size %zu, npages %zu, page_shift %d, offset %d\n",
-		    addr, size, ib_umem_num_pages(*umem), *page_shift, *offset);
-
-	return 0;
-
-err_umem:
-	ib_umem_release(*umem);
-	*umem = NULL;
-
-	return err;
-}
-
 static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 			    struct mlx5_ib_rwq *rwq, struct ib_udata *udata)
 {
@@ -897,9 +864,9 @@ static int _create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 {
 	struct mlx5_ib_ucontext *context;
 	struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer;
-	int page_shift = 0;
+	unsigned int page_offset_quantized = 0;
+	unsigned long page_size = 0;
 	int uar_index = 0;
-	u32 offset = 0;
 	int bfregn;
 	int ncont = 0;
 	__be64 *pas;
@@ -950,12 +917,21 @@ static int _create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 
 	if (ucmd->buf_addr && ubuffer->buf_size) {
 		ubuffer->buf_addr = ucmd->buf_addr;
-		err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr,
-				       ubuffer->buf_size, &ubuffer->umem,
-				       &page_shift, &offset);
-		if (err)
+		ubuffer->umem = ib_umem_get(&dev->ib_dev, ubuffer->buf_addr,
+					    ubuffer->buf_size, 0);
+		if (IS_ERR(ubuffer->umem)) {
+			err = PTR_ERR(ubuffer->umem);
 			goto err_bfreg;
-		ncont = ib_umem_num_dma_blocks(ubuffer->umem, 1UL << page_shift);
+		}
+		page_size = mlx5_umem_find_best_quantized_pgoff(
+			ubuffer->umem, qpc, log_page_size,
+			MLX5_ADAPTER_PAGE_SHIFT, page_offset, 64,
+			&page_offset_quantized);
+		if (!page_size) {
+			err = -EINVAL;
+			goto err_umem;
+		}
+		ncont = ib_umem_num_dma_blocks(ubuffer->umem, page_size);
 	} else {
 		ubuffer->umem = NULL;
 	}
@@ -970,15 +946,14 @@ static int _create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 
 	uid = (attr->qp_type != IB_QPT_XRC_INI) ? to_mpd(pd)->uid : 0;
 	MLX5_SET(create_qp_in, *in, uid, uid);
-	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas);
-	if (ubuffer->umem)
-		mlx5_ib_populate_pas(ubuffer->umem, 1UL << page_shift, pas, 0);
-
 	qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc);
-
-	MLX5_SET(qpc, qpc, log_page_size, page_shift - MLX5_ADAPTER_PAGE_SHIFT);
-	MLX5_SET(qpc, qpc, page_offset, offset);
-
+	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas);
+	if (ubuffer->umem) {
+		mlx5_ib_populate_pas(ubuffer->umem, page_size, pas, 0);
+		MLX5_SET(qpc, qpc, log_page_size,
+			 order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT);
+		MLX5_SET(qpc, qpc, page_offset, page_offset_quantized);
+	}
 	MLX5_SET(qpc, qpc, uar_page, uar_index);
 	if (bfregn != MLX5_IB_INVALID_BFREG)
 		resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn);
-- 
2.26.2


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH rdma-next 5/6] RDMA/mlx5: mlx5_umem_find_best_quantized_pgoff() for CQ
  2020-10-26 13:26 [PATCH rdma-next 0/6] Use ib_umem_find_best_pgsz() for all umems Leon Romanovsky
                   ` (3 preceding siblings ...)
  2020-10-26 13:26 ` [PATCH rdma-next 4/6] RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for QP Leon Romanovsky
@ 2020-10-26 13:26 ` Leon Romanovsky
  2020-10-26 13:26 ` [PATCH rdma-next 6/6] RDMA/mlx5: Lower setting the umem's PAS for SRQ Leon Romanovsky
  5 siblings, 0 replies; 9+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:26 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Eli Cohen, Jack Morgenstein, linux-rdma, Or Gerlitz, Roland Dreier

From: Jason Gunthorpe <jgg@nvidia.com>

This fixes a bug where the page_offset was not being considered when
building a CQ. The PRM says it 'must be zero', so use a variant of
mlx5_umem_find_best_quantized_pgoff() with a 0 pgoff_bitmask to force this
result.

YISHAI: Why does the PRM say 'must be zero' and then give math to compute
it in red highlight?

Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/cq.c      | 48 +++++++++++++++++-------
 drivers/infiniband/hw/mlx5/mem.c     | 55 ----------------------------
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 14 +++++--
 3 files changed, 45 insertions(+), 72 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 9ab93d730769..eb92cefffd77 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -707,8 +707,9 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 			  int *cqe_size, int *index, int *inlen)
 {
 	struct mlx5_ib_create_cq ucmd = {};
+	unsigned long page_size;
+	unsigned int page_offset_quantized;
 	size_t ucmdlen;
-	int page_shift;
 	__be64 *pas;
 	int ncont;
 	void *cqc;
@@ -741,17 +742,24 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 		return err;
 	}
 
+	page_size = mlx5_umem_find_best_cq_quantized_pgoff(
+		cq->buf.umem, cqc, log_page_size, MLX5_ADAPTER_PAGE_SHIFT,
+		page_offset, 64, &page_offset_quantized);
+	if (!page_size) {
+		err = -EINVAL;
+		goto err_umem;
+	}
+
 	err = mlx5_ib_db_map_user(context, udata, ucmd.db_addr, &cq->db);
 	if (err)
 		goto err_umem;
 
-	mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, 0, &page_shift);
-	ncont = ib_umem_num_dma_blocks(cq->buf.umem, 1UL << page_shift);
+	ncont = ib_umem_num_dma_blocks(cq->buf.umem, page_size);
 	mlx5_ib_dbg(
 		dev,
-		"addr 0x%llx, size %u, npages %zu, page_shift %d, ncont %d\n",
+		"addr 0x%llx, size %u, npages %zu, page_size %lu, ncont %d\n",
 		ucmd.buf_addr, entries * ucmd.cqe_size,
-		ib_umem_num_pages(cq->buf.umem), page_shift, ncont);
+		ib_umem_num_pages(cq->buf.umem), page_size, ncont);
 
 	*inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
 		 MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * ncont;
@@ -762,11 +770,12 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 	}
 
 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas);
-	mlx5_ib_populate_pas(cq->buf.umem, 1UL << page_shift, pas, 0);
+	mlx5_ib_populate_pas(cq->buf.umem, page_size, pas, 0);
 
 	cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context);
 	MLX5_SET(cqc, cqc, log_page_size,
-		 page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+		 order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET(cqc, cqc, page_offset, page_offset_quantized);
 
 	if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) {
 		*index = ucmd.uar_page_index;
@@ -1131,7 +1140,7 @@ int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
 
 static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
 		       int entries, struct ib_udata *udata,
-		       int *page_shift, int *cqe_size)
+		       int *cqe_size)
 {
 	struct mlx5_ib_resize_cq ucmd;
 	struct ib_umem *umem;
@@ -1156,8 +1165,6 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
 		return err;
 	}
 
-	mlx5_ib_cont_pages(umem, ucmd.buf_addr, 0, page_shift);
-
 	cq->resize_umem = umem;
 	*cqe_size = ucmd.cqe_size;
 
@@ -1250,7 +1257,8 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
 	int err;
 	int npas;
 	__be64 *pas;
-	int page_shift;
+	unsigned int page_offset_quantized = 0;
+	unsigned int page_shift;
 	int inlen;
 	int cqe_size;
 	unsigned long flags;
@@ -1277,11 +1285,22 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
 
 	mutex_lock(&cq->resize_mutex);
 	if (udata) {
-		err = resize_user(dev, cq, entries, udata, &page_shift,
-				  &cqe_size);
+		unsigned long page_size;
+
+		err = resize_user(dev, cq, entries, udata, &cqe_size);
 		if (err)
 			goto ex;
-		npas = ib_umem_num_dma_blocks(cq->resize_umem, 1UL << page_shift);
+
+		page_size = mlx5_umem_find_best_cq_quantized_pgoff(
+			cq->resize_umem, cqc, log_page_size,
+			MLX5_ADAPTER_PAGE_SHIFT, page_offset, 64,
+			&page_offset_quantized);
+		if (!page_size) {
+			err = -EINVAL;
+			goto ex_resize;
+		}
+		npas = ib_umem_num_dma_blocks(cq->resize_umem, page_size);
+		page_shift = order_base_2(page_size);
 	} else {
 		struct mlx5_frag_buf *frag_buf;
 
@@ -1320,6 +1339,7 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
 
 	MLX5_SET(cqc, cqc, log_page_size,
 		 page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET(cqc, cqc, page_offset, page_offset_quantized);
 	MLX5_SET(cqc, cqc, cqe_sz,
 		 cqe_sz_to_mlx_sz(cqe_size,
 				  cq->private_flags &
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index e38f9f2bf181..9031c2c153e1 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -36,61 +36,6 @@
 #include "mlx5_ib.h"
 #include <linux/jiffies.h>
 
-/* @umem: umem object to scan
- * @addr: ib virtual address requested by the user
- * @max_page_shift: high limit for page_shift - 0 means no limit
- * @shift: page shift for the compound pages found in the region
- */
-void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
-			unsigned long max_page_shift, int *shift)
-{
-	unsigned long tmp;
-	unsigned long m;
-	u64 base = ~0, p = 0;
-	u64 len, pfn;
-	int i = 0;
-	struct scatterlist *sg;
-	int entry;
-
-	if (umem->is_odp) {
-		struct ib_umem_odp *odp = to_ib_umem_odp(umem);
-
-		*shift = odp->page_shift;
-		return;
-	}
-
-	addr = addr >> PAGE_SHIFT;
-	tmp = (unsigned long)addr;
-	m = find_first_bit(&tmp, BITS_PER_LONG);
-	if (max_page_shift)
-		m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m);
-
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> PAGE_SHIFT;
-		pfn = sg_dma_address(sg) >> PAGE_SHIFT;
-		if (base + p != pfn) {
-			/* If either the offset or the new
-			 * base are unaligned update m
-			 */
-			tmp = (unsigned long)(pfn | p);
-			if (!IS_ALIGNED(tmp, 1 << m))
-				m = find_first_bit(&tmp, BITS_PER_LONG);
-
-			base = pfn;
-			p = 0;
-		}
-
-		p += len;
-		i += len;
-	}
-
-	if (i)
-		m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m);
-	else
-		m  = 0;
-	*shift = PAGE_SHIFT + m;
-}
-
 /*
  * Fill in a physical address list. ib_umem_num_dma_blocks() entries will be
  * filled in the pas array.
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 9777103af575..254668d31988 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -106,6 +106,17 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff(
 		GENMASK(31, order_base_2(scale)), scale,                       \
 		page_offset_quantized)
 
+/* PRM says page_offset must be 0 for CQ */
+#define mlx5_umem_find_best_cq_quantized_pgoff(umem, typ, log_pgsz_fld,        \
+					       pgsz_shift, page_offset_fld,    \
+					       scale, page_offset_quantized)   \
+	__mlx5_umem_find_best_quantized_pgoff(                                 \
+		umem,                                                          \
+		__mlx5_log_page_size_to_bitmap(                                \
+			__mlx5_bit_sz(typ, log_pgsz_fld), pgsz_shift),         \
+		__mlx5_bit_sz(typ, page_offset_fld), 0, scale,                 \
+		page_offset_quantized)
+
 enum {
 	MLX5_IB_MMAP_OFFSET_START = 9,
 	MLX5_IB_MMAP_OFFSET_END = 255,
@@ -1292,9 +1303,6 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port,
 			    struct ib_port_attr *props);
 int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
 		       struct ib_port_attr *props);
-void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
-			unsigned long max_page_shift,
-			int *shift);
 void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
 			  u64 access_flags);
 void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
-- 
2.26.2


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH rdma-next 6/6] RDMA/mlx5: Lower setting the umem's PAS for SRQ
  2020-10-26 13:26 [PATCH rdma-next 0/6] Use ib_umem_find_best_pgsz() for all umems Leon Romanovsky
                   ` (4 preceding siblings ...)
  2020-10-26 13:26 ` [PATCH rdma-next 5/6] RDMA/mlx5: mlx5_umem_find_best_quantized_pgoff() for CQ Leon Romanovsky
@ 2020-10-26 13:26 ` Leon Romanovsky
  5 siblings, 0 replies; 9+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:26 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: David S. Miller, Haggai Abramonvsky, linux-rdma, Or Gerlitz

From: Jason Gunthorpe <jgg@nvidia.com>

Some of the SRQ types are created using a WQ, and the WQ requires a
different parameter set to mlx5_umem_find_best_quantized_pgoff() as it has
a 5 bit page_offset.

Add the umem to the mlx5_srq_attr and defer computing the PAS data until
the code has figured out what kind of mailbox to use. Compute the PAS
directly from the umem for each of the four unique mailbox types.

This also avoids allocating memory to store the user PAS, instead it is
written directly to the mailbox as in most other cases.

Fixes: 01949d0109ee ("net/mlx5_core: Enable XRCs and SRQs when using ISSI > 0")
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/srq.c     | 27 +---------
 drivers/infiniband/hw/mlx5/srq.h     |  1 +
 drivers/infiniband/hw/mlx5/srq_cmd.c | 80 ++++++++++++++++++++++++++--
 3 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 898a6cac023a..89494114ee56 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -51,8 +51,6 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 		udata, struct mlx5_ib_ucontext, ibucontext);
 	size_t ucmdlen;
 	int err;
-	unsigned int page_offset_quantized;
-	unsigned int page_size;
 	u32 uidx = MLX5_IB_DEFAULT_UIDX;
 
 	ucmdlen = min(udata->inlen, sizeof(ucmd));
@@ -84,32 +82,14 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 		err = PTR_ERR(srq->umem);
 		return err;
 	}
-
-	page_size = mlx5_umem_find_best_quantized_pgoff(
-		srq->umem, srqc, log_page_size, MLX5_ADAPTER_PAGE_SHIFT,
-		page_offset, 64, &page_offset_quantized);
-	if (!page_size) {
-		mlx5_ib_warn(dev, "bad offset\n");
-		goto err_umem;
-	}
-
-	in->pas = kvcalloc(ib_umem_num_dma_blocks(srq->umem, page_size),
-			   sizeof(*in->pas), GFP_KERNEL);
-	if (!in->pas) {
-		err = -ENOMEM;
-		goto err_umem;
-	}
-
-	mlx5_ib_populate_pas(srq->umem, page_size, in->pas, 0);
+	in->umem = srq->umem;
 
 	err = mlx5_ib_db_map_user(ucontext, udata, ucmd.db_addr, &srq->db);
 	if (err) {
 		mlx5_ib_dbg(dev, "map doorbell failed\n");
-		goto err_in;
+		goto err_umem;
 	}
 
-	in->log_page_size = order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT;
-	in->page_offset = page_offset_quantized;
 	in->uid = (in->type != IB_SRQT_XRC) ?  to_mpd(pd)->uid : 0;
 	if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
 	    in->type != IB_SRQT_BASIC)
@@ -117,9 +97,6 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 
 	return 0;
 
-err_in:
-	kvfree(in->pas);
-
 err_umem:
 	ib_umem_release(srq->umem);
 
diff --git a/drivers/infiniband/hw/mlx5/srq.h b/drivers/infiniband/hw/mlx5/srq.h
index 2c3627b2509d..a7e3dc5564ac 100644
--- a/drivers/infiniband/hw/mlx5/srq.h
+++ b/drivers/infiniband/hw/mlx5/srq.h
@@ -28,6 +28,7 @@ struct mlx5_srq_attr {
 	u32 user_index;
 	u64 db_record;
 	__be64 *pas;
+	struct ib_umem *umem;
 	u32 tm_log_list_size;
 	u32 tm_next_tag;
 	u32 tm_hw_phase_cnt;
diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c
index db889ec3fd48..8b3385396599 100644
--- a/drivers/infiniband/hw/mlx5/srq_cmd.c
+++ b/drivers/infiniband/hw/mlx5/srq_cmd.c
@@ -92,6 +92,25 @@ struct mlx5_core_srq *mlx5_cmd_get_srq(struct mlx5_ib_dev *dev, u32 srqn)
 	return srq;
 }
 
+static int __set_srq_page_size(struct mlx5_srq_attr *in,
+			       unsigned long page_size)
+{
+	if (!page_size)
+		return -EINVAL;
+	in->log_page_size = order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT;
+
+	if (WARN_ON(get_pas_size(in) !=
+		    ib_umem_num_dma_blocks(in->umem, page_size) * sizeof(u64)))
+		return -EINVAL;
+	return 0;
+}
+
+#define set_srq_page_size(in, typ, log_pgsz_fld)                               \
+	__set_srq_page_size(in, mlx5_umem_find_best_quantized_pgoff(           \
+					(in)->umem, typ, log_pgsz_fld,         \
+					MLX5_ADAPTER_PAGE_SHIFT, page_offset,  \
+					64, &(in)->page_offset))
+
 static int create_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 			  struct mlx5_srq_attr *in)
 {
@@ -103,6 +122,12 @@ static int create_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 	int inlen;
 	int err;
 
+	if (in->umem) {
+		err = set_srq_page_size(in, srqc, log_page_size);
+		if (err)
+			return err;
+	}
+
 	pas_size  = get_pas_size(in);
 	inlen	  = MLX5_ST_SZ_BYTES(create_srq_in) + pas_size;
 	create_in = kvzalloc(inlen, GFP_KERNEL);
@@ -114,7 +139,13 @@ static int create_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 	pas = MLX5_ADDR_OF(create_srq_in, create_in, pas);
 
 	set_srqc(srqc, in);
-	memcpy(pas, in->pas, pas_size);
+	if (in->umem)
+		mlx5_ib_populate_pas(
+			in->umem,
+			1UL << (in->log_page_size + MLX5_ADAPTER_PAGE_SHIFT),
+			pas, 0);
+	else
+		memcpy(pas, in->pas, pas_size);
 
 	MLX5_SET(create_srq_in, create_in, opcode,
 		 MLX5_CMD_OP_CREATE_SRQ);
@@ -194,6 +225,12 @@ static int create_xrc_srq_cmd(struct mlx5_ib_dev *dev,
 	int inlen;
 	int err;
 
+	if (in->umem) {
+		err = set_srq_page_size(in, xrc_srqc, log_page_size);
+		if (err)
+			return err;
+	}
+
 	pas_size  = get_pas_size(in);
 	inlen	  = MLX5_ST_SZ_BYTES(create_xrc_srq_in) + pas_size;
 	create_in = kvzalloc(inlen, GFP_KERNEL);
@@ -207,7 +244,13 @@ static int create_xrc_srq_cmd(struct mlx5_ib_dev *dev,
 
 	set_srqc(xrc_srqc, in);
 	MLX5_SET(xrc_srqc, xrc_srqc, user_index, in->user_index);
-	memcpy(pas, in->pas, pas_size);
+	if (in->umem)
+		mlx5_ib_populate_pas(
+			in->umem,
+			1UL << (in->log_page_size + MLX5_ADAPTER_PAGE_SHIFT),
+			pas, 0);
+	else
+		memcpy(pas, in->pas, pas_size);
 	MLX5_SET(create_xrc_srq_in, create_in, opcode,
 		 MLX5_CMD_OP_CREATE_XRC_SRQ);
 
@@ -289,11 +332,18 @@ static int create_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 	void *create_in = NULL;
 	void *rmpc;
 	void *wq;
+	void *pas;
 	int pas_size;
 	int outlen;
 	int inlen;
 	int err;
 
+	if (in->umem) {
+		err = set_srq_page_size(in, wq, log_wq_pg_sz);
+		if (err)
+			return err;
+	}
+
 	pas_size = get_pas_size(in);
 	inlen = MLX5_ST_SZ_BYTES(create_rmp_in) + pas_size;
 	outlen = MLX5_ST_SZ_BYTES(create_rmp_out);
@@ -309,8 +359,16 @@ static int create_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 
 	MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
 	MLX5_SET(create_rmp_in, create_in, uid, in->uid);
+	pas = MLX5_ADDR_OF(rmpc, rmpc, wq.pas);
+
 	set_wq(wq, in);
-	memcpy(MLX5_ADDR_OF(rmpc, rmpc, wq.pas), in->pas, pas_size);
+	if (in->umem)
+		mlx5_ib_populate_pas(
+			in->umem,
+			1UL << (in->log_page_size + MLX5_ADAPTER_PAGE_SHIFT),
+			pas, 0);
+	else
+		memcpy(pas, in->pas, pas_size);
 
 	MLX5_SET(create_rmp_in, create_in, opcode, MLX5_CMD_OP_CREATE_RMP);
 	err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out, outlen);
@@ -421,10 +479,17 @@ static int create_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 	void *create_in;
 	void *xrqc;
 	void *wq;
+	void *pas;
 	int pas_size;
 	int inlen;
 	int err;
 
+	if (in->umem) {
+		err = set_srq_page_size(in, wq, log_wq_pg_sz);
+		if (err)
+			return err;
+	}
+
 	pas_size = get_pas_size(in);
 	inlen = MLX5_ST_SZ_BYTES(create_xrq_in) + pas_size;
 	create_in = kvzalloc(inlen, GFP_KERNEL);
@@ -433,9 +498,16 @@ static int create_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 
 	xrqc = MLX5_ADDR_OF(create_xrq_in, create_in, xrq_context);
 	wq = MLX5_ADDR_OF(xrqc, xrqc, wq);
+	pas = MLX5_ADDR_OF(xrqc, xrqc, wq.pas);
 
 	set_wq(wq, in);
-	memcpy(MLX5_ADDR_OF(xrqc, xrqc, wq.pas), in->pas, pas_size);
+	if (in->umem)
+		mlx5_ib_populate_pas(
+			in->umem,
+			1UL << (in->log_page_size + MLX5_ADAPTER_PAGE_SHIFT),
+			pas, 0);
+	else
+		memcpy(pas, in->pas, pas_size);
 
 	if (in->type == IB_SRQT_TM) {
 		MLX5_SET(xrqc, xrqc, topology, MLX5_XRQC_TOPOLOGY_TAG_MATCHING);
-- 
2.26.2


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH rdma-next 3/6] RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for WQ
  2020-10-26 13:26 ` [PATCH rdma-next 3/6] RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for WQ Leon Romanovsky
@ 2020-10-26 14:42   ` Gal Pressman
  2020-10-27  6:03     ` Leon Romanovsky
  0 siblings, 1 reply; 9+ messages in thread
From: Gal Pressman @ 2020-10-26 14:42 UTC (permalink / raw)
  To: Leon Romanovsky, Doug Ledford, Jason Gunthorpe
  Cc: linux-rdma, majd, Matan Barak, Sagi Grimberg, Yishai Hadas

On 26/10/2020 15:26, Leon Romanovsky wrote:
> From: Jason Gunthorpe <jgg@nvidia.com>
> 
> This fixes a subtle bug, the WQ mailbox has only 5 bits to describe the
> page_offset, while mlx5_ib_get_buf_offset() is hard wired to only work
> with 6 bit page_offsets.
> 
> Thus it did not properly reject badly aligned buffers.
> 
> YISHAI: WTF? Why does this PRM command only have 5 bits? We must force 4k
> alignment for WQ umems in the userspace?

You forgot to remove those :).

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH rdma-next 3/6] RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for WQ
  2020-10-26 14:42   ` Gal Pressman
@ 2020-10-27  6:03     ` Leon Romanovsky
  0 siblings, 0 replies; 9+ messages in thread
From: Leon Romanovsky @ 2020-10-27  6:03 UTC (permalink / raw)
  To: Gal Pressman
  Cc: Doug Ledford, Jason Gunthorpe, linux-rdma, majd, Matan Barak,
	Sagi Grimberg, Yishai Hadas

On Mon, Oct 26, 2020 at 04:42:12PM +0200, Gal Pressman wrote:
> On 26/10/2020 15:26, Leon Romanovsky wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> >
> > This fixes a subtle bug, the WQ mailbox has only 5 bits to describe the
> > page_offset, while mlx5_ib_get_buf_offset() is hard wired to only work
> > with 6 bit page_offsets.
> >
> > Thus it did not properly reject badly aligned buffers.
> >
> > YISHAI: WTF? Why does this PRM command only have 5 bits? We must force 4k
> > alignment for WQ umems in the userspace?
>
> You forgot to remove those :).

Yeah, sorry, this is what is happening if you are keeping patches in
submission queue for months :(.

Thanks

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, back to index

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-26 13:26 [PATCH rdma-next 0/6] Use ib_umem_find_best_pgsz() for all umems Leon Romanovsky
2020-10-26 13:26 ` [PATCH rdma-next 1/6] RDMA/mlx5: Use ib_umem_find_best_pgsz() for devx Leon Romanovsky
2020-10-26 13:26 ` [PATCH rdma-next 2/6] RDMA/mlx5: Use ib_umem_find_best_pgoff() for SRQ Leon Romanovsky
2020-10-26 13:26 ` [PATCH rdma-next 3/6] RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for WQ Leon Romanovsky
2020-10-26 14:42   ` Gal Pressman
2020-10-27  6:03     ` Leon Romanovsky
2020-10-26 13:26 ` [PATCH rdma-next 4/6] RDMA/mlx5: Use mlx5_umem_find_best_quantized_pgoff() for QP Leon Romanovsky
2020-10-26 13:26 ` [PATCH rdma-next 5/6] RDMA/mlx5: mlx5_umem_find_best_quantized_pgoff() for CQ Leon Romanovsky
2020-10-26 13:26 ` [PATCH rdma-next 6/6] RDMA/mlx5: Lower setting the umem's PAS for SRQ Leon Romanovsky

Linux-RDMA Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-rdma/0 linux-rdma/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-rdma linux-rdma/ https://lore.kernel.org/linux-rdma \
		linux-rdma@vger.kernel.org
	public-inbox-index linux-rdma

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-rdma


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git