Linux-RDMA Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs
@ 2020-10-26 13:23 Leon Romanovsky
  2020-10-26 13:23 ` [PATCH rdma-next 1/5] RDMA/mlx5: Change mlx5_ib_populate_pas() to use rdma_for_each_block() Leon Romanovsky
                   ` (5 more replies)
  0 siblings, 6 replies; 8+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:23 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: Leon Romanovsky, linux-kernel, linux-rdma

From: Leon Romanovsky <leonro@nvidia.com>

From Jason:

The new common code does a better job finding large page sizes. Use it in
mlx5 for MRs.

This requires moving the MTT population for mailboxes and UMR over to
rdma_for_each_dma_block().

Thanks

Jason Gunthorpe (5):
  RDMA/mlx5: Change mlx5_ib_populate_pas() to use rdma_for_each_block()
  RDMA/mlx5: Move xlt_emergency_page_mutex into mr.c
  RDMA/mlx5: Split the WR setup out of mlx5_ib_update_xlt()
  RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases
  RDMA/mlx5: Use ib_umem_find_best_pgsz() for mkc's

 drivers/infiniband/core/umem.c       |   9 +
 drivers/infiniband/hw/mlx5/cq.c      |   6 +-
 drivers/infiniband/hw/mlx5/devx.c    |   4 +-
 drivers/infiniband/hw/mlx5/main.c    |  26 +-
 drivers/infiniband/hw/mlx5/mem.c     |  73 +-----
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  37 ++-
 drivers/infiniband/hw/mlx5/mr.c      | 364 ++++++++++++++++++---------
 drivers/infiniband/hw/mlx5/qp.c      |   6 +-
 drivers/infiniband/hw/mlx5/srq.c     |   2 +-
 9 files changed, 312 insertions(+), 215 deletions(-)

--
2.26.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH rdma-next 1/5] RDMA/mlx5: Change mlx5_ib_populate_pas() to use rdma_for_each_block()
  2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
@ 2020-10-26 13:23 ` Leon Romanovsky
  2020-10-26 13:23 ` [PATCH rdma-next 2/5] RDMA/mlx5: Move xlt_emergency_page_mutex into mr.c Leon Romanovsky
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:23 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma

From: Jason Gunthorpe <jgg@nvidia.com>

This routine converts the umem SGL into a list of fixed pages for DMA,
which is exactly what rdma_umem_for_each_dma_block() is for, use the
common code directly.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/cq.c      |  6 +++---
 drivers/infiniband/hw/mlx5/devx.c    |  4 ++--
 drivers/infiniband/hw/mlx5/mem.c     | 19 ++++++++++++++-----
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  4 ++--
 drivers/infiniband/hw/mlx5/mr.c      |  7 +++++--
 drivers/infiniband/hw/mlx5/qp.c      |  6 +++---
 drivers/infiniband/hw/mlx5/srq.c     |  2 +-
 7 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 2088e4a3c32d..9ab93d730769 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -762,7 +762,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 	}
 
 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas);
-	mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, pas, 0);
+	mlx5_ib_populate_pas(cq->buf.umem, 1UL << page_shift, pas, 0);
 
 	cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context);
 	MLX5_SET(cqc, cqc, log_page_size,
@@ -1305,8 +1305,8 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
 
 	pas = (__be64 *)MLX5_ADDR_OF(modify_cq_in, in, pas);
 	if (udata)
-		mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift,
-				     pas, 0);
+		mlx5_ib_populate_pas(cq->resize_umem, 1UL << page_shift, pas,
+				     0);
 	else
 		mlx5_fill_page_frag_array(&cq->resize_buf->frag_buf, pas);
 
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index ae889266acf1..611ce21157de 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2115,9 +2115,9 @@ static void devx_umem_reg_cmd_build(struct mlx5_ib_dev *dev,
 	MLX5_SET(umem, umem, log_page_size, obj->page_shift -
 					    MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET(umem, umem, page_offset, obj->page_offset);
-	mlx5_ib_populate_pas(dev, obj->umem, obj->page_shift, mtt,
+	mlx5_ib_populate_pas(obj->umem, 1UL << obj->page_shift, mtt,
 			     (obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) |
-			     MLX5_IB_MTT_READ);
+				     MLX5_IB_MTT_READ);
 }
 
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 7ae96b37bd6e..779c4a040d8b 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -155,13 +155,22 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 	}
 }
 
-void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
-			  int page_shift, __be64 *pas, int access_flags)
+/*
+ * Fill in a physical address list. ib_umem_num_dma_blocks() entries will be
+ * filled in the pas array.
+ */
+void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
+			  u64 access_flags)
 {
-	return __mlx5_ib_populate_pas(dev, umem, page_shift, 0,
-				      ib_umem_num_dma_blocks(umem, PAGE_SIZE),
-				      pas, access_flags);
+	struct ib_block_iter biter;
+
+	rdma_umem_for_each_dma_block (umem, &biter, page_size) {
+		*pas = cpu_to_be64(rdma_block_iter_dma_address(&biter) |
+				   access_flags);
+		pas++;
+	}
 }
+
 int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
 {
 	u64 page_size;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 3e2c471d77bd..b043a178e95b 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1235,8 +1235,8 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
 void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 			    int page_shift, size_t offset, size_t num_pages,
 			    __be64 *pas, int access_flags);
-void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
-			  int page_shift, __be64 *pas, int access_flags);
+void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
+			  u64 access_flags);
 void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
 int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 57d3dc111a2b..b7119722a54a 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1167,7 +1167,10 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
 
 	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	if (populate)
-		inlen += sizeof(*pas) * roundup(ib_umem_num_pages(umem), 2);
+		inlen +=
+			sizeof(*pas) *
+			roundup(ib_umem_num_dma_blocks(umem, 1UL << page_shift),
+				2);
 	in = kvzalloc(inlen, GFP_KERNEL);
 	if (!in) {
 		err = -ENOMEM;
@@ -1179,7 +1182,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
 			err = -EINVAL;
 			goto err_2;
 		}
-		mlx5_ib_populate_pas(dev, umem, page_shift, pas,
+		mlx5_ib_populate_pas(umem, 1ULL << page_shift, pas,
 				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
 	}
 
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 6915639a776f..042177f33252 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -971,7 +971,7 @@ static int _create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	MLX5_SET(create_qp_in, *in, uid, uid);
 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas);
 	if (ubuffer->umem)
-		mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, pas, 0);
+		mlx5_ib_populate_pas(ubuffer->umem, 1UL << page_shift, pas, 0);
 
 	qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc);
 
@@ -1251,7 +1251,7 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
 	MLX5_SET(wq, wq, page_offset, offset);
 
 	pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
-	mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0);
+	mlx5_ib_populate_pas(sq->ubuffer.umem, 1UL << page_shift, pas, 0);
 
 	err = mlx5_core_create_sq_tracked(dev, in, inlen, &sq->base.mqp);
 
@@ -4881,7 +4881,7 @@ static int  create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
 		MLX5_SET(rqc, rqc, delay_drop_en, 1);
 	}
 	rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
-	mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0);
+	mlx5_ib_populate_pas(rwq->umem, 1UL << rwq->page_shift, rq_pas0, 0);
 	err = mlx5_core_create_rq_tracked(dev, in, inlen, &rwq->core_qp);
 	if (!err && init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) {
 		err = set_delay_drop(dev);
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 239c7ec65e11..dd6e42d3d175 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -100,7 +100,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 		goto err_umem;
 	}
 
-	mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0);
+	mlx5_ib_populate_pas(srq->umem, 1UL << page_shift, in->pas, 0);
 
 	err = mlx5_ib_db_map_user(ucontext, udata, ucmd.db_addr, &srq->db);
 	if (err) {
-- 
2.26.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH rdma-next 2/5] RDMA/mlx5: Move xlt_emergency_page_mutex into mr.c
  2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
  2020-10-26 13:23 ` [PATCH rdma-next 1/5] RDMA/mlx5: Change mlx5_ib_populate_pas() to use rdma_for_each_block() Leon Romanovsky
@ 2020-10-26 13:23 ` Leon Romanovsky
  2020-10-26 13:23 ` [PATCH rdma-next 3/5] RDMA/mlx5: Split the WR setup out of mlx5_ib_update_xlt() Leon Romanovsky
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:23 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma

From: Jason Gunthorpe <jgg@nvidia.com>

This is the only user, so remove the wrappers.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/main.c    | 20 --------------------
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  3 +--
 drivers/infiniband/hw/mlx5/mr.c      | 18 ++++++++++++++++++
 3 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index bca57c7661eb..b7eb977eb869 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -75,12 +75,6 @@ static LIST_HEAD(mlx5_ib_dev_list);
  */
 static DEFINE_MUTEX(mlx5_ib_multiport_mutex);
 
-/* We can't use an array for xlt_emergency_page because dma_map_single
- * doesn't work on kernel modules memory
- */
-static unsigned long xlt_emergency_page;
-static struct mutex xlt_emergency_page_mutex;
-
 struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi)
 {
 	struct mlx5_ib_dev *dev;
@@ -4877,17 +4871,6 @@ static struct mlx5_interface mlx5_ib_interface = {
 	.protocol	= MLX5_INTERFACE_PROTOCOL_IB,
 };
 
-unsigned long mlx5_ib_get_xlt_emergency_page(void)
-{
-	mutex_lock(&xlt_emergency_page_mutex);
-	return xlt_emergency_page;
-}
-
-void mlx5_ib_put_xlt_emergency_page(void)
-{
-	mutex_unlock(&xlt_emergency_page_mutex);
-}
-
 static int __init mlx5_ib_init(void)
 {
 	int err;
@@ -4896,8 +4879,6 @@ static int __init mlx5_ib_init(void)
 	if (!xlt_emergency_page)
 		return -ENOMEM;
 
-	mutex_init(&xlt_emergency_page_mutex);
-
 	mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0);
 	if (!mlx5_ib_event_wq) {
 		free_page(xlt_emergency_page);
@@ -4915,7 +4896,6 @@ static void __exit mlx5_ib_cleanup(void)
 {
 	mlx5_unregister_interface(&mlx5_ib_interface);
 	destroy_workqueue(mlx5_ib_event_wq);
-	mutex_destroy(&xlt_emergency_page_mutex);
 	free_page(xlt_emergency_page);
 }
 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index b043a178e95b..8f728b98f9a6 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1454,8 +1454,7 @@ static inline int get_num_static_uars(struct mlx5_ib_dev *dev,
 	return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages;
 }
 
-unsigned long mlx5_ib_get_xlt_emergency_page(void);
-void mlx5_ib_put_xlt_emergency_page(void);
+extern unsigned long xlt_emergency_page;
 
 int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
 			struct mlx5_bfreg_info *bfregi, u32 bfregn,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index b7119722a54a..2971e7f48d40 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -41,6 +41,13 @@
 #include <rdma/ib_verbs.h>
 #include "mlx5_ib.h"
 
+/*
+ * We can't use an array for xlt_emergency_page because dma_map_single doesn't
+ * work on kernel modules memory
+ */
+unsigned long xlt_emergency_page;
+static DEFINE_MUTEX(xlt_emergency_page_mutex);
+
 enum {
 	MAX_PENDING_REG_MR = 8,
 };
@@ -992,6 +999,17 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
 			    MLX5_UMR_MTT_ALIGNMENT)
 #define MLX5_SPARE_UMR_CHUNK 0x10000
 
+static unsigned long mlx5_ib_get_xlt_emergency_page(void)
+{
+	mutex_lock(&xlt_emergency_page_mutex);
+	return xlt_emergency_page;
+}
+
+static void mlx5_ib_put_xlt_emergency_page(void)
+{
+	mutex_unlock(&xlt_emergency_page_mutex);
+}
+
 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		       int page_shift, int flags)
 {
-- 
2.26.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH rdma-next 3/5] RDMA/mlx5: Split the WR setup out of mlx5_ib_update_xlt()
  2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
  2020-10-26 13:23 ` [PATCH rdma-next 1/5] RDMA/mlx5: Change mlx5_ib_populate_pas() to use rdma_for_each_block() Leon Romanovsky
  2020-10-26 13:23 ` [PATCH rdma-next 2/5] RDMA/mlx5: Move xlt_emergency_page_mutex into mr.c Leon Romanovsky
@ 2020-10-26 13:23 ` Leon Romanovsky
  2020-10-26 13:23 ` [PATCH rdma-next 4/5] RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases Leon Romanovsky
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:23 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma

From: Jason Gunthorpe <jgg@nvidia.com>

The memory allocation is quite complicated, and makes this function hard
to understand. Refactor things so that a function call sets up the WR, SG,
DMA mapping and buffer, further splitting that into buffer and DMA/wr.

This also slightly changes the buffer allocation logic to try an order 0
page allocation (with OOM warnings on) before going to the emergency page.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/main.c    |   6 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |   2 +-
 drivers/infiniband/hw/mlx5/mr.c      | 189 ++++++++++++++++++---------
 3 files changed, 128 insertions(+), 69 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index b7eb977eb869..32ec59315f39 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4875,13 +4875,13 @@ static int __init mlx5_ib_init(void)
 {
 	int err;
 
-	xlt_emergency_page = __get_free_page(GFP_KERNEL);
+	xlt_emergency_page = (void *)__get_free_page(GFP_KERNEL);
 	if (!xlt_emergency_page)
 		return -ENOMEM;
 
 	mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0);
 	if (!mlx5_ib_event_wq) {
-		free_page(xlt_emergency_page);
+		free_page((unsigned long)xlt_emergency_page);
 		return -ENOMEM;
 	}
 
@@ -4896,7 +4896,7 @@ static void __exit mlx5_ib_cleanup(void)
 {
 	mlx5_unregister_interface(&mlx5_ib_interface);
 	destroy_workqueue(mlx5_ib_event_wq);
-	free_page(xlt_emergency_page);
+	free_page((unsigned long)xlt_emergency_page);
 }
 
 module_init(mlx5_ib_init);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 8f728b98f9a6..d92afbd26aa5 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1454,7 +1454,7 @@ static inline int get_num_static_uars(struct mlx5_ib_dev *dev,
 	return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages;
 }
 
-extern unsigned long xlt_emergency_page;
+extern void *xlt_emergency_page;
 
 int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
 			struct mlx5_bfreg_info *bfregi, u32 bfregn,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 2971e7f48d40..b2ec4abc5639 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -45,7 +45,7 @@
  * We can't use an array for xlt_emergency_page because dma_map_single doesn't
  * work on kernel modules memory
  */
-unsigned long xlt_emergency_page;
+void *xlt_emergency_page;
 static DEFINE_MUTEX(xlt_emergency_page_mutex);
 
 enum {
@@ -999,15 +999,121 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
 			    MLX5_UMR_MTT_ALIGNMENT)
 #define MLX5_SPARE_UMR_CHUNK 0x10000
 
-static unsigned long mlx5_ib_get_xlt_emergency_page(void)
+/*
+ * Allocate a temporary buffer to hold the per-page information to transfer to
+ * HW. For efficiency this should be as large as it can be, but buffer
+ * allocation failure is not allowed, so try smaller sizes.
+ */
+static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
 {
+	const size_t xlt_chunk_align =
+		MLX5_UMR_MTT_ALIGNMENT / sizeof(ent_size);
+	size_t size;
+	void *res = NULL;
+
+	static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0);
+
+	/*
+	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the
+	 * allocation can't trigger any kind of reclaim.
+	 */
+	might_sleep();
+
+	gfp_mask |= __GFP_ZERO;
+
+	/*
+	 * If the system already has a suitable high order page then just use
+	 * that, but don't try hard to create one. This max is about 1M, so a
+	 * free x86 huge page will satisfy it.
+	 */
+	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
+		     MLX5_MAX_UMR_CHUNK);
+	*nents = size / ent_size;
+	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
+				       get_order(size));
+	if (res)
+		return res;
+
+	if (size > MLX5_SPARE_UMR_CHUNK) {
+		size = MLX5_SPARE_UMR_CHUNK;
+		*nents = get_order(size) / ent_size;
+		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
+					       get_order(size));
+		if (res)
+			return res;
+	}
+
+	*nents = PAGE_SIZE / ent_size;
+	res = (void *)__get_free_page(gfp_mask);
+	if (res)
+		return res;
+
 	mutex_lock(&xlt_emergency_page_mutex);
+	memset(xlt_emergency_page, 0, PAGE_SIZE);
 	return xlt_emergency_page;
 }
 
-static void mlx5_ib_put_xlt_emergency_page(void)
+static void mlx5_ib_free_xlt(void *xlt, size_t length)
+{
+	if (xlt == xlt_emergency_page) {
+		mutex_unlock(&xlt_emergency_page_mutex);
+		return;
+	}
+
+	free_pages((unsigned long)xlt, get_order(length));
+}
+
+/*
+ * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for
+ * submission.
+ */
+static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr,
+				   struct mlx5_umr_wr *wr, struct ib_sge *sg,
+				   size_t nents, size_t ent_size,
+				   unsigned int flags)
+{
+	struct mlx5_ib_dev *dev = mr->dev;
+	struct device *ddev = dev->ib_dev.dev.parent;
+	dma_addr_t dma;
+	void *xlt;
+
+	xlt = mlx5_ib_alloc_xlt(&nents, ent_size,
+				flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
+								 GFP_KERNEL);
+	sg->length = nents * ent_size;
+	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
+	if (dma_mapping_error(ddev, dma)) {
+		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
+		mlx5_ib_free_xlt(xlt, sg->length);
+		return NULL;
+	}
+	sg->addr = dma;
+	sg->lkey = dev->umrc.pd->local_dma_lkey;
+
+	memset(wr, 0, sizeof(*wr));
+	wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
+	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
+		wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
+	wr->wr.sg_list = sg;
+	wr->wr.num_sge = 1;
+	wr->wr.opcode = MLX5_IB_WR_UMR;
+	wr->pd = mr->ibmr.pd;
+	wr->mkey = mr->mmkey.key;
+	wr->length = mr->mmkey.size;
+	wr->virt_addr = mr->mmkey.iova;
+	wr->access_flags = mr->access_flags;
+	wr->page_shift = mr->page_shift;
+	wr->xlt_size = sg->length;
+	return xlt;
+}
+
+static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
+				   struct ib_sge *sg)
 {
-	mutex_unlock(&xlt_emergency_page_mutex);
+	struct device *ddev = dev->ib_dev.dev.parent;
+
+	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
+	mlx5_ib_free_xlt(xlt, sg->length);
 }
 
 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
@@ -1015,9 +1121,7 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 {
 	struct mlx5_ib_dev *dev = mr->dev;
 	struct device *ddev = dev->ib_dev.dev.parent;
-	int size;
 	void *xlt;
-	dma_addr_t dma;
 	struct mlx5_umr_wr wr;
 	struct ib_sge sg;
 	int err = 0;
@@ -1028,10 +1132,9 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 	const int page_mask = page_align - 1;
 	size_t pages_mapped = 0;
 	size_t pages_to_map = 0;
-	size_t pages_iter = 0;
+	size_t pages_iter;
 	size_t size_to_map = 0;
-	gfp_t gfp;
-	bool use_emergency_page = false;
+	size_t orig_sg_length;
 
 	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
 	    !umr_can_use_indirect_mkey(dev))
@@ -1044,37 +1147,13 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		npages += idx & page_mask;
 		idx &= ~page_mask;
 	}
-
-	gfp = flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : GFP_KERNEL;
-	gfp |= __GFP_ZERO | __GFP_NOWARN;
-
 	pages_to_map = ALIGN(npages, page_align);
-	size = desc_size * pages_to_map;
-	size = min_t(int, size, MLX5_MAX_UMR_CHUNK);
-
-	xlt = (void *)__get_free_pages(gfp, get_order(size));
-	if (!xlt && size > MLX5_SPARE_UMR_CHUNK) {
-		mlx5_ib_dbg(dev, "Failed to allocate %d bytes of order %d. fallback to spare UMR allocation od %d bytes\n",
-			    size, get_order(size), MLX5_SPARE_UMR_CHUNK);
 
-		size = MLX5_SPARE_UMR_CHUNK;
-		xlt = (void *)__get_free_pages(gfp, get_order(size));
-	}
-
-	if (!xlt) {
-		mlx5_ib_warn(dev, "Using XLT emergency buffer\n");
-		xlt = (void *)mlx5_ib_get_xlt_emergency_page();
-		size = PAGE_SIZE;
-		memset(xlt, 0, size);
-		use_emergency_page = true;
-	}
-	pages_iter = size / desc_size;
-	dma = dma_map_single(ddev, xlt, size, DMA_TO_DEVICE);
-	if (dma_mapping_error(ddev, dma)) {
-		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
-		err = -ENOMEM;
-		goto free_xlt;
-	}
+	xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags);
+	if (!xlt)
+		return -ENOMEM;
+	pages_iter = sg.length / desc_size;
+	orig_sg_length = sg.length;
 
 	if (mr->umem->is_odp) {
 		if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
@@ -1085,22 +1164,6 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		}
 	}
 
-	sg.addr = dma;
-	sg.lkey = dev->umrc.pd->local_dma_lkey;
-
-	memset(&wr, 0, sizeof(wr));
-	wr.wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
-	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
-		wr.wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
-	wr.wr.sg_list = &sg;
-	wr.wr.num_sge = 1;
-	wr.wr.opcode = MLX5_IB_WR_UMR;
-
-	wr.pd = mr->ibmr.pd;
-	wr.mkey = mr->mmkey.key;
-	wr.length = mr->mmkey.size;
-	wr.virt_addr = mr->mmkey.iova;
-	wr.access_flags = mr->access_flags;
 	wr.page_shift = page_shift;
 
 	for (pages_mapped = 0;
@@ -1108,7 +1171,8 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 	     pages_mapped += pages_iter, idx += pages_iter) {
 		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
 		size_to_map = npages * desc_size;
-		dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
+		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
+					DMA_TO_DEVICE);
 		if (mr->umem->is_odp) {
 			mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
 		} else {
@@ -1118,9 +1182,10 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 			/* Clear padding after the pages
 			 * brought from the umem.
 			 */
-			memset(xlt + size_to_map, 0, size - size_to_map);
+			memset(xlt + size_to_map, 0, sg.length - size_to_map);
 		}
-		dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
+		dma_sync_single_for_device(ddev, sg.addr, sg.length,
+					   DMA_TO_DEVICE);
 
 		sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
 
@@ -1144,14 +1209,8 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 
 		err = mlx5_ib_post_send_wait(dev, &wr);
 	}
-	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
-
-free_xlt:
-	if (use_emergency_page)
-		mlx5_ib_put_xlt_emergency_page();
-	else
-		free_pages((unsigned long)xlt, get_order(size));
-
+	sg.length = orig_sg_length;
+	mlx5_ib_unmap_free_xlt(dev, xlt, &sg);
 	return err;
 }
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH rdma-next 4/5] RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases
  2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
                   ` (2 preceding siblings ...)
  2020-10-26 13:23 ` [PATCH rdma-next 3/5] RDMA/mlx5: Split the WR setup out of mlx5_ib_update_xlt() Leon Romanovsky
@ 2020-10-26 13:23 ` Leon Romanovsky
  2020-10-29 12:33   ` Jason Gunthorpe
  2020-10-26 13:23 ` [PATCH rdma-next 5/5] RDMA/mlx5: Use ib_umem_find_best_pgsz() for mkc's Leon Romanovsky
  2020-11-02 19:11 ` [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Jason Gunthorpe
  5 siblings, 1 reply; 8+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:23 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma

From: Jason Gunthorpe <jgg@nvidia.com>

Mixing these together is just a mess, make a dedicated version,
mlx5_ib_update_mr_pas(), which directly loads the whole MTT for a non-ODP
MR.

The split out version can trivially use a simple loop with
rdma_for_each_block() which allows using the core code to compute the MR
pages and avoids seeking in the SGL list after each chunk as the
__mlx5_ib_populate_pas() call required.

Significantly speeds loading large MTTs.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mem.c     |  64 -------------
 drivers/infiniband/hw/mlx5/mlx5_ib.h |   3 -
 drivers/infiniband/hw/mlx5/mr.c      | 137 +++++++++++++++++++--------
 3 files changed, 97 insertions(+), 107 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 779c4a040d8b..92e7621ec858 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -91,70 +91,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
 	*shift = PAGE_SHIFT + m;
 }
 
-/*
- * Populate the given array with bus addresses from the umem.
- *
- * dev - mlx5_ib device
- * umem - umem to use to fill the pages
- * page_shift - determines the page size used in the resulting array
- * offset - offset into the umem to start from,
- *          only implemented for ODP umems
- * num_pages - total number of pages to fill
- * pas - bus addresses array to fill
- * access_flags - access flags to set on all present pages.
-		  use enum mlx5_ib_mtt_access_flags for this.
- */
-void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
-			    int page_shift, size_t offset, size_t num_pages,
-			    __be64 *pas, int access_flags)
-{
-	int shift = page_shift - PAGE_SHIFT;
-	int mask = (1 << shift) - 1;
-	int i, k, idx;
-	u64 cur = 0;
-	u64 base;
-	int len;
-	struct scatterlist *sg;
-	int entry;
-
-	i = 0;
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> PAGE_SHIFT;
-		base = sg_dma_address(sg);
-
-		/* Skip elements below offset */
-		if (i + len < offset << shift) {
-			i += len;
-			continue;
-		}
-
-		/* Skip pages below offset */
-		if (i < offset << shift) {
-			k = (offset << shift) - i;
-			i = offset << shift;
-		} else {
-			k = 0;
-		}
-
-		for (; k < len; k++) {
-			if (!(i & mask)) {
-				cur = base + (k << PAGE_SHIFT);
-				cur |= access_flags;
-				idx = (i >> shift) - offset;
-
-				pas[idx] = cpu_to_be64(cur);
-				mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n",
-					    i >> shift, be64_to_cpu(pas[idx]));
-			}
-			i++;
-
-			/* Stop after num_pages reached */
-			if (i >> shift >= offset + num_pages)
-				return;
-		}
-	}
-}
-
 /*
  * Fill in a physical address list. ib_umem_num_dma_blocks() entries will be
  * filled in the pas array.
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index d92afbd26aa5..aadd43425a58 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1232,9 +1232,6 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
 void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
 			unsigned long max_page_shift,
 			int *shift);
-void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
-			    int page_shift, size_t offset, size_t num_pages,
-			    __be64 *pas, int access_flags);
 void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
 			  u64 access_flags);
 void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index b2ec4abc5639..10f13acc88c9 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1116,6 +1116,21 @@ static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
 	mlx5_ib_free_xlt(xlt, sg->length);
 }
 
+static unsigned int xlt_wr_final_send_flags(unsigned int flags)
+{
+	unsigned int res = 0;
+
+	if (flags & MLX5_IB_UPD_XLT_ENABLE)
+		res |= MLX5_IB_SEND_UMR_ENABLE_MR |
+		       MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
+		       MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
+	if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS)
+		res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
+	if (flags & MLX5_IB_UPD_XLT_ADDR)
+		res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
+	return res;
+}
+
 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		       int page_shift, int flags)
 {
@@ -1140,6 +1155,9 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 	    !umr_can_use_indirect_mkey(dev))
 		return -EPERM;
 
+	if (WARN_ON(!mr->umem->is_odp))
+		return -EINVAL;
+
 	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
 	 * so we need to align the offset and length accordingly
 	 */
@@ -1155,13 +1173,11 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 	pages_iter = sg.length / desc_size;
 	orig_sg_length = sg.length;
 
-	if (mr->umem->is_odp) {
-		if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
-			struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
-			size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
+	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
+		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
 
-			pages_to_map = min_t(size_t, pages_to_map, max_pages);
-		}
+		pages_to_map = min_t(size_t, pages_to_map, max_pages);
 	}
 
 	wr.page_shift = page_shift;
@@ -1173,36 +1189,14 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		size_to_map = npages * desc_size;
 		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
 					DMA_TO_DEVICE);
-		if (mr->umem->is_odp) {
-			mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
-		} else {
-			__mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx,
-					       npages, xlt,
-					       MLX5_IB_MTT_PRESENT);
-			/* Clear padding after the pages
-			 * brought from the umem.
-			 */
-			memset(xlt + size_to_map, 0, sg.length - size_to_map);
-		}
+		mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
 		dma_sync_single_for_device(ddev, sg.addr, sg.length,
 					   DMA_TO_DEVICE);
 
 		sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
 
-		if (pages_mapped + pages_iter >= pages_to_map) {
-			if (flags & MLX5_IB_UPD_XLT_ENABLE)
-				wr.wr.send_flags |=
-					MLX5_IB_SEND_UMR_ENABLE_MR |
-					MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
-					MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
-			if (flags & MLX5_IB_UPD_XLT_PD ||
-			    flags & MLX5_IB_UPD_XLT_ACCESS)
-				wr.wr.send_flags |=
-					MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
-			if (flags & MLX5_IB_UPD_XLT_ADDR)
-				wr.wr.send_flags |=
-					MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
-		}
+		if (pages_mapped + pages_iter >= pages_to_map)
+			wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
 
 		wr.offset = idx * desc_size;
 		wr.xlt_size = sg.length;
@@ -1214,6 +1208,69 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 	return err;
 }
 
+/*
+ * Send the DMA list to the HW for a normal MR using UMR.
+ */
+static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+{
+	struct mlx5_ib_dev *dev = mr->dev;
+	struct device *ddev = dev->ib_dev.dev.parent;
+	struct ib_block_iter biter;
+	struct mlx5_mtt *cur_mtt;
+	struct mlx5_umr_wr wr;
+	size_t orig_sg_length;
+	struct mlx5_mtt *mtt;
+	size_t final_size;
+	struct ib_sge sg;
+	int err = 0;
+
+	if (WARN_ON(mr->umem->is_odp))
+		return -EINVAL;
+
+	mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg,
+				    ib_umem_num_dma_blocks(mr->umem,
+							   1 << mr->page_shift),
+				    sizeof(*mtt), flags);
+	if (!mtt)
+		return -ENOMEM;
+	orig_sg_length = sg.length;
+
+	cur_mtt = mtt;
+	rdma_for_each_block (mr->umem->sg_head.sgl, &biter, mr->umem->nmap,
+			     BIT(mr->page_shift)) {
+		if (cur_mtt == (void *)mtt + sg.length) {
+			dma_sync_single_for_device(ddev, sg.addr, sg.length,
+						   DMA_TO_DEVICE);
+			err = mlx5_ib_post_send_wait(dev, &wr);
+			if (err)
+				goto err;
+			dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
+						DMA_TO_DEVICE);
+			wr.offset += sg.length;
+			cur_mtt = mtt;
+		}
+
+		cur_mtt->ptag =
+			cpu_to_be64(rdma_block_iter_dma_address(&biter) |
+				    MLX5_IB_MTT_PRESENT);
+		cur_mtt++;
+	}
+
+	final_size = (void *)cur_mtt - (void *)mtt;
+	sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT);
+	memset(cur_mtt, 0, sg.length - final_size);
+	wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
+	wr.xlt_size = sg.length;
+
+	dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
+	err = mlx5_ib_post_send_wait(dev, &wr);
+
+err:
+	sg.length = orig_sg_length;
+	mlx5_ib_unmap_free_xlt(dev, mtt, &sg);
+	return err;
+}
+
 /*
  * If ibmr is NULL it will be allocated by reg_create.
  * Else, the given ibmr will be used.
@@ -1483,10 +1540,14 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		 */
 		int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
 
-		err = mlx5_ib_update_xlt(
-			mr, 0,
-			ib_umem_num_dma_blocks(umem, 1UL << mr->page_shift),
-			mr->page_shift, update_xlt_flags);
+		if (is_odp_mr(mr))
+			err = mlx5_ib_update_xlt(
+				mr, 0,
+				ib_umem_num_dma_blocks(umem,
+						       1UL << mr->page_shift),
+				mr->page_shift, update_xlt_flags);
+		else
+			err = mlx5_ib_update_mr_pas(mr, update_xlt_flags);
 		if (err) {
 			dereg_mr(dev, mr);
 			return ERR_PTR(err);
@@ -1652,11 +1713,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 				upd_flags |= MLX5_IB_UPD_XLT_PD;
 			if (flags & IB_MR_REREG_ACCESS)
 				upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
-			err = mlx5_ib_update_xlt(
-				mr, 0,
-				ib_umem_num_dma_blocks(mr->umem,
-						       1UL << mr->page_shift),
-				mr->page_shift, upd_flags);
+			err = mlx5_ib_update_mr_pas(mr, upd_flags);
 		} else {
 			err = rereg_umr(pd, mr, access_flags, flags);
 		}
-- 
2.26.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH rdma-next 5/5] RDMA/mlx5: Use ib_umem_find_best_pgsz() for mkc's
  2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
                   ` (3 preceding siblings ...)
  2020-10-26 13:23 ` [PATCH rdma-next 4/5] RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases Leon Romanovsky
@ 2020-10-26 13:23 ` Leon Romanovsky
  2020-11-02 19:11 ` [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Jason Gunthorpe
  5 siblings, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:23 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma

From: Jason Gunthorpe <jgg@nvidia.com>

Now that all the PAS arrays or UMR XLT's for mkcs are filled using
rdma_for_each_block() we can use the common ib_umem_find_best_pgsz()
algorithm.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/umem.c       |  9 +++++++
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 27 ++++++++++++++++++++
 drivers/infiniband/hw/mlx5/mr.c      | 37 +++++++++++++++-------------
 3 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index e9fecbdf391b..f1fc7e39c782 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -84,6 +84,15 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
 	dma_addr_t mask;
 	int i;
 
+	if (umem->is_odp) {
+		unsigned int page_size = BIT(to_ib_umem_odp(umem)->page_shift);
+
+		/* ODP must always be self consistent. */
+		if (!(pgsz_bitmap & page_size))
+			return 0;
+		return page_size;
+	}
+
 	/* rdma_for_each_block() has a bug if the page size is smaller than the
 	 * page size used to build the umem. For now prevent smaller page sizes
 	 * from being returned.
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index aadd43425a58..bb44080170be 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -42,6 +42,33 @@
 
 #define MLX5_MKEY_PAGE_SHIFT_MASK __mlx5_mask(mkc, log_page_size)
 
+static __always_inline unsigned long
+__mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits,
+			       unsigned int pgsz_shift)
+{
+	unsigned int largest_pg_shift =
+		min_t(unsigned long, (1ULL << log_pgsz_bits) - 1 + pgsz_shift,
+		      BITS_PER_LONG - 1);
+
+	/*
+	 * Despite a command allowing it, the device does not support lower than
+	 * 4k page size.
+	 */
+	pgsz_shift = max_t(unsigned int, MLX5_ADAPTER_PAGE_SHIFT, pgsz_shift);
+	return GENMASK(largest_pg_shift, pgsz_shift);
+}
+
+/*
+ * For mkc users, instead of a page_offset the command has a start_iova which
+ * specifies both the page_offset and the on-the-wire IOVA
+ */
+#define mlx5_umem_find_best_pgsz(umem, typ, log_pgsz_fld, pgsz_shift, iova)    \
+	ib_umem_find_best_pgsz(umem,                                           \
+			       __mlx5_log_page_size_to_bitmap(                 \
+				       __mlx5_bit_sz(typ, log_pgsz_fld),       \
+				       pgsz_shift),                            \
+			       iova)
+
 enum {
 	MLX5_IB_MMAP_OFFSET_START = 9,
 	MLX5_IB_MMAP_OFFSET_END = 255,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 10f13acc88c9..660b721df64d 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -964,11 +964,13 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_cache_ent *ent;
 	struct mlx5_ib_mr *mr;
-	int page_shift;
+	unsigned int page_size;
 
-	mlx5_ib_cont_pages(umem, iova, MLX5_MKEY_PAGE_SHIFT_MASK, &page_shift);
-	ent = mr_cache_ent_from_order(dev, order_base_2(ib_umem_num_dma_blocks(
-						   umem, 1UL << page_shift)));
+	page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova);
+	if (WARN_ON(!page_size))
+		return ERR_PTR(-EINVAL);
+	ent = mr_cache_ent_from_order(
+		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
 	if (!ent)
 		return ERR_PTR(-E2BIG);
 
@@ -990,7 +992,7 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
 	mr->mmkey.iova = iova;
 	mr->mmkey.size = umem->length;
 	mr->mmkey.pd = to_mpd(pd)->pdn;
-	mr->page_shift = page_shift;
+	mr->page_shift = order_base_2(page_size);
 
 	return mr;
 }
@@ -1280,8 +1282,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
 				     int access_flags, bool populate)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	unsigned int page_size;
 	struct mlx5_ib_mr *mr;
-	int page_shift;
 	__be64 *pas;
 	void *mkc;
 	int inlen;
@@ -1289,22 +1291,23 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
 	int err;
 	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
 
+	page_size =
+		mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova);
+	if (WARN_ON(!page_size))
+		return ERR_PTR(-EINVAL);
+
 	mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	mlx5_ib_cont_pages(umem, iova, MLX5_MKEY_PAGE_SHIFT_MASK, &page_shift);
-
-	mr->page_shift = page_shift;
 	mr->ibmr.pd = pd;
 	mr->access_flags = access_flags;
+	mr->page_shift = order_base_2(page_size);
 
 	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	if (populate)
-		inlen +=
-			sizeof(*pas) *
-			roundup(ib_umem_num_dma_blocks(umem, 1UL << page_shift),
-				2);
+		inlen += sizeof(*pas) *
+			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
 	in = kvzalloc(inlen, GFP_KERNEL);
 	if (!in) {
 		err = -ENOMEM;
@@ -1316,7 +1319,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
 			err = -EINVAL;
 			goto err_2;
 		}
-		mlx5_ib_populate_pas(umem, 1ULL << page_shift, pas,
+		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
 				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
 	}
 
@@ -1334,11 +1337,11 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
 	MLX5_SET64(mkc, mkc, len, umem->length);
 	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
 	MLX5_SET(mkc, mkc, translations_octword_size,
-		 get_octo_len(iova, umem->length, page_shift));
-	MLX5_SET(mkc, mkc, log_page_size, page_shift);
+		 get_octo_len(iova, umem->length, mr->page_shift));
+	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
 	if (populate) {
 		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
-			 get_octo_len(iova, umem->length, page_shift));
+			 get_octo_len(iova, umem->length, mr->page_shift));
 	}
 
 	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
-- 
2.26.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH rdma-next 4/5] RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases
  2020-10-26 13:23 ` [PATCH rdma-next 4/5] RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases Leon Romanovsky
@ 2020-10-29 12:33   ` Jason Gunthorpe
  0 siblings, 0 replies; 8+ messages in thread
From: Jason Gunthorpe @ 2020-10-29 12:33 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: Doug Ledford, linux-rdma

On Mon, Oct 26, 2020 at 03:23:13PM +0200, Leon Romanovsky wrote:
> @@ -1483,10 +1540,14 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
>  		 */
>  		int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
>  
> -		err = mlx5_ib_update_xlt(
> -			mr, 0,
> -			ib_umem_num_dma_blocks(umem, 1UL << mr->page_shift),
> -			mr->page_shift, update_xlt_flags);
> +		if (is_odp_mr(mr))
> +			err = mlx5_ib_update_xlt(
> +				mr, 0,
> +				ib_umem_num_dma_blocks(umem,
> +						       1UL << mr->page_shift),
> +				mr->page_shift, update_xlt_flags);
> +		else
> +			err = mlx5_ib_update_mr_pas(mr, update_xlt_flags);

This rebase looks a bit weird, this whole block is !ODP already, so
why is there an 'if is_odp_mr()' ?

Should just be this:

	if (xlt_with_umr && !(access_flags & IB_ACCESS_ON_DEMAND)) {
		/*
		 * If the MR was created with reg_create then it will be
		 * configured properly but left disabled. It is safe to go ahead
		 * and configure it again via UMR while enabling it.
		 */
		err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
		if (err) {
			dereg_mr(dev, mr);
			return ERR_PTR(err);
		}
	}

Jason

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs
  2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
                   ` (4 preceding siblings ...)
  2020-10-26 13:23 ` [PATCH rdma-next 5/5] RDMA/mlx5: Use ib_umem_find_best_pgsz() for mkc's Leon Romanovsky
@ 2020-11-02 19:11 ` Jason Gunthorpe
  5 siblings, 0 replies; 8+ messages in thread
From: Jason Gunthorpe @ 2020-11-02 19:11 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: Doug Ledford, Leon Romanovsky, linux-kernel, linux-rdma

On Mon, Oct 26, 2020 at 03:23:09PM +0200, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> >From Jason:
> 
> The new common code does a better job finding large page sizes. Use it in
> mlx5 for MRs.
> 
> This requires moving the MTT population for mailboxes and UMR over to
> rdma_for_each_dma_block().
> 
> Thanks
> 
> Jason Gunthorpe (5):
>   RDMA/mlx5: Change mlx5_ib_populate_pas() to use rdma_for_each_block()
>   RDMA/mlx5: Move xlt_emergency_page_mutex into mr.c
>   RDMA/mlx5: Split the WR setup out of mlx5_ib_update_xlt()
>   RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases
>   RDMA/mlx5: Use ib_umem_find_best_pgsz() for mkc's

Applied to for-next, with the updated hunk

Thanks,
Jason

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, back to index

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
2020-10-26 13:23 ` [PATCH rdma-next 1/5] RDMA/mlx5: Change mlx5_ib_populate_pas() to use rdma_for_each_block() Leon Romanovsky
2020-10-26 13:23 ` [PATCH rdma-next 2/5] RDMA/mlx5: Move xlt_emergency_page_mutex into mr.c Leon Romanovsky
2020-10-26 13:23 ` [PATCH rdma-next 3/5] RDMA/mlx5: Split the WR setup out of mlx5_ib_update_xlt() Leon Romanovsky
2020-10-26 13:23 ` [PATCH rdma-next 4/5] RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases Leon Romanovsky
2020-10-29 12:33   ` Jason Gunthorpe
2020-10-26 13:23 ` [PATCH rdma-next 5/5] RDMA/mlx5: Use ib_umem_find_best_pgsz() for mkc's Leon Romanovsky
2020-11-02 19:11 ` [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Jason Gunthorpe

Linux-RDMA Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-rdma/0 linux-rdma/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-rdma linux-rdma/ https://lore.kernel.org/linux-rdma \
		linux-rdma@vger.kernel.org
	public-inbox-index linux-rdma

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-rdma


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git