* [PATCH rdma-next 1/5] RDMA/mlx5: Change mlx5_ib_populate_pas() to use rdma_for_each_block()
2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
@ 2020-10-26 13:23 ` Leon Romanovsky
2020-10-26 13:23 ` [PATCH rdma-next 2/5] RDMA/mlx5: Move xlt_emergency_page_mutex into mr.c Leon Romanovsky
` (4 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma
From: Jason Gunthorpe <jgg@nvidia.com>
This routine converts the umem SGL into a list of fixed-size pages for DMA,
which is exactly what rdma_umem_for_each_dma_block() is for. Use the
common code directly.
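A typical caller now sizes and fills the array like this (a minimal sketch,
assuming a page_size already validated for the umem; kvcalloc() is just one
allocation choice, and error unwinding is elided):

	/* size the array with the same page_size used to fill it */
	size_t nents = ib_umem_num_dma_blocks(umem, page_size);
	__be64 *pas = kvcalloc(nents, sizeof(*pas), GFP_KERNEL);

	if (!pas)
		return -ENOMEM;
	mlx5_ib_populate_pas(umem, page_size, pas, MLX5_IB_MTT_READ);
	/* ... hand pas to the device command, then kvfree(pas) ... */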
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/hw/mlx5/cq.c | 6 +++---
drivers/infiniband/hw/mlx5/devx.c | 4 ++--
drivers/infiniband/hw/mlx5/mem.c | 19 ++++++++++++++-----
drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 ++--
drivers/infiniband/hw/mlx5/mr.c | 7 +++++--
drivers/infiniband/hw/mlx5/qp.c | 6 +++---
drivers/infiniband/hw/mlx5/srq.c | 2 +-
7 files changed, 30 insertions(+), 18 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 2088e4a3c32d..9ab93d730769 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -762,7 +762,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
}
pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas);
- mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, pas, 0);
+ mlx5_ib_populate_pas(cq->buf.umem, 1UL << page_shift, pas, 0);
cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context);
MLX5_SET(cqc, cqc, log_page_size,
@@ -1305,8 +1305,8 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
pas = (__be64 *)MLX5_ADDR_OF(modify_cq_in, in, pas);
if (udata)
- mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift,
- pas, 0);
+ mlx5_ib_populate_pas(cq->resize_umem, 1UL << page_shift, pas,
+ 0);
else
mlx5_fill_page_frag_array(&cq->resize_buf->frag_buf, pas);
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index ae889266acf1..611ce21157de 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2115,9 +2115,9 @@ static void devx_umem_reg_cmd_build(struct mlx5_ib_dev *dev,
MLX5_SET(umem, umem, log_page_size, obj->page_shift -
MLX5_ADAPTER_PAGE_SHIFT);
MLX5_SET(umem, umem, page_offset, obj->page_offset);
- mlx5_ib_populate_pas(dev, obj->umem, obj->page_shift, mtt,
+ mlx5_ib_populate_pas(obj->umem, 1UL << obj->page_shift, mtt,
(obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) |
- MLX5_IB_MTT_READ);
+ MLX5_IB_MTT_READ);
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 7ae96b37bd6e..779c4a040d8b 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -155,13 +155,22 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
}
}
-void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
- int page_shift, __be64 *pas, int access_flags)
+/*
+ * Fill in a physical address list. ib_umem_num_dma_blocks() entries will be
+ * filled in the pas array.
+ */
+void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
+ u64 access_flags)
{
- return __mlx5_ib_populate_pas(dev, umem, page_shift, 0,
- ib_umem_num_dma_blocks(umem, PAGE_SIZE),
- pas, access_flags);
+ struct ib_block_iter biter;
+
+ rdma_umem_for_each_dma_block (umem, &biter, page_size) {
+ *pas = cpu_to_be64(rdma_block_iter_dma_address(&biter) |
+ access_flags);
+ pas++;
+ }
}
+
int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
{
u64 page_size;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 3e2c471d77bd..b043a178e95b 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1235,8 +1235,8 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int page_shift, size_t offset, size_t num_pages,
__be64 *pas, int access_flags);
-void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
- int page_shift, __be64 *pas, int access_flags);
+void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
+ u64 access_flags);
void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 57d3dc111a2b..b7119722a54a 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1167,7 +1167,10 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
if (populate)
- inlen += sizeof(*pas) * roundup(ib_umem_num_pages(umem), 2);
+ inlen +=
+ sizeof(*pas) *
+ roundup(ib_umem_num_dma_blocks(umem, 1UL << page_shift),
+ 2);
in = kvzalloc(inlen, GFP_KERNEL);
if (!in) {
err = -ENOMEM;
@@ -1179,7 +1182,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
err = -EINVAL;
goto err_2;
}
- mlx5_ib_populate_pas(dev, umem, page_shift, pas,
+ mlx5_ib_populate_pas(umem, 1ULL << page_shift, pas,
pg_cap ? MLX5_IB_MTT_PRESENT : 0);
}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 6915639a776f..042177f33252 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -971,7 +971,7 @@ static int _create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
MLX5_SET(create_qp_in, *in, uid, uid);
pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas);
if (ubuffer->umem)
- mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, pas, 0);
+ mlx5_ib_populate_pas(ubuffer->umem, 1UL << page_shift, pas, 0);
qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc);
@@ -1251,7 +1251,7 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
MLX5_SET(wq, wq, page_offset, offset);
pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
- mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0);
+ mlx5_ib_populate_pas(sq->ubuffer.umem, 1UL << page_shift, pas, 0);
err = mlx5_core_create_sq_tracked(dev, in, inlen, &sq->base.mqp);
@@ -4881,7 +4881,7 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
MLX5_SET(rqc, rqc, delay_drop_en, 1);
}
rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
- mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0);
+ mlx5_ib_populate_pas(rwq->umem, 1UL << rwq->page_shift, rq_pas0, 0);
err = mlx5_core_create_rq_tracked(dev, in, inlen, &rwq->core_qp);
if (!err && init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) {
err = set_delay_drop(dev);
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 239c7ec65e11..dd6e42d3d175 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -100,7 +100,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
goto err_umem;
}
- mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0);
+ mlx5_ib_populate_pas(srq->umem, 1UL << page_shift, in->pas, 0);
err = mlx5_ib_db_map_user(ucontext, udata, ucmd.db_addr, &srq->db);
if (err) {
--
2.26.2
* [PATCH rdma-next 2/5] RDMA/mlx5: Move xlt_emergency_page_mutex into mr.c
2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
2020-10-26 13:23 ` [PATCH rdma-next 1/5] RDMA/mlx5: Change mlx5_ib_populate_pas() to use rdma_for_each_block() Leon Romanovsky
@ 2020-10-26 13:23 ` Leon Romanovsky
2020-10-26 13:23 ` [PATCH rdma-next 3/5] RDMA/mlx5: Split the WR setup out of mlx5_ib_update_xlt() Leon Romanovsky
` (3 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma
From: Jason Gunthorpe <jgg@nvidia.com>
mr.c is the only user, so move the emergency page and its mutex there and
make the mlx5_ib_get/put_xlt_emergency_page() wrappers static to mr.c.
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/hw/mlx5/main.c | 20 --------------------
drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 +--
drivers/infiniband/hw/mlx5/mr.c | 18 ++++++++++++++++++
3 files changed, 19 insertions(+), 22 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index bca57c7661eb..b7eb977eb869 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -75,12 +75,6 @@ static LIST_HEAD(mlx5_ib_dev_list);
*/
static DEFINE_MUTEX(mlx5_ib_multiport_mutex);
-/* We can't use an array for xlt_emergency_page because dma_map_single
- * doesn't work on kernel modules memory
- */
-static unsigned long xlt_emergency_page;
-static struct mutex xlt_emergency_page_mutex;
-
struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi)
{
struct mlx5_ib_dev *dev;
@@ -4877,17 +4871,6 @@ static struct mlx5_interface mlx5_ib_interface = {
.protocol = MLX5_INTERFACE_PROTOCOL_IB,
};
-unsigned long mlx5_ib_get_xlt_emergency_page(void)
-{
- mutex_lock(&xlt_emergency_page_mutex);
- return xlt_emergency_page;
-}
-
-void mlx5_ib_put_xlt_emergency_page(void)
-{
- mutex_unlock(&xlt_emergency_page_mutex);
-}
-
static int __init mlx5_ib_init(void)
{
int err;
@@ -4896,8 +4879,6 @@ static int __init mlx5_ib_init(void)
if (!xlt_emergency_page)
return -ENOMEM;
- mutex_init(&xlt_emergency_page_mutex);
-
mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0);
if (!mlx5_ib_event_wq) {
free_page(xlt_emergency_page);
@@ -4915,7 +4896,6 @@ static void __exit mlx5_ib_cleanup(void)
{
mlx5_unregister_interface(&mlx5_ib_interface);
destroy_workqueue(mlx5_ib_event_wq);
- mutex_destroy(&xlt_emergency_page_mutex);
free_page(xlt_emergency_page);
}
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index b043a178e95b..8f728b98f9a6 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1454,8 +1454,7 @@ static inline int get_num_static_uars(struct mlx5_ib_dev *dev,
return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages;
}
-unsigned long mlx5_ib_get_xlt_emergency_page(void);
-void mlx5_ib_put_xlt_emergency_page(void);
+extern unsigned long xlt_emergency_page;
int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
struct mlx5_bfreg_info *bfregi, u32 bfregn,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index b7119722a54a..2971e7f48d40 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -41,6 +41,13 @@
#include <rdma/ib_verbs.h>
#include "mlx5_ib.h"
+/*
+ * We can't use an array for xlt_emergency_page because dma_map_single doesn't
+ * work on kernel modules memory
+ */
+unsigned long xlt_emergency_page;
+static DEFINE_MUTEX(xlt_emergency_page_mutex);
+
enum {
MAX_PENDING_REG_MR = 8,
};
@@ -992,6 +999,17 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
MLX5_UMR_MTT_ALIGNMENT)
#define MLX5_SPARE_UMR_CHUNK 0x10000
+static unsigned long mlx5_ib_get_xlt_emergency_page(void)
+{
+ mutex_lock(&xlt_emergency_page_mutex);
+ return xlt_emergency_page;
+}
+
+static void mlx5_ib_put_xlt_emergency_page(void)
+{
+ mutex_unlock(&xlt_emergency_page_mutex);
+}
+
int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
int page_shift, int flags)
{
--
2.26.2
* [PATCH rdma-next 3/5] RDMA/mlx5: Split the WR setup out of mlx5_ib_update_xlt()
2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
2020-10-26 13:23 ` [PATCH rdma-next 1/5] RDMA/mlx5: Change mlx5_ib_populate_pas() to use rdma_for_each_block() Leon Romanovsky
2020-10-26 13:23 ` [PATCH rdma-next 2/5] RDMA/mlx5: Move xlt_emergency_page_mutex into mr.c Leon Romanovsky
@ 2020-10-26 13:23 ` Leon Romanovsky
2020-10-26 13:23 ` [PATCH rdma-next 4/5] RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases Leon Romanovsky
` (2 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma
From: Jason Gunthorpe <jgg@nvidia.com>
The memory allocation is quite complicated and makes this function hard
to understand. Refactor things so that one function call sets up the WR, SG,
DMA mapping and buffer, and split that further into buffer allocation and
DMA/WR setup.
This also slightly changes the buffer allocation logic to try an order-0
page allocation (with OOM warnings on) before falling back to the emergency page.
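The resulting lifecycle for callers of the new helpers is roughly the
following (a sketch drawn from the hunks below; the fill loop and error
unwinding are elided):

	struct mlx5_umr_wr wr;
	struct ib_sge sg;
	void *xlt;

	xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, nents, ent_size, flags);
	if (!xlt)
		return -ENOMEM;
	/* ... fill xlt, dma_sync_single_for_device(),
	 *     mlx5_ib_post_send_wait(dev, &wr) ... */
	mlx5_ib_unmap_free_xlt(dev, xlt, &sg);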
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/hw/mlx5/main.c | 6 +-
drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +-
drivers/infiniband/hw/mlx5/mr.c | 189 ++++++++++++++++++---------
3 files changed, 128 insertions(+), 69 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index b7eb977eb869..32ec59315f39 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4875,13 +4875,13 @@ static int __init mlx5_ib_init(void)
{
int err;
- xlt_emergency_page = __get_free_page(GFP_KERNEL);
+ xlt_emergency_page = (void *)__get_free_page(GFP_KERNEL);
if (!xlt_emergency_page)
return -ENOMEM;
mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0);
if (!mlx5_ib_event_wq) {
- free_page(xlt_emergency_page);
+ free_page((unsigned long)xlt_emergency_page);
return -ENOMEM;
}
@@ -4896,7 +4896,7 @@ static void __exit mlx5_ib_cleanup(void)
{
mlx5_unregister_interface(&mlx5_ib_interface);
destroy_workqueue(mlx5_ib_event_wq);
- free_page(xlt_emergency_page);
+ free_page((unsigned long)xlt_emergency_page);
}
module_init(mlx5_ib_init);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 8f728b98f9a6..d92afbd26aa5 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1454,7 +1454,7 @@ static inline int get_num_static_uars(struct mlx5_ib_dev *dev,
return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages;
}
-extern unsigned long xlt_emergency_page;
+extern void *xlt_emergency_page;
int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
struct mlx5_bfreg_info *bfregi, u32 bfregn,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 2971e7f48d40..b2ec4abc5639 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -45,7 +45,7 @@
* We can't use an array for xlt_emergency_page because dma_map_single doesn't
* work on kernel modules memory
*/
-unsigned long xlt_emergency_page;
+void *xlt_emergency_page;
static DEFINE_MUTEX(xlt_emergency_page_mutex);
enum {
@@ -999,15 +999,121 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
MLX5_UMR_MTT_ALIGNMENT)
#define MLX5_SPARE_UMR_CHUNK 0x10000
-static unsigned long mlx5_ib_get_xlt_emergency_page(void)
+/*
+ * Allocate a temporary buffer to hold the per-page information to transfer to
+ * HW. For efficiency this should be as large as it can be, but buffer
+ * allocation failure is not allowed, so try smaller sizes.
+ */
+static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
{
+	const size_t xlt_chunk_align =
+		MLX5_UMR_MTT_ALIGNMENT / ent_size;
+ size_t size;
+ void *res = NULL;
+
+ static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0);
+
+ /*
+ * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the
+ * allocation can't trigger any kind of reclaim.
+ */
+ might_sleep();
+
+ gfp_mask |= __GFP_ZERO;
+
+ /*
+ * If the system already has a suitable high order page then just use
+ * that, but don't try hard to create one. This max is about 1M, so a
+ * free x86 huge page will satisfy it.
+ */
+ size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
+ MLX5_MAX_UMR_CHUNK);
+ *nents = size / ent_size;
+ res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
+ get_order(size));
+ if (res)
+ return res;
+
+ if (size > MLX5_SPARE_UMR_CHUNK) {
+ size = MLX5_SPARE_UMR_CHUNK;
+		*nents = size / ent_size;
+ res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
+ get_order(size));
+ if (res)
+ return res;
+ }
+
+ *nents = PAGE_SIZE / ent_size;
+ res = (void *)__get_free_page(gfp_mask);
+ if (res)
+ return res;
+
mutex_lock(&xlt_emergency_page_mutex);
+ memset(xlt_emergency_page, 0, PAGE_SIZE);
return xlt_emergency_page;
}
-static void mlx5_ib_put_xlt_emergency_page(void)
+static void mlx5_ib_free_xlt(void *xlt, size_t length)
+{
+ if (xlt == xlt_emergency_page) {
+ mutex_unlock(&xlt_emergency_page_mutex);
+ return;
+ }
+
+ free_pages((unsigned long)xlt, get_order(length));
+}
+
+/*
+ * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for
+ * submission.
+ */
+static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr,
+ struct mlx5_umr_wr *wr, struct ib_sge *sg,
+ size_t nents, size_t ent_size,
+ unsigned int flags)
+{
+ struct mlx5_ib_dev *dev = mr->dev;
+ struct device *ddev = dev->ib_dev.dev.parent;
+ dma_addr_t dma;
+ void *xlt;
+
+ xlt = mlx5_ib_alloc_xlt(&nents, ent_size,
+ flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
+ GFP_KERNEL);
+ sg->length = nents * ent_size;
+ dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
+ if (dma_mapping_error(ddev, dma)) {
+ mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
+ mlx5_ib_free_xlt(xlt, sg->length);
+ return NULL;
+ }
+ sg->addr = dma;
+ sg->lkey = dev->umrc.pd->local_dma_lkey;
+
+ memset(wr, 0, sizeof(*wr));
+ wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
+ if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
+ wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
+ wr->wr.sg_list = sg;
+ wr->wr.num_sge = 1;
+ wr->wr.opcode = MLX5_IB_WR_UMR;
+ wr->pd = mr->ibmr.pd;
+ wr->mkey = mr->mmkey.key;
+ wr->length = mr->mmkey.size;
+ wr->virt_addr = mr->mmkey.iova;
+ wr->access_flags = mr->access_flags;
+ wr->page_shift = mr->page_shift;
+ wr->xlt_size = sg->length;
+ return xlt;
+}
+
+static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
+ struct ib_sge *sg)
{
- mutex_unlock(&xlt_emergency_page_mutex);
+ struct device *ddev = dev->ib_dev.dev.parent;
+
+ dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
+ mlx5_ib_free_xlt(xlt, sg->length);
}
int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
@@ -1015,9 +1121,7 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
{
struct mlx5_ib_dev *dev = mr->dev;
struct device *ddev = dev->ib_dev.dev.parent;
- int size;
void *xlt;
- dma_addr_t dma;
struct mlx5_umr_wr wr;
struct ib_sge sg;
int err = 0;
@@ -1028,10 +1132,9 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
const int page_mask = page_align - 1;
size_t pages_mapped = 0;
size_t pages_to_map = 0;
- size_t pages_iter = 0;
+ size_t pages_iter;
size_t size_to_map = 0;
- gfp_t gfp;
- bool use_emergency_page = false;
+ size_t orig_sg_length;
if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
!umr_can_use_indirect_mkey(dev))
@@ -1044,37 +1147,13 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
npages += idx & page_mask;
idx &= ~page_mask;
}
-
- gfp = flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : GFP_KERNEL;
- gfp |= __GFP_ZERO | __GFP_NOWARN;
-
pages_to_map = ALIGN(npages, page_align);
- size = desc_size * pages_to_map;
- size = min_t(int, size, MLX5_MAX_UMR_CHUNK);
-
- xlt = (void *)__get_free_pages(gfp, get_order(size));
- if (!xlt && size > MLX5_SPARE_UMR_CHUNK) {
- mlx5_ib_dbg(dev, "Failed to allocate %d bytes of order %d. fallback to spare UMR allocation od %d bytes\n",
- size, get_order(size), MLX5_SPARE_UMR_CHUNK);
- size = MLX5_SPARE_UMR_CHUNK;
- xlt = (void *)__get_free_pages(gfp, get_order(size));
- }
-
- if (!xlt) {
- mlx5_ib_warn(dev, "Using XLT emergency buffer\n");
- xlt = (void *)mlx5_ib_get_xlt_emergency_page();
- size = PAGE_SIZE;
- memset(xlt, 0, size);
- use_emergency_page = true;
- }
- pages_iter = size / desc_size;
- dma = dma_map_single(ddev, xlt, size, DMA_TO_DEVICE);
- if (dma_mapping_error(ddev, dma)) {
- mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
- err = -ENOMEM;
- goto free_xlt;
- }
+ xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags);
+ if (!xlt)
+ return -ENOMEM;
+ pages_iter = sg.length / desc_size;
+ orig_sg_length = sg.length;
if (mr->umem->is_odp) {
if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
@@ -1085,22 +1164,6 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
}
}
- sg.addr = dma;
- sg.lkey = dev->umrc.pd->local_dma_lkey;
-
- memset(&wr, 0, sizeof(wr));
- wr.wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
- if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
- wr.wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
- wr.wr.sg_list = &sg;
- wr.wr.num_sge = 1;
- wr.wr.opcode = MLX5_IB_WR_UMR;
-
- wr.pd = mr->ibmr.pd;
- wr.mkey = mr->mmkey.key;
- wr.length = mr->mmkey.size;
- wr.virt_addr = mr->mmkey.iova;
- wr.access_flags = mr->access_flags;
wr.page_shift = page_shift;
for (pages_mapped = 0;
@@ -1108,7 +1171,8 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
pages_mapped += pages_iter, idx += pages_iter) {
npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
size_to_map = npages * desc_size;
- dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
+ dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
+ DMA_TO_DEVICE);
if (mr->umem->is_odp) {
mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
} else {
@@ -1118,9 +1182,10 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
/* Clear padding after the pages
* brought from the umem.
*/
- memset(xlt + size_to_map, 0, size - size_to_map);
+ memset(xlt + size_to_map, 0, sg.length - size_to_map);
}
- dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
+ dma_sync_single_for_device(ddev, sg.addr, sg.length,
+ DMA_TO_DEVICE);
sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
@@ -1144,14 +1209,8 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
err = mlx5_ib_post_send_wait(dev, &wr);
}
- dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
-
-free_xlt:
- if (use_emergency_page)
- mlx5_ib_put_xlt_emergency_page();
- else
- free_pages((unsigned long)xlt, get_order(size));
-
+ sg.length = orig_sg_length;
+ mlx5_ib_unmap_free_xlt(dev, xlt, &sg);
return err;
}
--
2.26.2
* [PATCH rdma-next 4/5] RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases
2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
` (2 preceding siblings ...)
2020-10-26 13:23 ` [PATCH rdma-next 3/5] RDMA/mlx5: Split the WR setup out of mlx5_ib_update_xlt() Leon Romanovsky
@ 2020-10-26 13:23 ` Leon Romanovsky
2020-10-29 12:33 ` Jason Gunthorpe
2020-10-26 13:23 ` [PATCH rdma-next 5/5] RDMA/mlx5: Use ib_umem_find_best_pgsz() for mkc's Leon Romanovsky
2020-11-02 19:11 ` [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Jason Gunthorpe
5 siblings, 1 reply; 8+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma
From: Jason Gunthorpe <jgg@nvidia.com>
Mixing these together is just a mess; make a dedicated version,
mlx5_ib_update_mr_pas(), which directly loads the whole MTT for a non-ODP
MR.
The split-out version can trivially use a simple loop with
rdma_for_each_block(), which allows using the core code to compute the MR
pages and avoids re-seeking in the SGL after each chunk, as the
__mlx5_ib_populate_pas() call required.
This significantly speeds up loading large MTTs.
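The non-ODP loop's chunking discipline, distilled (a sketch of the hunk
below; flush_chunk() is a hypothetical stand-in for the dma_sync +
mlx5_ib_post_send_wait() sequence in the patch):

	cur_mtt = mtt;
	rdma_for_each_block (umem->sg_head.sgl, &biter, umem->nmap,
			     BIT(mr->page_shift)) {
		/* staging buffer full: push this chunk to HW, then rewind */
		if (cur_mtt == (void *)mtt + sg.length) {
			flush_chunk(dev, &wr, &sg);
			wr.offset += sg.length;
			cur_mtt = mtt;
		}
		cur_mtt->ptag = cpu_to_be64(
			rdma_block_iter_dma_address(&biter) |
			MLX5_IB_MTT_PRESENT);
		cur_mtt++;
	}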
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/hw/mlx5/mem.c | 64 -------------
drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 -
drivers/infiniband/hw/mlx5/mr.c | 137 +++++++++++++++++++--------
3 files changed, 97 insertions(+), 107 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 779c4a040d8b..92e7621ec858 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -91,70 +91,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
*shift = PAGE_SHIFT + m;
}
-/*
- * Populate the given array with bus addresses from the umem.
- *
- * dev - mlx5_ib device
- * umem - umem to use to fill the pages
- * page_shift - determines the page size used in the resulting array
- * offset - offset into the umem to start from,
- * only implemented for ODP umems
- * num_pages - total number of pages to fill
- * pas - bus addresses array to fill
- * access_flags - access flags to set on all present pages.
- use enum mlx5_ib_mtt_access_flags for this.
- */
-void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
- int page_shift, size_t offset, size_t num_pages,
- __be64 *pas, int access_flags)
-{
- int shift = page_shift - PAGE_SHIFT;
- int mask = (1 << shift) - 1;
- int i, k, idx;
- u64 cur = 0;
- u64 base;
- int len;
- struct scatterlist *sg;
- int entry;
-
- i = 0;
- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
- len = sg_dma_len(sg) >> PAGE_SHIFT;
- base = sg_dma_address(sg);
-
- /* Skip elements below offset */
- if (i + len < offset << shift) {
- i += len;
- continue;
- }
-
- /* Skip pages below offset */
- if (i < offset << shift) {
- k = (offset << shift) - i;
- i = offset << shift;
- } else {
- k = 0;
- }
-
- for (; k < len; k++) {
- if (!(i & mask)) {
- cur = base + (k << PAGE_SHIFT);
- cur |= access_flags;
- idx = (i >> shift) - offset;
-
- pas[idx] = cpu_to_be64(cur);
- mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n",
- i >> shift, be64_to_cpu(pas[idx]));
- }
- i++;
-
- /* Stop after num_pages reached */
- if (i >> shift >= offset + num_pages)
- return;
- }
- }
-}
-
/*
* Fill in a physical address list. ib_umem_num_dma_blocks() entries will be
* filled in the pas array.
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index d92afbd26aa5..aadd43425a58 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1232,9 +1232,6 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
unsigned long max_page_shift,
int *shift);
-void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
- int page_shift, size_t offset, size_t num_pages,
- __be64 *pas, int access_flags);
void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
u64 access_flags);
void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index b2ec4abc5639..10f13acc88c9 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1116,6 +1116,21 @@ static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
mlx5_ib_free_xlt(xlt, sg->length);
}
+static unsigned int xlt_wr_final_send_flags(unsigned int flags)
+{
+ unsigned int res = 0;
+
+ if (flags & MLX5_IB_UPD_XLT_ENABLE)
+ res |= MLX5_IB_SEND_UMR_ENABLE_MR |
+ MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
+ MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
+ if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS)
+ res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
+ if (flags & MLX5_IB_UPD_XLT_ADDR)
+ res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
+ return res;
+}
+
int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
int page_shift, int flags)
{
@@ -1140,6 +1155,9 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
!umr_can_use_indirect_mkey(dev))
return -EPERM;
+ if (WARN_ON(!mr->umem->is_odp))
+ return -EINVAL;
+
/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
* so we need to align the offset and length accordingly
*/
@@ -1155,13 +1173,11 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
pages_iter = sg.length / desc_size;
orig_sg_length = sg.length;
- if (mr->umem->is_odp) {
- if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
- struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
- size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
+ if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+ size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
- pages_to_map = min_t(size_t, pages_to_map, max_pages);
- }
+ pages_to_map = min_t(size_t, pages_to_map, max_pages);
}
wr.page_shift = page_shift;
@@ -1173,36 +1189,14 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
size_to_map = npages * desc_size;
dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
- if (mr->umem->is_odp) {
- mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
- } else {
- __mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx,
- npages, xlt,
- MLX5_IB_MTT_PRESENT);
- /* Clear padding after the pages
- * brought from the umem.
- */
- memset(xlt + size_to_map, 0, sg.length - size_to_map);
- }
+ mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
dma_sync_single_for_device(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
- if (pages_mapped + pages_iter >= pages_to_map) {
- if (flags & MLX5_IB_UPD_XLT_ENABLE)
- wr.wr.send_flags |=
- MLX5_IB_SEND_UMR_ENABLE_MR |
- MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
- MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
- if (flags & MLX5_IB_UPD_XLT_PD ||
- flags & MLX5_IB_UPD_XLT_ACCESS)
- wr.wr.send_flags |=
- MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
- if (flags & MLX5_IB_UPD_XLT_ADDR)
- wr.wr.send_flags |=
- MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
- }
+ if (pages_mapped + pages_iter >= pages_to_map)
+ wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
wr.offset = idx * desc_size;
wr.xlt_size = sg.length;
@@ -1214,6 +1208,69 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
return err;
}
+/*
+ * Send the DMA list to the HW for a normal MR using UMR.
+ */
+static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+{
+ struct mlx5_ib_dev *dev = mr->dev;
+ struct device *ddev = dev->ib_dev.dev.parent;
+ struct ib_block_iter biter;
+ struct mlx5_mtt *cur_mtt;
+ struct mlx5_umr_wr wr;
+ size_t orig_sg_length;
+ struct mlx5_mtt *mtt;
+ size_t final_size;
+ struct ib_sge sg;
+ int err = 0;
+
+ if (WARN_ON(mr->umem->is_odp))
+ return -EINVAL;
+
+ mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg,
+ ib_umem_num_dma_blocks(mr->umem,
+ 1 << mr->page_shift),
+ sizeof(*mtt), flags);
+ if (!mtt)
+ return -ENOMEM;
+ orig_sg_length = sg.length;
+
+ cur_mtt = mtt;
+ rdma_for_each_block (mr->umem->sg_head.sgl, &biter, mr->umem->nmap,
+ BIT(mr->page_shift)) {
+ if (cur_mtt == (void *)mtt + sg.length) {
+ dma_sync_single_for_device(ddev, sg.addr, sg.length,
+ DMA_TO_DEVICE);
+ err = mlx5_ib_post_send_wait(dev, &wr);
+ if (err)
+ goto err;
+ dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
+ DMA_TO_DEVICE);
+ wr.offset += sg.length;
+ cur_mtt = mtt;
+ }
+
+ cur_mtt->ptag =
+ cpu_to_be64(rdma_block_iter_dma_address(&biter) |
+ MLX5_IB_MTT_PRESENT);
+ cur_mtt++;
+ }
+
+ final_size = (void *)cur_mtt - (void *)mtt;
+ sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT);
+ memset(cur_mtt, 0, sg.length - final_size);
+ wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
+ wr.xlt_size = sg.length;
+
+ dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
+ err = mlx5_ib_post_send_wait(dev, &wr);
+
+err:
+ sg.length = orig_sg_length;
+ mlx5_ib_unmap_free_xlt(dev, mtt, &sg);
+ return err;
+}
+
/*
* If ibmr is NULL it will be allocated by reg_create.
* Else, the given ibmr will be used.
@@ -1483,10 +1540,14 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
*/
int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
- err = mlx5_ib_update_xlt(
- mr, 0,
- ib_umem_num_dma_blocks(umem, 1UL << mr->page_shift),
- mr->page_shift, update_xlt_flags);
+ if (is_odp_mr(mr))
+ err = mlx5_ib_update_xlt(
+ mr, 0,
+ ib_umem_num_dma_blocks(umem,
+ 1UL << mr->page_shift),
+ mr->page_shift, update_xlt_flags);
+ else
+ err = mlx5_ib_update_mr_pas(mr, update_xlt_flags);
if (err) {
dereg_mr(dev, mr);
return ERR_PTR(err);
@@ -1652,11 +1713,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
upd_flags |= MLX5_IB_UPD_XLT_PD;
if (flags & IB_MR_REREG_ACCESS)
upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
- err = mlx5_ib_update_xlt(
- mr, 0,
- ib_umem_num_dma_blocks(mr->umem,
- 1UL << mr->page_shift),
- mr->page_shift, upd_flags);
+ err = mlx5_ib_update_mr_pas(mr, upd_flags);
} else {
err = rereg_umr(pd, mr, access_flags, flags);
}
--
2.26.2
* Re: [PATCH rdma-next 4/5] RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases
2020-10-26 13:23 ` [PATCH rdma-next 4/5] RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases Leon Romanovsky
@ 2020-10-29 12:33 ` Jason Gunthorpe
0 siblings, 0 replies; 8+ messages in thread
From: Jason Gunthorpe @ 2020-10-29 12:33 UTC (permalink / raw)
To: Leon Romanovsky; +Cc: Doug Ledford, linux-rdma
On Mon, Oct 26, 2020 at 03:23:13PM +0200, Leon Romanovsky wrote:
> @@ -1483,10 +1540,14 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
> */
> int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
>
> - err = mlx5_ib_update_xlt(
> - mr, 0,
> - ib_umem_num_dma_blocks(umem, 1UL << mr->page_shift),
> - mr->page_shift, update_xlt_flags);
> + if (is_odp_mr(mr))
> + err = mlx5_ib_update_xlt(
> + mr, 0,
> + ib_umem_num_dma_blocks(umem,
> + 1UL << mr->page_shift),
> + mr->page_shift, update_xlt_flags);
> + else
> + err = mlx5_ib_update_mr_pas(mr, update_xlt_flags);
This rebase looks a bit weird; this whole block is already !ODP, so
why is there an 'if (is_odp_mr())'?
Should just be this:
if (xlt_with_umr && !(access_flags & IB_ACCESS_ON_DEMAND)) {
/*
* If the MR was created with reg_create then it will be
* configured properly but left disabled. It is safe to go ahead
* and configure it again via UMR while enabling it.
*/
err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
if (err) {
dereg_mr(dev, mr);
return ERR_PTR(err);
}
}
Jason
* [PATCH rdma-next 5/5] RDMA/mlx5: Use ib_umem_find_best_pgsz() for mkc's
2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
` (3 preceding siblings ...)
2020-10-26 13:23 ` [PATCH rdma-next 4/5] RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases Leon Romanovsky
@ 2020-10-26 13:23 ` Leon Romanovsky
2020-11-02 19:11 ` [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Jason Gunthorpe
5 siblings, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2020-10-26 13:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma
From: Jason Gunthorpe <jgg@nvidia.com>
Now that all the PAS arrays and UMR XLTs for mkcs are filled using
rdma_for_each_block(), we can use the common ib_umem_find_best_pgsz()
algorithm.
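Conceptually, the new macro feeds ib_umem_find_best_pgsz() a bitmap of every
page size the mkc can express (a hand-expanded sketch of the
mlx5_umem_find_best_pgsz() macro added below; max_log_pgsz stands in for the
width derived from __mlx5_bit_sz(mkc, log_page_size)):

	/* all power-of-two sizes the mkc supports, floored at 4k */
	unsigned long bitmap = GENMASK(max_log_pgsz, MLX5_ADAPTER_PAGE_SHIFT);
	unsigned long page_size = ib_umem_find_best_pgsz(umem, bitmap, iova);

	if (!page_size)
		return ERR_PTR(-EINVAL); /* no supported size fits this umem/iova */
	mr->page_shift = order_base_2(page_size);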
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/core/umem.c | 9 +++++++
drivers/infiniband/hw/mlx5/mlx5_ib.h | 27 ++++++++++++++++++++
drivers/infiniband/hw/mlx5/mr.c | 37 +++++++++++++++-------------
3 files changed, 56 insertions(+), 17 deletions(-)
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index e9fecbdf391b..f1fc7e39c782 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -84,6 +84,15 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
dma_addr_t mask;
int i;
+ if (umem->is_odp) {
+ unsigned int page_size = BIT(to_ib_umem_odp(umem)->page_shift);
+
+ /* ODP must always be self consistent. */
+ if (!(pgsz_bitmap & page_size))
+ return 0;
+ return page_size;
+ }
+
/* rdma_for_each_block() has a bug if the page size is smaller than the
* page size used to build the umem. For now prevent smaller page sizes
* from being returned.
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index aadd43425a58..bb44080170be 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -42,6 +42,33 @@
#define MLX5_MKEY_PAGE_SHIFT_MASK __mlx5_mask(mkc, log_page_size)
+static __always_inline unsigned long
+__mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits,
+ unsigned int pgsz_shift)
+{
+ unsigned int largest_pg_shift =
+ min_t(unsigned long, (1ULL << log_pgsz_bits) - 1 + pgsz_shift,
+ BITS_PER_LONG - 1);
+
+ /*
+ * Despite a command allowing it, the device does not support lower than
+ * 4k page size.
+ */
+ pgsz_shift = max_t(unsigned int, MLX5_ADAPTER_PAGE_SHIFT, pgsz_shift);
+ return GENMASK(largest_pg_shift, pgsz_shift);
+}
+
+/*
+ * For mkc users, instead of a page_offset the command has a start_iova which
+ * specifies both the page_offset and the on-the-wire IOVA
+ */
+#define mlx5_umem_find_best_pgsz(umem, typ, log_pgsz_fld, pgsz_shift, iova) \
+ ib_umem_find_best_pgsz(umem, \
+ __mlx5_log_page_size_to_bitmap( \
+ __mlx5_bit_sz(typ, log_pgsz_fld), \
+ pgsz_shift), \
+ iova)
+
enum {
MLX5_IB_MMAP_OFFSET_START = 9,
MLX5_IB_MMAP_OFFSET_END = 255,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 10f13acc88c9..660b721df64d 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -964,11 +964,13 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_cache_ent *ent;
struct mlx5_ib_mr *mr;
- int page_shift;
+ unsigned int page_size;
- mlx5_ib_cont_pages(umem, iova, MLX5_MKEY_PAGE_SHIFT_MASK, &page_shift);
- ent = mr_cache_ent_from_order(dev, order_base_2(ib_umem_num_dma_blocks(
- umem, 1UL << page_shift)));
+ page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova);
+ if (WARN_ON(!page_size))
+ return ERR_PTR(-EINVAL);
+ ent = mr_cache_ent_from_order(
+ dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
if (!ent)
return ERR_PTR(-E2BIG);
@@ -990,7 +992,7 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
mr->mmkey.iova = iova;
mr->mmkey.size = umem->length;
mr->mmkey.pd = to_mpd(pd)->pdn;
- mr->page_shift = page_shift;
+ mr->page_shift = order_base_2(page_size);
return mr;
}
@@ -1280,8 +1282,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
int access_flags, bool populate)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ unsigned int page_size;
struct mlx5_ib_mr *mr;
- int page_shift;
__be64 *pas;
void *mkc;
int inlen;
@@ -1289,22 +1291,23 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
int err;
bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
+ page_size =
+ mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova);
+ if (WARN_ON(!page_size))
+ return ERR_PTR(-EINVAL);
+
mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL);
if (!mr)
return ERR_PTR(-ENOMEM);
- mlx5_ib_cont_pages(umem, iova, MLX5_MKEY_PAGE_SHIFT_MASK, &page_shift);
-
- mr->page_shift = page_shift;
mr->ibmr.pd = pd;
mr->access_flags = access_flags;
+ mr->page_shift = order_base_2(page_size);
inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
if (populate)
- inlen +=
- sizeof(*pas) *
- roundup(ib_umem_num_dma_blocks(umem, 1UL << page_shift),
- 2);
+ inlen += sizeof(*pas) *
+ roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
in = kvzalloc(inlen, GFP_KERNEL);
if (!in) {
err = -ENOMEM;
@@ -1316,7 +1319,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
err = -EINVAL;
goto err_2;
}
- mlx5_ib_populate_pas(umem, 1ULL << page_shift, pas,
+ mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
pg_cap ? MLX5_IB_MTT_PRESENT : 0);
}
@@ -1334,11 +1337,11 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
MLX5_SET64(mkc, mkc, len, umem->length);
MLX5_SET(mkc, mkc, bsf_octword_size, 0);
MLX5_SET(mkc, mkc, translations_octword_size,
- get_octo_len(iova, umem->length, page_shift));
- MLX5_SET(mkc, mkc, log_page_size, page_shift);
+ get_octo_len(iova, umem->length, mr->page_shift));
+ MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
if (populate) {
MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
- get_octo_len(iova, umem->length, page_shift));
+ get_octo_len(iova, umem->length, mr->page_shift));
}
err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
--
2.26.2
* Re: [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs
2020-10-26 13:23 [PATCH rdma-next 0/5] Use ib_umem_find_best_pgsz() when creating MRs Leon Romanovsky
` (4 preceding siblings ...)
2020-10-26 13:23 ` [PATCH rdma-next 5/5] RDMA/mlx5: Use ib_umem_find_best_pgsz() for mkc's Leon Romanovsky
@ 2020-11-02 19:11 ` Jason Gunthorpe
5 siblings, 0 replies; 8+ messages in thread
From: Jason Gunthorpe @ 2020-11-02 19:11 UTC (permalink / raw)
To: Leon Romanovsky; +Cc: Doug Ledford, Leon Romanovsky, linux-kernel, linux-rdma
On Mon, Oct 26, 2020 at 03:23:09PM +0200, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
>
> From Jason:
>
> The new common code does a better job finding large page sizes. Use it in
> mlx5 for MRs.
>
> This requires moving the MTT population for mailboxes and UMR over to
> rdma_for_each_dma_block().
>
> Thanks
>
> Jason Gunthorpe (5):
> RDMA/mlx5: Change mlx5_ib_populate_pas() to use rdma_for_each_block()
> RDMA/mlx5: Move xlt_emergency_page_mutex into mr.c
> RDMA/mlx5: Split the WR setup out of mlx5_ib_update_xlt()
> RDMA/mlx5: Split mlx5_ib_update_xlt() into ODP and non-ODP cases
> RDMA/mlx5: Use ib_umem_find_best_pgsz() for mkc's
Applied to for-next, with the updated hunk
Thanks,
Jason