Linux-RDMA Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH rdma-rc 0/3] ODP Fixes
@ 2019-12-19 13:46 Leon Romanovsky
  2019-12-19 13:46 ` [PATCH rdma-rc 1/3] IB/mlx5: Unify ODP MR code paths to allow extra flexibility Leon Romanovsky
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Leon Romanovsky @ 2019-12-19 13:46 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Artemy Kovalyov,
	Aviad Yehezkel, Jason Gunthorpe, Yishai Hadas

From: Leon Romanovsky <leonro@mellanox.com>

Hi,

Please find below three patches that fix the ODP flow. The title of the
first patch does not sound like -rc material, but its implementation is:
the fact that we unified the code paths allowed us to fix some of the
corner cases at the same time.

Thanks

Artemy Kovalyov (1):
  IB/mlx5: Unify ODP MR code paths to allow extra flexibility

Yishai Hadas (2):
  IB/core: Fix ODP get user pages flow
  IB/core: Fix ODP with IB_ACCESS_HUGETLB handling

 drivers/infiniband/core/umem_odp.c   | 39 ++++++++++---------
 drivers/infiniband/hw/mlx5/mem.c     | 25 ------------
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  9 ++---
 drivers/infiniband/hw/mlx5/mr.c      | 58 +++++++++++-----------------
 drivers/infiniband/hw/mlx5/odp.c     | 42 +++++++++++++++++++-
 5 files changed, 87 insertions(+), 86 deletions(-)

--
2.20.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH rdma-rc 1/3] IB/mlx5: Unify ODP MR code paths to allow extra flexibility
  2019-12-19 13:46 [PATCH rdma-rc 0/3] ODP Fixes Leon Romanovsky
@ 2019-12-19 13:46 ` Leon Romanovsky
  2019-12-19 19:07   ` Jason Gunthorpe
  2019-12-19 13:46 ` [PATCH rdma-rc 2/3] IB/core: Fix ODP get user pages flow Leon Romanovsky
  2019-12-19 13:46 ` [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling Leon Romanovsky
  2 siblings, 1 reply; 13+ messages in thread
From: Leon Romanovsky @ 2019-12-19 13:46 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Artemy Kovalyov,
	Aviad Yehezkel, Jason Gunthorpe, Yishai Hadas

From: Artemy Kovalyov <artemyko@mellanox.com>

Building MR translation table in ODP case requires additional flexibility,
namely random access to DMA addresses. Make both direct and indirect ODP MR
use same code path, separated from non-ODP MR code path.

Fixes: d2183c6f1958 ("RDMA/umem: Move page_shift from ib_umem to ib_odp_umem")
Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mem.c     | 25 ------------
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  9 ++---
 drivers/infiniband/hw/mlx5/mr.c      | 58 +++++++++++-----------------
 drivers/infiniband/hw/mlx5/odp.c     | 42 +++++++++++++++++++-
 4 files changed, 67 insertions(+), 67 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 048f4e974a61..b90a3649e7d1 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -101,18 +101,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
 	*count = i;
 }
 
-static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
-{
-	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
-
-	if (umem_dma & ODP_READ_ALLOWED_BIT)
-		mtt_entry |= MLX5_IB_MTT_READ;
-	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
-		mtt_entry |= MLX5_IB_MTT_WRITE;
-
-	return mtt_entry;
-}
-
 /*
  * Populate the given array with bus addresses from the umem.
  *
@@ -139,19 +127,6 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 	struct scatterlist *sg;
 	int entry;
 
-	if (umem->is_odp) {
-		WARN_ON(shift != 0);
-		WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE));
-
-		for (i = 0; i < num_pages; ++i) {
-			dma_addr_t pa =
-				to_ib_umem_odp(umem)->dma_list[offset + i];
-
-			pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
-		}
-		return;
-	}
-
 	i = 0;
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
 		len = sg_dma_len(sg) >> PAGE_SHIFT;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index b06f32ff5748..927948328184 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1276,8 +1276,8 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
 void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
-void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
-			   size_t nentries, struct mlx5_ib_mr *mr, int flags);
+void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+			   struct mlx5_ib_mr *mr, int flags);
 
 int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 			       enum ib_uverbs_advise_mr_advice advice,
@@ -1293,9 +1293,8 @@ static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				    {}
 static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
-static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
-					 size_t nentries, struct mlx5_ib_mr *mr,
-					 int flags) {}
+static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+					 struct mlx5_ib_mr *mr, int flags) {}
 
 static inline int
 mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index ea8bfc3e2d8d..1e38ba0da16a 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -868,36 +868,6 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(
 	return mr;
 }
 
-static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages,
-			       void *xlt, int page_shift, size_t size,
-			       int flags)
-{
-	struct mlx5_ib_dev *dev = mr->dev;
-	struct ib_umem *umem = mr->umem;
-
-	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
-		if (!umr_can_use_indirect_mkey(dev))
-			return -EPERM;
-		mlx5_odp_populate_klm(xlt, idx, npages, mr, flags);
-		return npages;
-	}
-
-	npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx);
-
-	if (!(flags & MLX5_IB_UPD_XLT_ZAP)) {
-		__mlx5_ib_populate_pas(dev, umem, page_shift,
-				       idx, npages, xlt,
-				       MLX5_IB_MTT_PRESENT);
-		/* Clear padding after the pages
-		 * brought from the umem.
-		 */
-		memset(xlt + (npages * sizeof(struct mlx5_mtt)), 0,
-		       size - npages * sizeof(struct mlx5_mtt));
-	}
-
-	return npages;
-}
-
 #define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
 			    MLX5_UMR_MTT_ALIGNMENT)
 #define MLX5_SPARE_UMR_CHUNK 0x10000
@@ -921,6 +891,7 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 	size_t pages_mapped = 0;
 	size_t pages_to_map = 0;
 	size_t pages_iter = 0;
+	size_t size_to_map = 0;
 	gfp_t gfp;
 	bool use_emergency_page = false;
 
@@ -967,6 +938,15 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		goto free_xlt;
 	}
 
+	if (mr->umem->is_odp) {
+		if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
+			struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+			size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
+
+			pages_to_map = min_t(size_t, pages_to_map, max_pages);
+		}
+	}
+
 	sg.addr = dma;
 	sg.lkey = dev->umrc.pd->local_dma_lkey;
 
@@ -989,14 +969,22 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 	     pages_mapped < pages_to_map && !err;
 	     pages_mapped += pages_iter, idx += pages_iter) {
 		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
+		size_to_map = npages * desc_size;
 		dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
-		npages = populate_xlt(mr, idx, npages, xlt,
-				      page_shift, size, flags);
-
+		if (mr->umem->is_odp) {
+			mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+		} else {
+			__mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx,
+					       npages, xlt,
+					       MLX5_IB_MTT_PRESENT);
+			/* Clear padding after the pages
+			 * brought from the umem.
+			 */
+			memset(xlt + size_to_map, 0, size - size_to_map);
+		}
 		dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
 
-		sg.length = ALIGN(npages * desc_size,
-				  MLX5_UMR_MTT_ALIGNMENT);
+		sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
 
 		if (pages_mapped + pages_iter >= pages_to_map) {
 			if (flags & MLX5_IB_UPD_XLT_ENABLE)
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index f924250f80c2..92da6c4f7ddd 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -93,8 +93,8 @@ struct mlx5_pagefault {
 
 static u64 mlx5_imr_ksm_entries;
 
-void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
-			   struct mlx5_ib_mr *imr, int flags)
+static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
+			struct mlx5_ib_mr *imr, int flags)
 {
 	struct mlx5_klm *end = pklm + nentries;
 
@@ -144,6 +144,44 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 	}
 }
 
+static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
+{
+	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
+
+	if (umem_dma & ODP_READ_ALLOWED_BIT)
+		mtt_entry |= MLX5_IB_MTT_READ;
+	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
+		mtt_entry |= MLX5_IB_MTT_WRITE;
+
+	return mtt_entry;
+}
+
+static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
+			 struct mlx5_ib_mr *mr, int flags)
+{
+	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	dma_addr_t pa;
+	size_t i;
+
+	if (flags & MLX5_IB_UPD_XLT_ZAP)
+		return;
+
+	for (i = 0; i < nentries; i++) {
+		pa = odp->dma_list[idx + i];
+		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+	}
+}
+
+void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+			   struct mlx5_ib_mr *mr, int flags)
+{
+	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
+		populate_klm(xlt, idx, nentries, mr, flags);
+	} else {
+		populate_mtt(xlt, idx, nentries, mr, flags);
+	}
+}
+
 static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
-- 
2.20.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH rdma-rc 2/3] IB/core: Fix ODP get user pages flow
  2019-12-19 13:46 [PATCH rdma-rc 0/3] ODP Fixes Leon Romanovsky
  2019-12-19 13:46 ` [PATCH rdma-rc 1/3] IB/mlx5: Unify ODP MR code paths to allow extra flexibility Leon Romanovsky
@ 2019-12-19 13:46 ` Leon Romanovsky
  2019-12-19 19:05   ` Jason Gunthorpe
  2019-12-19 13:46 ` [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling Leon Romanovsky
  2 siblings, 1 reply; 13+ messages in thread
From: Leon Romanovsky @ 2019-12-19 13:46 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Artemy Kovalyov,
	Aviad Yehezkel, Jason Gunthorpe, Yishai Hadas

From: Yishai Hadas <yishaih@mellanox.com>

When calling get_user_pages_remote(), we need to work with a granularity
of PAGE_SIZE.

Fix the calculation of how many entries can be read in one call to
take that into account.

Fixes: 403cd12e2cf7 ("IB/umem: Add contiguous ODP support")
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/umem_odp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index e42d44e501fd..2e9ee7adab13 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -440,7 +440,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 
 	while (bcnt > 0) {
 		const size_t gup_num_pages = min_t(size_t,
-				(bcnt + BIT(page_shift) - 1) >> page_shift,
+				ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
 				PAGE_SIZE / sizeof(struct page *));
 
 		down_read(&owning_mm->mmap_sem);
-- 
2.20.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling
  2019-12-19 13:46 [PATCH rdma-rc 0/3] ODP Fixes Leon Romanovsky
  2019-12-19 13:46 ` [PATCH rdma-rc 1/3] IB/mlx5: Unify ODP MR code paths to allow extra flexibility Leon Romanovsky
  2019-12-19 13:46 ` [PATCH rdma-rc 2/3] IB/core: Fix ODP get user pages flow Leon Romanovsky
@ 2019-12-19 13:46 ` Leon Romanovsky
  2019-12-19 18:32   ` Jason Gunthorpe
  2020-01-08 12:56   ` Geert Uytterhoeven
  2 siblings, 2 replies; 13+ messages in thread
From: Leon Romanovsky @ 2019-12-19 13:46 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Artemy Kovalyov,
	Aviad Yehezkel, Jason Gunthorpe, Yishai Hadas

From: Yishai Hadas <yishaih@mellanox.com>

As VMAs for a given range might not be available as part of the
registration phase in ODP, IB_ACCESS_HUGETLB/page_shift must be checked
as part of the page fault flow.

If the application didn't mmap the backing memory with huge pages, or
released part of that hugepage area, an error will be set as part of the
page fault flow once this is detected.

Fixes: 0008b84ea9af ("IB/umem: Add support to huge ODP")
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Reviewed-by: Aviad Yehezkel <aviadye@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/umem_odp.c | 37 +++++++++++++++---------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 2e9ee7adab13..533271897908 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -241,22 +241,10 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
 	umem_odp->umem.owning_mm = mm = current->mm;
 	umem_odp->notifier.ops = ops;
 
-	umem_odp->page_shift = PAGE_SHIFT;
-	if (access & IB_ACCESS_HUGETLB) {
-		struct vm_area_struct *vma;
-		struct hstate *h;
-
-		down_read(&mm->mmap_sem);
-		vma = find_vma(mm, ib_umem_start(umem_odp));
-		if (!vma || !is_vm_hugetlb_page(vma)) {
-			up_read(&mm->mmap_sem);
-			ret = -EINVAL;
-			goto err_free;
-		}
-		h = hstate_vma(vma);
-		umem_odp->page_shift = huge_page_shift(h);
-		up_read(&mm->mmap_sem);
-	}
+	if (access & IB_ACCESS_HUGETLB)
+		umem_odp->page_shift = HPAGE_SHIFT;
+	else
+		umem_odp->page_shift = PAGE_SHIFT;
 
 	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
 	ret = ib_init_umem_odp(umem_odp, ops);
@@ -266,7 +254,6 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
 
 err_put_pid:
 	put_pid(umem_odp->tgid);
-err_free:
 	kfree(umem_odp);
 	return ERR_PTR(ret);
 }
@@ -403,6 +390,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 	int j, k, ret = 0, start_idx, npages = 0;
 	unsigned int flags = 0, page_shift;
 	phys_addr_t p = 0;
+	struct vm_area_struct **vmas;
 
 	if (access_mask == 0)
 		return -EINVAL;
@@ -415,6 +403,12 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 	if (!local_page_list)
 		return -ENOMEM;
 
+	vmas = (struct vm_area_struct **)__get_free_page(GFP_KERNEL);
+	if (!vmas) {
+		ret = -ENOMEM;
+		goto out_free_page_list;
+	}
+
 	page_shift = umem_odp->page_shift;
 	page_mask = ~(BIT(page_shift) - 1);
 	off = user_virt & (~page_mask);
@@ -453,7 +447,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 		 */
 		npages = get_user_pages_remote(owning_process, owning_mm,
 				user_virt, gup_num_pages,
-				flags, local_page_list, NULL, NULL);
+				flags, local_page_list, vmas, NULL);
 		up_read(&owning_mm->mmap_sem);
 
 		if (npages < 0) {
@@ -477,6 +471,11 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 				continue;
 			}
 
+			if ((1 << page_shift) > vma_kernel_pagesize(vmas[j])) {
+				ret = -EFAULT;
+				break;
+			}
+
 			ret = ib_umem_odp_map_dma_single_page(
 					umem_odp, k, local_page_list[j],
 					access_mask, current_seq);
@@ -517,6 +516,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 out_put_task:
 	if (owning_process)
 		put_task_struct(owning_process);
+	free_page((unsigned long)vmas);
+out_free_page_list:
 	free_page((unsigned long)local_page_list);
 	return ret;
 }
-- 
2.20.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling
  2019-12-19 13:46 ` [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling Leon Romanovsky
@ 2019-12-19 18:32   ` Jason Gunthorpe
  2019-12-20  4:51     ` Artemy Kovalyov
  2020-01-08 12:56   ` Geert Uytterhoeven
  1 sibling, 1 reply; 13+ messages in thread
From: Jason Gunthorpe @ 2019-12-19 18:32 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list,
	Artemy Kovalyov, Aviad Yehezkel, Yishai Hadas

On Thu, Dec 19, 2019 at 03:46:46PM +0200, Leon Romanovsky wrote:

> @@ -403,6 +390,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
>  	int j, k, ret = 0, start_idx, npages = 0;
>  	unsigned int flags = 0, page_shift;
>  	phys_addr_t p = 0;
> +	struct vm_area_struct **vmas;
>  
>  	if (access_mask == 0)
>  		return -EINVAL;
> @@ -415,6 +403,12 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
>  	if (!local_page_list)
>  		return -ENOMEM;
>  
> +	vmas = (struct vm_area_struct **)__get_free_page(GFP_KERNEL);
> +	if (!vmas) {
> +		ret = -ENOMEM;
> +		goto out_free_page_list;
> +	}

I'd rather not do this on the fast path

> +			if ((1 << page_shift) > vma_kernel_pagesize(vmas[j])) {
> +				ret = -EFAULT;
> +				break;
> +			}

And VMAs cannot be dereferenced outside the mmap_sem

There is already logic checking for linear contiguous pages:

                        if (user_virt & ~page_mask) {
                                p += PAGE_SIZE;
                                if (page_to_phys(local_page_list[j]) != p) {
                                        ret = -EFAULT;
                                        break;
                                }

Why do we need to add the vma check?

Jason

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH rdma-rc 2/3] IB/core: Fix ODP get user pages flow
  2019-12-19 13:46 ` [PATCH rdma-rc 2/3] IB/core: Fix ODP get user pages flow Leon Romanovsky
@ 2019-12-19 19:05   ` Jason Gunthorpe
  0 siblings, 0 replies; 13+ messages in thread
From: Jason Gunthorpe @ 2019-12-19 19:05 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list,
	Artemy Kovalyov, Aviad Yehezkel, Yishai Hadas

On Thu, Dec 19, 2019 at 03:46:45PM +0200, Leon Romanovsky wrote:
> From: Yishai Hadas <yishaih@mellanox.com>
> 
> When calling get_user_pages_remote() need to work with granularity of
> PAGE_SIZE.
> 
> Fix the calculation of how many entries can be read in one call to
> consider that.
> 
> Fixes: 403cd12e2cf7 ("IB/umem: Add contiguous ODP support")
> Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> ---
>  drivers/infiniband/core/umem_odp.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>

Jason

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH rdma-rc 1/3] IB/mlx5: Unify ODP MR code paths to allow extra flexibility
  2019-12-19 13:46 ` [PATCH rdma-rc 1/3] IB/mlx5: Unify ODP MR code paths to allow extra flexibility Leon Romanovsky
@ 2019-12-19 19:07   ` Jason Gunthorpe
  2019-12-20  4:54     ` Artemy Kovalyov
  0 siblings, 1 reply; 13+ messages in thread
From: Jason Gunthorpe @ 2019-12-19 19:07 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list,
	Artemy Kovalyov, Aviad Yehezkel, Yishai Hadas

On Thu, Dec 19, 2019 at 03:46:44PM +0200, Leon Romanovsky wrote:
> From: Artemy Kovalyov <artemyko@mellanox.com>
> 
> Building MR translation table in ODP case requires additional flexibility,
> namely random access to DMA addresses. Make both direct and indirect ODP MR
> use same code path, separated from non-ODP MR code path.
> 
> Fixes: d2183c6f1958 ("RDMA/umem: Move page_shift from ib_umem to
> ib_odp_umem")

What does it fix?

Jason

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling
  2019-12-19 18:32   ` Jason Gunthorpe
@ 2019-12-20  4:51     ` Artemy Kovalyov
  2019-12-20 13:35       ` Jason Gunthorpe
  0 siblings, 1 reply; 13+ messages in thread
From: Artemy Kovalyov @ 2019-12-20  4:51 UTC (permalink / raw)
  To: Jason Gunthorpe, Leon Romanovsky
  Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list, Aviad Yehezkel,
	Yishai Hadas



-----Original Message-----
From: Jason Gunthorpe <jgg@ziepe.ca> 
Sent: Thursday, December 19, 2019 8:33 PM
To: Leon Romanovsky <leon@kernel.org>
Cc: Doug Ledford <dledford@redhat.com>; Leon Romanovsky <leonro@mellanox.com>; RDMA mailing list <linux-rdma@vger.kernel.org>; Artemy Kovalyov <artemyko@mellanox.com>; Aviad Yehezkel <aviadye@mellanox.com>; Yishai Hadas <yishaih@mellanox.com>
Subject: Re: [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling

On Thu, Dec 19, 2019 at 03:46:46PM +0200, Leon Romanovsky wrote:

> @@ -403,6 +390,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
>  	int j, k, ret = 0, start_idx, npages = 0;
>  	unsigned int flags = 0, page_shift;
>  	phys_addr_t p = 0;
> +	struct vm_area_struct **vmas;
>  
>  	if (access_mask == 0)
>  		return -EINVAL;
> @@ -415,6 +403,12 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
>  	if (!local_page_list)
>  		return -ENOMEM;
>  
> +	vmas = (struct vm_area_struct **)__get_free_page(GFP_KERNEL);
> +	if (!vmas) {
> +		ret = -ENOMEM;
> +		goto out_free_page_list;
> +	}

I'd rather not do this on the fast path

> +			if ((1 << page_shift) > vma_kernel_pagesize(vmas[j])) {
> +				ret = -EFAULT;
> +				break;
> +			}

And vma's cannot be de-refenced outside the mmap_sem

There is already logic checking for linear contiguous pages:

                        if (user_virt & ~page_mask) {
                                p += PAGE_SIZE;
                                if (page_to_phys(local_page_list[j]) != p) {
                                        ret = -EFAULT;
                                        break;
                                }

Why do we need to add the vma check?

AK: checking for linear contiguity may not be enough: the page may be a transparent huge page, so in addition we also ensure it is indeed persistent.

Jason

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH rdma-rc 1/3] IB/mlx5: Unify ODP MR code paths to allow extra flexibility
  2019-12-19 19:07   ` Jason Gunthorpe
@ 2019-12-20  4:54     ` Artemy Kovalyov
  0 siblings, 0 replies; 13+ messages in thread
From: Artemy Kovalyov @ 2019-12-20  4:54 UTC (permalink / raw)
  To: Jason Gunthorpe, Leon Romanovsky
  Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list, Aviad Yehezkel,
	Yishai Hadas



-----Original Message-----
From: Jason Gunthorpe <jgg@ziepe.ca> 
Sent: Thursday, December 19, 2019 9:07 PM
To: Leon Romanovsky <leon@kernel.org>
Cc: Doug Ledford <dledford@redhat.com>; Leon Romanovsky <leonro@mellanox.com>; RDMA mailing list <linux-rdma@vger.kernel.org>; Artemy Kovalyov <artemyko@mellanox.com>; Aviad Yehezkel <aviadye@mellanox.com>; Yishai Hadas <yishaih@mellanox.com>
Subject: Re: [PATCH rdma-rc 1/3] IB/mlx5: Unify ODP MR code paths to allow extra flexibility

On Thu, Dec 19, 2019 at 03:46:44PM +0200, Leon Romanovsky wrote:
> From: Artemy Kovalyov <artemyko@mellanox.com>
> 
> Building MR translation table in ODP case requires additional 
> flexibility, namely random access to DMA addresses. Make both direct 
> and indirect ODP MR use same code path, separated from non-ODP MR code path.
> 
> Fixes: d2183c6f1958 ("RDMA/umem: Move page_shift from ib_umem to
> ib_odp_umem")

What does it fix?

AK: it fixes huge page ODP MR handling, which was broken in the __mlx5_ib_populate_pas shift calculation

Jason

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling
  2019-12-20  4:51     ` Artemy Kovalyov
@ 2019-12-20 13:35       ` Jason Gunthorpe
  2019-12-22  9:56         ` Yishai Hadas
  0 siblings, 1 reply; 13+ messages in thread
From: Jason Gunthorpe @ 2019-12-20 13:35 UTC (permalink / raw)
  To: Artemy Kovalyov
  Cc: Leon Romanovsky, Doug Ledford, Leon Romanovsky,
	RDMA mailing list, Aviad Yehezkel, Yishai Hadas

On Fri, Dec 20, 2019 at 04:51:00AM +0000, Artemy Kovalyov wrote:

> AK: checking for linear contiguity may be not enough, page may be
> transparent huge page, so in addition we also ensure its indeed
> persistent.

??

I don't understand how it makes a difference. If the physical is
linear we don't care if it is THP or just a lucky list of 4k pages?

Jason

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling
  2019-12-20 13:35       ` Jason Gunthorpe
@ 2019-12-22  9:56         ` Yishai Hadas
  0 siblings, 0 replies; 13+ messages in thread
From: Yishai Hadas @ 2019-12-22  9:56 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Artemy Kovalyov, Leon Romanovsky, Doug Ledford, Leon Romanovsky,
	RDMA mailing list, Aviad Yehezkel, Yishai Hadas

On 12/20/2019 3:35 PM, Jason Gunthorpe wrote:
> On Fri, Dec 20, 2019 at 04:51:00AM +0000, Artemy Kovalyov wrote:
> 
>> AK: checking for linear contiguity may be not enough, page may be
>> transparent huge page, so in addition we also ensure its indeed
>> persistent.
> 
> ??
> 
> I don't understand how it makes a difference. If the physical is
> linear we don't care if it is THP or just a lucky list of 4k pages?
> 

Agreed, once we are fine with allowing THP or a lucky list of 4k pages,
there is no need for the extra VMA checking to force persistent huge pages.

In V1 we will clean up the VMA part of the patch and adapt the commit
message accordingly.

Yishai

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling
  2019-12-19 13:46 ` [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling Leon Romanovsky
  2019-12-19 18:32   ` Jason Gunthorpe
@ 2020-01-08 12:56   ` Geert Uytterhoeven
  2020-01-08 15:24     ` Yishai Hadas
  1 sibling, 1 reply; 13+ messages in thread
From: Geert Uytterhoeven @ 2020-01-08 12:56 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, Jason Gunthorpe, Leon Romanovsky,
	RDMA mailing list, Artemy Kovalyov, Aviad Yehezkel,
	Jason Gunthorpe, Yishai Hadas, linux-kernel, linux-next,
	linux-mm

 	Hi Leon,

On Thu, 19 Dec 2019, Leon Romanovsky wrote:
> From: Yishai Hadas <yishaih@mellanox.com>
>
> As VMAs for a given range might not be available as part of the
> registration phase in ODP, IB_ACCESS_HUGETLB/page_shift must be checked
> as part of the page fault flow.
>
> If the application didn't mmap the backed memory with huge pages or
> released part of that hugepage area, an error will be set as part of the
> page fault flow once be detected.
>
> Fixes: 0008b84ea9af ("IB/umem: Add support to huge ODP")
> Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
> Reviewed-by: Aviad Yehezkel <aviadye@mellanox.com>
> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>

Thanks for your patch!

> --- a/drivers/infiniband/core/umem_odp.c
> +++ b/drivers/infiniband/core/umem_odp.c
> @@ -241,22 +241,10 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
> 	umem_odp->umem.owning_mm = mm = current->mm;
> 	umem_odp->notifier.ops = ops;
>
> -	umem_odp->page_shift = PAGE_SHIFT;
> -	if (access & IB_ACCESS_HUGETLB) {
> -		struct vm_area_struct *vma;
> -		struct hstate *h;
> -
> -		down_read(&mm->mmap_sem);
> -		vma = find_vma(mm, ib_umem_start(umem_odp));
> -		if (!vma || !is_vm_hugetlb_page(vma)) {
> -			up_read(&mm->mmap_sem);
> -			ret = -EINVAL;
> -			goto err_free;
> -		}
> -		h = hstate_vma(vma);
> -		umem_odp->page_shift = huge_page_shift(h);
> -		up_read(&mm->mmap_sem);
> -	}
> +	if (access & IB_ACCESS_HUGETLB)
> +		umem_odp->page_shift = HPAGE_SHIFT;
> +	else
> +		umem_odp->page_shift = PAGE_SHIFT;
>
> 	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
> 	ret = ib_init_umem_odp(umem_odp, ops);

noreply@ellerman.id.au reports for linux-next/m68k-allmodconfig/m68k:

     drivers/infiniband/core/umem_odp.c:245:26: error: 'HPAGE_SHIFT' undeclared (first use in this function); did you mean 'PAGE_SHIFT'?

Should this depend on some HUGETLBFS option?

Gr{oetje,eeting}s,

 						Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
 							    -- Linus Torvalds

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling
  2020-01-08 12:56   ` Geert Uytterhoeven
@ 2020-01-08 15:24     ` Yishai Hadas
  0 siblings, 0 replies; 13+ messages in thread
From: Yishai Hadas @ 2020-01-08 15:24 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Leon Romanovsky, Doug Ledford, Jason Gunthorpe, Leon Romanovsky,
	RDMA mailing list, Artemy Kovalyov, Aviad Yehezkel,
	Jason Gunthorpe, Yishai Hadas, linux-kernel, linux-next,
	linux-mm

On 1/8/2020 2:56 PM, Geert Uytterhoeven wrote:
>      Hi Leon,
> 
> On Thu, 19 Dec 2019, Leon Romanovsky wrote:
>> From: Yishai Hadas <yishaih@mellanox.com>
>>
>> As VMAs for a given range might not be available as part of the
>> registration phase in ODP, IB_ACCESS_HUGETLB/page_shift must be checked
>> as part of the page fault flow.
>>
>> If the application didn't mmap the backed memory with huge pages or
>> released part of that hugepage area, an error will be set as part of the
>> page fault flow once be detected.
>>
>> Fixes: 0008b84ea9af ("IB/umem: Add support to huge ODP")
>> Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
>> Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
>> Reviewed-by: Aviad Yehezkel <aviadye@mellanox.com>
>> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> 
> Thanks for your patch!
> 
>> --- a/drivers/infiniband/core/umem_odp.c
>> +++ b/drivers/infiniband/core/umem_odp.c
>> @@ -241,22 +241,10 @@ struct ib_umem_odp *ib_umem_odp_get(struct 
>> ib_udata *udata, unsigned long addr,
>>     umem_odp->umem.owning_mm = mm = current->mm;
>>     umem_odp->notifier.ops = ops;
>>
>> -    umem_odp->page_shift = PAGE_SHIFT;
>> -    if (access & IB_ACCESS_HUGETLB) {
>> -        struct vm_area_struct *vma;
>> -        struct hstate *h;
>> -
>> -        down_read(&mm->mmap_sem);
>> -        vma = find_vma(mm, ib_umem_start(umem_odp));
>> -        if (!vma || !is_vm_hugetlb_page(vma)) {
>> -            up_read(&mm->mmap_sem);
>> -            ret = -EINVAL;
>> -            goto err_free;
>> -        }
>> -        h = hstate_vma(vma);
>> -        umem_odp->page_shift = huge_page_shift(h);
>> -        up_read(&mm->mmap_sem);
>> -    }
>> +    if (access & IB_ACCESS_HUGETLB)
>> +        umem_odp->page_shift = HPAGE_SHIFT;
>> +    else
>> +        umem_odp->page_shift = PAGE_SHIFT;
>>
>>     umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
>>     ret = ib_init_umem_odp(umem_odp, ops);
> 
> noreply@ellerman.id.au reports for linux-next/m68k-allmodconfig/m68k:
> 
>      drivers/infiniband/core/umem_odp.c:245:26: error: 'HPAGE_SHIFT' 
> undeclared (first use in this function); did you mean 'PAGE_SHIFT'?
> 
> Should this depend on some HUGETLBFS option?
> 

Thanks for pointing this out.
We would expect to use #ifdef CONFIG_HUGETLB_PAGE, as done in the kernel
code below [1], which also uses HPAGE_SHIFT.

I'll send a patch to 'for-next' to handle it.

[1] 
https://elixir.bootlin.com/linux/v5.3-rc7/source/drivers/misc/sgi-gru/grufault.c#L183

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, back to index

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-19 13:46 [PATCH rdma-rc 0/3] ODP Fixes Leon Romanovsky
2019-12-19 13:46 ` [PATCH rdma-rc 1/3] IB/mlx5: Unify ODP MR code paths to allow extra flexibility Leon Romanovsky
2019-12-19 19:07   ` Jason Gunthorpe
2019-12-20  4:54     ` Artemy Kovalyov
2019-12-19 13:46 ` [PATCH rdma-rc 2/3] IB/core: Fix ODP get user pages flow Leon Romanovsky
2019-12-19 19:05   ` Jason Gunthorpe
2019-12-19 13:46 ` [PATCH rdma-rc 3/3] IB/core: Fix ODP with IB_ACCESS_HUGETLB handling Leon Romanovsky
2019-12-19 18:32   ` Jason Gunthorpe
2019-12-20  4:51     ` Artemy Kovalyov
2019-12-20 13:35       ` Jason Gunthorpe
2019-12-22  9:56         ` Yishai Hadas
2020-01-08 12:56   ` Geert Uytterhoeven
2020-01-08 15:24     ` Yishai Hadas

Linux-RDMA Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-rdma/0 linux-rdma/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-rdma linux-rdma/ https://lore.kernel.org/linux-rdma \
		linux-rdma@vger.kernel.org
	public-inbox-index linux-rdma

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-rdma


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git