From: Jason Gunthorpe <jgg@ziepe.ca>
To: linux-rdma@vger.kernel.org
Cc: Jason Gunthorpe <jgg@mellanox.com>,
	Artemy Kovalyov <artemyko@mellanox.com>
Subject: [PATCH 01/15] RDMA/mlx5: Use SRCU properly in ODP prefetch
Date: Wed,  9 Oct 2019 13:09:21 -0300	[thread overview]
Message-ID: <20191009160934.3143-2-jgg@ziepe.ca> (raw)
In-Reply-To: <20191009160934.3143-1-jgg@ziepe.ca>

From: Jason Gunthorpe <jgg@mellanox.com>

When working with SRCU-protected xarrays, the xarray itself should be the
SRCU 'update' point: readers obtain the pointer from the xarray under
srcu_read_lock() and the updater erases it from the xarray before
synchronize_srcu(). Instead, prefetch is using 'live' as the SRCU update
point, which prevents switching the locking design over to the xarray.
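
A rough sketch of the destroy-side ordering this implies, illustrative
only and not a hunk from this patch (it assumes the mkey is erased from
the table before the MR is torn down; the full conversion happens in
later patches of this series):

	/* The xarray is the update point: unpublish first, then wait */
	xa_erase(&dev->mdev->priv.mkey_table, mlx5_base_mkey(mr->mmkey.key));
	synchronize_srcu(&dev->mr_srcu);
	/* no SRCU reader can still observe the mkey; the MR may be freed */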

To solve this, prefetch must read from the xarray only once and hold on
to the actual MR pointer for the duration of the async
operation. Incrementing num_pending_prefetch delays destruction of the
MR, so it is suitable for this.

Prefetch now calls pagefault_mr() directly with the MR pointer and does
only a single xarray lookup.

All of the checks for whether an MR is prefetchable are now done only in
the prefetch code and removed from the pagefault critical path.
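
A condensed sketch of the resulting flow, for illustration only (not a
hunk from this patch; the helper names are the ones introduced below):

	/* Synchronous (FLUSH) path: one lookup, fault under the SRCU lock */
	srcu_key = srcu_read_lock(&dev->mr_srcu);
	mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey); /* one xa_load() */
	if (mr)
		ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
				   &bytes_mapped, pf_flags);
	srcu_read_unlock(&dev->mr_srcu, srcu_key);

	/* Async path: pin the MR before leaving the SRCU read side */
	atomic_inc(&mr->num_pending_prefetch);	/* delays MR destruction */
	queue_work(system_unbound_wq, &work->work);
	/* ... later, in mlx5_ib_prefetch_mr_work(), no SRCU needed: */
	pagefault_mr(mr, io_virt, length, &bytes_mapped, pf_flags);
	atomic_dec(&mr->num_pending_prefetch);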

Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/odp.c | 261 ++++++++++++++-----------------
 1 file changed, 120 insertions(+), 141 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 3f9478d1937668..4cc5b9420d3ec4 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -606,16 +606,13 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
 }
 
-#define MLX5_PF_FLAGS_PREFETCH  BIT(0)
 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
-static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
-			u64 io_virt, size_t bcnt, u32 *bytes_mapped,
-			u32 flags)
+static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
+			u32 *bytes_mapped, u32 flags)
 {
 	int npages = 0, current_seq, page_shift, ret, np;
 	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
 	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
-	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
 	u64 access_mask;
 	u64 start_idx, page_mask;
 	struct ib_umem_odp *odp;
@@ -639,14 +636,6 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
 	access_mask = ODP_READ_ALLOWED_BIT;
 
-	if (prefetch && !downgrade && !odp->umem.writable) {
-		/* prefetch with write-access must
-		 * be supported by the MR
-		 */
-		ret = -EINVAL;
-		goto out;
-	}
-
 	if (odp->umem.writable && !downgrade)
 		access_mask |= ODP_WRITE_ALLOWED_BIT;
 
@@ -681,7 +670,8 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 
 	if (ret < 0) {
 		if (ret != -EAGAIN)
-			mlx5_ib_err(dev, "Failed to update mkey page tables\n");
+			mlx5_ib_err(mr->dev,
+				    "Failed to update mkey page tables\n");
 		goto out;
 	}
 
@@ -700,8 +690,10 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 		io_virt += size;
 		next = odp_next(odp);
 		if (unlikely(!next || ib_umem_start(next) != io_virt)) {
-			mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
-				    io_virt, next);
+			mlx5_ib_dbg(
+				mr->dev,
+				"next implicit leaf removed at 0x%llx. got %p\n",
+				io_virt, next);
 			return -EAGAIN;
 		}
 		odp = next;
@@ -718,7 +710,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 		if (!wait_for_completion_timeout(&odp->notifier_completion,
 						 timeout)) {
 			mlx5_ib_warn(
-				dev,
+				mr->dev,
 				"timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
 				current_seq, odp->notifiers_seq,
 				odp->notifiers_count);
@@ -775,10 +767,9 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 					 struct ib_pd *pd, u32 key,
 					 u64 io_virt, size_t bcnt,
 					 u32 *bytes_committed,
-					 u32 *bytes_mapped, u32 flags)
+					 u32 *bytes_mapped)
 {
 	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
-	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
 	struct pf_frame *head = NULL, *frame;
 	struct mlx5_core_mkey *mmkey;
 	struct mlx5_ib_mr *mr;
@@ -800,12 +791,6 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 		goto srcu_unlock;
 	}
 
-	if (prefetch && mmkey->type != MLX5_MKEY_MR) {
-		mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n");
-		ret = -EINVAL;
-		goto srcu_unlock;
-	}
-
 	switch (mmkey->type) {
 	case MLX5_MKEY_MR:
 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
@@ -815,17 +800,6 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 			goto srcu_unlock;
 		}
 
-		if (prefetch) {
-			if (!is_odp_mr(mr) ||
-			    mr->ibmr.pd != pd) {
-				mlx5_ib_dbg(dev, "Invalid prefetch request: %s\n",
-					    is_odp_mr(mr) ?  "MR is not ODP" :
-					    "PD is not of the MR");
-				ret = -EINVAL;
-				goto srcu_unlock;
-			}
-		}
-
 		if (!is_odp_mr(mr)) {
 			mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
 				    key);
@@ -835,7 +809,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 			goto srcu_unlock;
 		}
 
-		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags);
+		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
 		if (ret < 0)
 			goto srcu_unlock;
 
@@ -1009,7 +983,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
 		ret = pagefault_single_data_segment(dev, NULL, key,
 						    io_virt, bcnt,
 						    &pfault->bytes_committed,
-						    bytes_mapped, 0);
+						    bytes_mapped);
 		if (ret < 0)
 			break;
 		npages += ret;
@@ -1292,8 +1266,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
 	}
 
 	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
-					    &pfault->bytes_committed, NULL,
-					    0);
+					    &pfault->bytes_committed, NULL);
 	if (ret == -EAGAIN) {
 		/* We're racing with an invalidation, don't prefetch */
 		prefetch_activated = 0;
@@ -1320,8 +1293,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
 
 		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
 						    prefetch_len,
-						    &bytes_committed, NULL,
-						    0);
+						    &bytes_committed, NULL);
 		if (ret < 0 && ret != -EAGAIN) {
 			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
 				    ret, pfault->token, address, prefetch_len);
@@ -1624,114 +1596,137 @@ int mlx5_ib_odp_init(void)
 
 struct prefetch_mr_work {
 	struct work_struct work;
-	struct ib_pd *pd;
 	u32 pf_flags;
 	u32 num_sge;
-	struct ib_sge sg_list[0];
+	struct {
+		u64 io_virt;
+		struct mlx5_ib_mr *mr;
+		size_t length;
+	} frags[];
 };
 
-static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev,
-				     struct ib_sge *sg_list, u32 num_sge,
-				     u32 from)
+static void destroy_prefetch_work(struct prefetch_mr_work *work)
 {
 	u32 i;
-	int srcu_key;
-
-	srcu_key = srcu_read_lock(&dev->mr_srcu);
 
-	for (i = from; i < num_sge; ++i) {
-		struct mlx5_core_mkey *mmkey;
-		struct mlx5_ib_mr *mr;
-
-		mmkey = xa_load(&dev->mdev->priv.mkey_table,
-				mlx5_base_mkey(sg_list[i].lkey));
-		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
-		atomic_dec(&mr->num_pending_prefetch);
-	}
-
-	srcu_read_unlock(&dev->mr_srcu, srcu_key);
+	for (i = 0; i < work->num_sge; ++i)
+		atomic_dec(&work->frags[i].mr->num_pending_prefetch);
+	kvfree(work);
 }
 
-static bool num_pending_prefetch_inc(struct ib_pd *pd,
-				     struct ib_sge *sg_list, u32 num_sge)
+static struct mlx5_ib_mr *
+get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
+		    u32 lkey)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	bool ret = true;
-	u32 i;
+	struct mlx5_core_mkey *mmkey;
+	struct ib_umem_odp *odp;
+	struct mlx5_ib_mr *mr;
 
-	for (i = 0; i < num_sge; ++i) {
-		struct mlx5_core_mkey *mmkey;
-		struct mlx5_ib_mr *mr;
+	lockdep_assert_held(&dev->mr_srcu);
 
-		mmkey = xa_load(&dev->mdev->priv.mkey_table,
-				mlx5_base_mkey(sg_list[i].lkey));
-		if (!mmkey || mmkey->key != sg_list[i].lkey) {
-			ret = false;
-			break;
-		}
+	mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(lkey));
+	if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
+		return NULL;
 
-		if (mmkey->type != MLX5_MKEY_MR) {
-			ret = false;
-			break;
-		}
+	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
 
-		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+	if (!smp_load_acquire(&mr->live))
+		return NULL;
 
-		if (!smp_load_acquire(&mr->live)) {
-			ret = false;
-			break;
-		}
+	if (mr->ibmr.pd != pd || !is_odp_mr(mr))
+		return NULL;
 
-		if (mr->ibmr.pd != pd) {
-			ret = false;
-			break;
-		}
+	/*
+	 * Implicit child MRs are internal and userspace should not refer to
+	 * them.
+	 */
+	if (mr->parent)
+		return NULL;
 
-		atomic_inc(&mr->num_pending_prefetch);
-	}
+	odp = to_ib_umem_odp(mr->umem);
 
-	if (!ret)
-		num_pending_prefetch_dec(dev, sg_list, i, 0);
+	/* prefetch with write-access must be supported by the MR */
+	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
+	    !odp->umem.writable)
+		return NULL;
 
-	return ret;
+	return mr;
 }
 
-static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, u32 pf_flags,
-				    struct ib_sge *sg_list, u32 num_sge)
+static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
 {
+	struct prefetch_mr_work *work =
+		container_of(w, struct prefetch_mr_work, work);
+	u32 bytes_mapped = 0;
 	u32 i;
-	int ret = 0;
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+
+	for (i = 0; i < work->num_sge; ++i)
+		pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
+			     work->frags[i].length, &bytes_mapped,
+			     work->pf_flags);
+
+	destroy_prefetch_work(work);
+}
+
+static bool init_prefetch_work(struct ib_pd *pd,
+			       enum ib_uverbs_advise_mr_advice advice,
+			       u32 pf_flags, struct prefetch_mr_work *work,
+			       struct ib_sge *sg_list, u32 num_sge)
+{
+	u32 i;
+
+	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
+	work->pf_flags = pf_flags;
 
 	for (i = 0; i < num_sge; ++i) {
-		struct ib_sge *sg = &sg_list[i];
-		int bytes_committed = 0;
+		work->frags[i].io_virt = sg_list[i].addr;
+		work->frags[i].length = sg_list[i].length;
+		work->frags[i].mr =
+			get_prefetchable_mr(pd, advice, sg_list[i].lkey);
+		if (!work->frags[i].mr) {
+			work->num_sge = i;
+			if (i)
+				destroy_prefetch_work(work);
+			return false;
+		}
 
-		ret = pagefault_single_data_segment(dev, pd, sg->lkey, sg->addr,
-						    sg->length,
-						    &bytes_committed, NULL,
-						    pf_flags);
-		if (ret < 0)
-			break;
+		/* Keep the MR pointer valid outside the SRCU read lock */
+		atomic_inc(&work->frags[i].mr->num_pending_prefetch);
 	}
-
-	return ret < 0 ? ret : 0;
+	work->num_sge = num_sge;
+	return true;
 }
 
-static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
+static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
+				    enum ib_uverbs_advise_mr_advice advice,
+				    u32 pf_flags, struct ib_sge *sg_list,
+				    u32 num_sge)
 {
-	struct prefetch_mr_work *w =
-		container_of(work, struct prefetch_mr_work, work);
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	u32 bytes_mapped = 0;
+	int srcu_key;
+	int ret = 0;
+	u32 i;
 
-	if (ib_device_try_get(w->pd->device)) {
-		mlx5_ib_prefetch_sg_list(w->pd, w->pf_flags, w->sg_list,
-					 w->num_sge);
-		ib_device_put(w->pd->device);
+	srcu_key = srcu_read_lock(&dev->mr_srcu);
+	for (i = 0; i < num_sge; ++i) {
+		struct mlx5_ib_mr *mr;
+
+		mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
+		if (!mr) {
+			ret = -ENOENT;
+			goto out;
+		}
+		ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
+				   &bytes_mapped, pf_flags);
+		if (ret < 0)
+			goto out;
 	}
 
-	num_pending_prefetch_dec(to_mdev(w->pd->device), w->sg_list,
-				 w->num_sge, 0);
-	kvfree(w);
+out:
+	srcu_read_unlock(&dev->mr_srcu, srcu_key);
+	return ret;
 }
 
 int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
@@ -1739,43 +1734,27 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
+	u32 pf_flags = 0;
 	struct prefetch_mr_work *work;
-	bool valid_req;
 	int srcu_key;
 
 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
 		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
 
 	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
-		return mlx5_ib_prefetch_sg_list(pd, pf_flags, sg_list,
+		return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
 						num_sge);
 
-	work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL);
+	work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
 	if (!work)
 		return -ENOMEM;
 
-	memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
-
-	/* It is guaranteed that the pd when work is executed is the pd when
-	 * work was queued since pd can't be destroyed while it holds MRs and
-	 * destroying a MR leads to flushing the workquque
-	 */
-	work->pd = pd;
-	work->pf_flags = pf_flags;
-	work->num_sge = num_sge;
-
-	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
-
 	srcu_key = srcu_read_lock(&dev->mr_srcu);
-
-	valid_req = num_pending_prefetch_inc(pd, sg_list, num_sge);
-	if (valid_req)
-		queue_work(system_unbound_wq, &work->work);
-	else
-		kvfree(work);
-
+	if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
+		srcu_read_unlock(&dev->mr_srcu, srcu_key);
+		return -EINVAL;
+	}
+	queue_work(system_unbound_wq, &work->work);
 	srcu_read_unlock(&dev->mr_srcu, srcu_key);
-
-	return valid_req ? 0 : -EINVAL;
+	return 0;
 }
-- 
2.23.0

