All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] IB/rdmavt: Fix concurrency panics in QP post_send and modify to error
@ 2019-03-25 21:16 Michael J. Ruhl
  0 siblings, 0 replies; only message in thread
From: Michael J. Ruhl @ 2019-03-25 21:16 UTC (permalink / raw)
  To: stable; +Cc: ruhlbiz

From: Michael J. Ruhl <michael.j.ruhl@intel.com>

commit d757c60eca9b22f4d108929a24401e0fdecda0b1 upstream

The RC/UC code path can go through a software loopback. In this code path
the receive side QP is manipulated.

If two threads are working on the QP receive side (i.e. post_send, and
modify_qp to an error state), QP information can be corrupted.

(post_send via loopback)
  set r_sge
  loop
     update r_sge
(modify_qp)
     take r_lock
     update r_sge <---- r_sge is now incorrect
(post_send)
     update r_sge <---- crash, etc.
     ...

This can lead to one of the two following crashes:

 BUG: unable to handle kernel NULL pointer dereference at (null)
  IP:  hfi1_copy_sge+0xf1/0x2e0 [hfi1]
  PGD 8000001fe6a57067 PUD 1fd9e0c067 PMD 0
 Call Trace:
  ruc_loopback+0x49b/0xbc0 [hfi1]
  hfi1_do_send+0x38e/0x3e0 [hfi1]
  _hfi1_do_send+0x1e/0x20 [hfi1]
  process_one_work+0x17f/0x440
  worker_thread+0x126/0x3c0
  kthread+0xd1/0xe0
  ret_from_fork_nospec_begin+0x21/0x21

or:

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000048
  IP:  rvt_clear_mr_refs+0x45/0x370 [rdmavt]
  PGD 80000006ae5eb067 PUD ef15d0067 PMD 0
 Call Trace:
  rvt_error_qp+0xaa/0x240 [rdmavt]
  rvt_modify_qp+0x47f/0xaa0 [rdmavt]
  ib_security_modify_qp+0x8f/0x400 [ib_core]
  ib_modify_qp_with_udata+0x44/0x70 [ib_core]
  modify_qp.isra.23+0x1eb/0x2b0 [ib_uverbs]
  ib_uverbs_modify_qp+0xaa/0xf0 [ib_uverbs]
  ib_uverbs_write+0x272/0x430 [ib_uverbs]
  vfs_write+0xc0/0x1f0
  SyS_write+0x7f/0xf0
  system_call_fastpath+0x1c/0x21

Fix by using the appropriate locking on the receiving QP.

Fixes: 83693bd14606 ("staging/rdma/hfi1: Use rdmavt version of post_send")
Cc: <stable@vger.kernel.org> #v4.9
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/ruc.c |   33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index d151429..2723284 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -346,6 +346,18 @@ int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct ib_header *hdr,
 	return 1;
 }
 
+static enum ib_wc_status loopback_qp_drop(struct rvt_ibport *rvp,
+					  struct rvt_qp *sqp)
+{
+	rvp->n_pkt_drops++;
+	/*
+	 * For RC, the requester would timeout and retry so
+	 * shortcut the timeouts and just signal too many retries.
+	 */
+	return sqp->ibqp.qp_type == IB_QPT_RC ?
+		IB_WC_RETRY_EXC_ERR : IB_WC_SUCCESS;
+}
+
 /**
  * ruc_loopback - handle UC and RC loopback requests
  * @sqp: the sending QP
@@ -418,17 +430,14 @@ static void ruc_loopback(struct rvt_qp *sqp)
 	}
 	spin_unlock_irqrestore(&sqp->s_lock, flags);
 
-	if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
+	if (!qp) {
+		send_status = loopback_qp_drop(&ibp->rvp, sqp);
+		goto serr_no_r_lock;
+	}
+	spin_lock_irqsave(&qp->r_lock, flags);
+	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
 	    qp->ibqp.qp_type != sqp->ibqp.qp_type) {
-		ibp->rvp.n_pkt_drops++;
-		/*
-		 * For RC, the requester would timeout and retry so
-		 * shortcut the timeouts and just signal too many retries.
-		 */
-		if (sqp->ibqp.qp_type == IB_QPT_RC)
-			send_status = IB_WC_RETRY_EXC_ERR;
-		else
-			send_status = IB_WC_SUCCESS;
+		send_status = loopback_qp_drop(&ibp->rvp, sqp);
 		goto serr;
 	}
 
@@ -601,6 +610,7 @@ static void ruc_loopback(struct rvt_qp *sqp)
 		     wqe->wr.send_flags & IB_SEND_SOLICITED);
 
 send_comp:
+	spin_unlock_irqrestore(&qp->r_lock, flags);
 	spin_lock_irqsave(&sqp->s_lock, flags);
 	ibp->rvp.n_loop_pkts++;
 flush_send:
@@ -627,6 +637,7 @@ static void ruc_loopback(struct rvt_qp *sqp)
 	}
 	if (sqp->s_rnr_retry_cnt < 7)
 		sqp->s_rnr_retry--;
+	spin_unlock_irqrestore(&qp->r_lock, flags);
 	spin_lock_irqsave(&sqp->s_lock, flags);
 	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
 		goto clr_busy;
@@ -655,6 +666,8 @@ static void ruc_loopback(struct rvt_qp *sqp)
 	hfi1_rc_error(qp, wc.status);
 
 serr:
+	spin_unlock_irqrestore(&qp->r_lock, flags);
+serr_no_r_lock:
 	spin_lock_irqsave(&sqp->s_lock, flags);
 	hfi1_send_complete(sqp, wqe, send_status);
 	if (sqp->ibqp.qp_type == IB_QPT_RC) {


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2019-03-25 21:16 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-25 21:16 [PATCH] IB/rdmavt: Fix concurrency panics in QP post_send and modify to error Michael J. Ruhl

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.