linux-rdma.vger.kernel.org archive mirror
* [PATCH rdma-next v1 0/5] Fix memory corruption in CM
@ 2021-04-11 12:21 Leon Romanovsky
  2021-04-11 12:21 ` [PATCH rdma-next v1 1/5] Revert "IB/cm: Mark stale CM id's whenever the mad agent was unregistered" Leon Romanovsky
                   ` (4 more replies)
  0 siblings, 5 replies; 7+ messages in thread
From: Leon Romanovsky @ 2021-04-11 12:21 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: Leon Romanovsky, linux-rdma, Mark Zhang

From: Leon Romanovsky <leonro@nvidia.com>

Changelog:
v1:
 * Squashed the "remove mad_agent ..." patches to make sure that we don't
   need to check for a NULL argument.
v0: https://lore.kernel.org/lkml/20210318100309.670344-1-leon@kernel.org

-------------------------------------------------------------------------------

Hi,

This series from Mark fixes a long-standing bug in the CM migration logic,
reported by Ryan [1].

Thanks

[1] https://lore.kernel.org/linux-rdma/CAFMmRNx9cg--NUnZjFM8yWqFaEtsmAWV4EogKb3a0+hnjdtJFA@mail.gmail.com/


Mark Zhang (5):
  Revert "IB/cm: Mark stale CM id's whenever the mad agent was
    unregistered"
  IB/cm: Simplify ib_cancel_mad() and ib_modify_mad() calls
  IB/cm: Clear all associated AV's ports when removing a cm device
  IB/cm: Add lock protection when accessing av/alt_av's port of a cm_id
  IB/cm: Initialize av before acquiring the spin lock in cm_lap_handler

 drivers/infiniband/core/cm.c       | 398 +++++++++++++++++------------
 drivers/infiniband/core/mad.c      |  17 +-
 drivers/infiniband/core/sa_query.c |   4 +-
 include/rdma/ib_mad.h              |  27 +-
 4 files changed, 250 insertions(+), 196 deletions(-)

-- 
2.30.2



* [PATCH rdma-next v1 1/5] Revert "IB/cm: Mark stale CM id's whenever the mad agent was unregistered"
  2021-04-11 12:21 [PATCH rdma-next v1 0/5] Fix memory corruption in CM Leon Romanovsky
@ 2021-04-11 12:21 ` Leon Romanovsky
  2021-04-11 12:21 ` [PATCH rdma-next v1 2/5] IB/cm: Simplify ib_cancel_mad() and ib_modify_mad() calls Leon Romanovsky
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Leon Romanovsky @ 2021-04-11 12:21 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: Mark Zhang, linux-rdma

From: Mark Zhang <markzhang@nvidia.com>

This reverts commit 9db0ff53cb9b43ed75bacd42a89c1a0ab048b2b0, which
was incomplete and still leads to the following panic:

panic @ time 1605623870.843, thread 0xfffffeb63b552000: vm_fault_lookup: fault on nofault entry, addr: 0xfffffe811a94e000
    time = 1605623870
    cpuid = 9, TSC = 0xb7937acc1b6
    Panic occurred in module kernel loaded at 0xffffffff80200000:Stack: --------------------------------------------------
    kernel:vm_fault+0x19da
    kernel:vm_fault_trap+0x6e
    kernel:trap_pfault+0x1f1
    kernel:trap+0x31e
    kernel:cm_destroy_id+0x38c
    kernel:rdma_destroy_id+0x127
    kernel:sdp_shutdown_task+0x3ae
    kernel:taskqueue_run_locked+0x10b
    kernel:taskqueue_thread_loop+0x87
    kernel:fork_exit+0x83

Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/cm.c | 123 +++++------------------------------
 1 file changed, 15 insertions(+), 108 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 32c836b7ae97..e33b730107b4 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -121,8 +121,6 @@ static struct ib_cm {
 	__be32 random_id_operand;
 	struct list_head timewait_list;
 	struct workqueue_struct *wq;
-	/* Sync on cm change port state */
-	spinlock_t state_lock;
 } cm;
 
 /* Counter indexes ordered by attribute ID */
@@ -203,8 +201,6 @@ struct cm_port {
 	struct cm_device *cm_dev;
 	struct ib_mad_agent *mad_agent;
 	u32 port_num;
-	struct list_head cm_priv_prim_list;
-	struct list_head cm_priv_altr_list;
 	struct cm_counter_group counter_group[CM_COUNTER_GROUPS];
 };
 
@@ -284,12 +280,6 @@ struct cm_id_private {
 	u8 service_timeout;
 	u8 target_ack_delay;
 
-	struct list_head prim_list;
-	struct list_head altr_list;
-	/* Indicates that the send port mad is registered and av is set */
-	int prim_send_port_not_ready;
-	int altr_send_port_not_ready;
-
 	struct list_head work_list;
 	atomic_t work_count;
 
@@ -310,47 +300,20 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
 	struct ib_mad_agent *mad_agent;
 	struct ib_mad_send_buf *m;
 	struct ib_ah *ah;
-	struct cm_av *av;
-	unsigned long flags, flags2;
-	int ret = 0;
 
-	/* don't let the port to be released till the agent is down */
-	spin_lock_irqsave(&cm.state_lock, flags2);
-	spin_lock_irqsave(&cm.lock, flags);
-	if (!cm_id_priv->prim_send_port_not_ready)
-		av = &cm_id_priv->av;
-	else if (!cm_id_priv->altr_send_port_not_ready &&
-		 (cm_id_priv->alt_av.port))
-		av = &cm_id_priv->alt_av;
-	else {
-		pr_info("%s: not valid CM id\n", __func__);
-		ret = -ENODEV;
-		spin_unlock_irqrestore(&cm.lock, flags);
-		goto out;
-	}
-	spin_unlock_irqrestore(&cm.lock, flags);
-	/* Make sure the port haven't released the mad yet */
 	mad_agent = cm_id_priv->av.port->mad_agent;
-	if (!mad_agent) {
-		pr_info("%s: not a valid MAD agent\n", __func__);
-		ret = -ENODEV;
-		goto out;
-	}
-	ah = rdma_create_ah(mad_agent->qp->pd, &av->ah_attr, 0);
-	if (IS_ERR(ah)) {
-		ret = PTR_ERR(ah);
-		goto out;
-	}
+	ah = rdma_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr, 0);
+	if (IS_ERR(ah))
+		return PTR_ERR(ah);
 
 	m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn,
-			       av->pkey_index,
+			       cm_id_priv->av.pkey_index,
 			       0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
 			       GFP_ATOMIC,
 			       IB_MGMT_BASE_VERSION);
 	if (IS_ERR(m)) {
 		rdma_destroy_ah(ah, 0);
-		ret = PTR_ERR(m);
-		goto out;
+		return PTR_ERR(m);
 	}
 
 	/* Timeout set by caller if response is expected. */
@@ -360,10 +323,7 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
 	refcount_inc(&cm_id_priv->refcount);
 	m->context[0] = cm_id_priv;
 	*msg = m;
-
-out:
-	spin_unlock_irqrestore(&cm.state_lock, flags2);
-	return ret;
+	return 0;
 }
 
 static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port,
@@ -481,21 +441,6 @@ static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
 				       grh, &av->ah_attr);
 }
 
-static void add_cm_id_to_port_list(struct cm_id_private *cm_id_priv,
-				   struct cm_av *av, struct cm_port *port)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cm.lock, flags);
-	if (&cm_id_priv->av == av)
-		list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list);
-	else if (&cm_id_priv->alt_av == av)
-		list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list);
-	else
-		WARN_ON(true);
-	spin_unlock_irqrestore(&cm.lock, flags);
-}
-
 static struct cm_port *
 get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr)
 {
@@ -539,8 +484,7 @@ get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr)
 
 static int cm_init_av_by_path(struct sa_path_rec *path,
 			      const struct ib_gid_attr *sgid_attr,
-			      struct cm_av *av,
-			      struct cm_id_private *cm_id_priv)
+			      struct cm_av *av)
 {
 	struct rdma_ah_attr new_ah_attr;
 	struct cm_device *cm_dev;
@@ -574,7 +518,6 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
 		return ret;
 
 	av->timeout = path->packet_life_time + 1;
-	add_cm_id_to_port_list(cm_id_priv, av, port);
 	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
 	return 0;
 }
@@ -854,8 +797,6 @@ static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device,
 	spin_lock_init(&cm_id_priv->lock);
 	init_completion(&cm_id_priv->comp);
 	INIT_LIST_HEAD(&cm_id_priv->work_list);
-	INIT_LIST_HEAD(&cm_id_priv->prim_list);
-	INIT_LIST_HEAD(&cm_id_priv->altr_list);
 	atomic_set(&cm_id_priv->work_count, -1);
 	refcount_set(&cm_id_priv->refcount, 1);
 
@@ -1156,12 +1097,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		kfree(cm_id_priv->timewait_info);
 		cm_id_priv->timewait_info = NULL;
 	}
-	if (!list_empty(&cm_id_priv->altr_list) &&
-	    (!cm_id_priv->altr_send_port_not_ready))
-		list_del(&cm_id_priv->altr_list);
-	if (!list_empty(&cm_id_priv->prim_list) &&
-	    (!cm_id_priv->prim_send_port_not_ready))
-		list_del(&cm_id_priv->prim_list);
+
 	WARN_ON(cm_id_priv->listen_sharecount);
 	WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node));
 	if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
@@ -1528,13 +1464,12 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
 	}
 
 	ret = cm_init_av_by_path(param->primary_path,
-				 param->ppath_sgid_attr, &cm_id_priv->av,
-				 cm_id_priv);
+				 param->ppath_sgid_attr, &cm_id_priv->av);
 	if (ret)
 		goto out;
 	if (param->alternate_path) {
 		ret = cm_init_av_by_path(param->alternate_path, NULL,
-					 &cm_id_priv->alt_av, cm_id_priv);
+					 &cm_id_priv->alt_av);
 		if (ret)
 			goto out;
 	}
@@ -2167,8 +2102,7 @@ static int cm_req_handler(struct cm_work *work)
 		sa_path_set_dmac(&work->path[0],
 				 cm_id_priv->av.ah_attr.roce.dmac);
 	work->path[0].hop_limit = grh->hop_limit;
-	ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av,
-				 cm_id_priv);
+	ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av);
 	if (ret) {
 		int err;
 
@@ -2187,7 +2121,7 @@ static int cm_req_handler(struct cm_work *work)
 	}
 	if (cm_req_has_alt_path(req_msg)) {
 		ret = cm_init_av_by_path(&work->path[1], NULL,
-					 &cm_id_priv->alt_av, cm_id_priv);
+					 &cm_id_priv->alt_av);
 		if (ret) {
 			ib_send_cm_rej(&cm_id_priv->id,
 				       IB_CM_REJ_INVALID_ALT_GID,
@@ -3364,7 +3298,7 @@ static int cm_lap_handler(struct cm_work *work)
 		goto unlock;
 
 	ret = cm_init_av_by_path(param->alternate_path, NULL,
-				 &cm_id_priv->alt_av, cm_id_priv);
+				 &cm_id_priv->alt_av);
 	if (ret)
 		goto unlock;
 
@@ -3484,8 +3418,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
 
 	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
 	ret = cm_init_av_by_path(param->path, param->sgid_attr,
-				 &cm_id_priv->av,
-				 cm_id_priv);
+				 &cm_id_priv->av);
 	if (ret)
 		goto out;
 
@@ -3968,9 +3901,7 @@ static int cm_establish(struct ib_cm_id *cm_id)
 static int cm_migrate(struct ib_cm_id *cm_id)
 {
 	struct cm_id_private *cm_id_priv;
-	struct cm_av tmp_av;
 	unsigned long flags;
-	int tmp_send_port_not_ready;
 	int ret = 0;
 
 	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
@@ -3979,14 +3910,7 @@ static int cm_migrate(struct ib_cm_id *cm_id)
 	    (cm_id->lap_state == IB_CM_LAP_UNINIT ||
 	     cm_id->lap_state == IB_CM_LAP_IDLE)) {
 		cm_id->lap_state = IB_CM_LAP_IDLE;
-		/* Swap address vector */
-		tmp_av = cm_id_priv->av;
 		cm_id_priv->av = cm_id_priv->alt_av;
-		cm_id_priv->alt_av = tmp_av;
-		/* Swap port send ready state */
-		tmp_send_port_not_ready = cm_id_priv->prim_send_port_not_ready;
-		cm_id_priv->prim_send_port_not_ready = cm_id_priv->altr_send_port_not_ready;
-		cm_id_priv->altr_send_port_not_ready = tmp_send_port_not_ready;
 	} else
 		ret = -EINVAL;
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
@@ -4361,9 +4285,6 @@ static int cm_add_one(struct ib_device *ib_device)
 		port->cm_dev = cm_dev;
 		port->port_num = i;
 
-		INIT_LIST_HEAD(&port->cm_priv_prim_list);
-		INIT_LIST_HEAD(&port->cm_priv_altr_list);
-
 		ret = cm_create_port_fs(port);
 		if (ret)
 			goto error1;
@@ -4427,8 +4348,6 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 {
 	struct cm_device *cm_dev = client_data;
 	struct cm_port *port;
-	struct cm_id_private *cm_id_priv;
-	struct ib_mad_agent *cur_mad_agent;
 	struct ib_port_modify port_modify = {
 		.clr_port_cap_mask = IB_PORT_CM_SUP
 	};
@@ -4449,24 +4368,13 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 
 		port = cm_dev->port[i-1];
 		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
-		/* Mark all the cm_id's as not valid */
-		spin_lock_irq(&cm.lock);
-		list_for_each_entry(cm_id_priv, &port->cm_priv_altr_list, altr_list)
-			cm_id_priv->altr_send_port_not_ready = 1;
-		list_for_each_entry(cm_id_priv, &port->cm_priv_prim_list, prim_list)
-			cm_id_priv->prim_send_port_not_ready = 1;
-		spin_unlock_irq(&cm.lock);
 		/*
 		 * We flush the queue here after the going_down set, this
 		 * verify that no new works will be queued in the recv handler,
 		 * after that we can call the unregister_mad_agent
 		 */
 		flush_workqueue(cm.wq);
-		spin_lock_irq(&cm.state_lock);
-		cur_mad_agent = port->mad_agent;
-		port->mad_agent = NULL;
-		spin_unlock_irq(&cm.state_lock);
-		ib_unregister_mad_agent(cur_mad_agent);
+		ib_unregister_mad_agent(port->mad_agent);
 		cm_remove_port_fs(port);
 		kfree(port);
 	}
@@ -4481,7 +4389,6 @@ static int __init ib_cm_init(void)
 	INIT_LIST_HEAD(&cm.device_list);
 	rwlock_init(&cm.device_lock);
 	spin_lock_init(&cm.lock);
-	spin_lock_init(&cm.state_lock);
 	cm.listen_service_table = RB_ROOT;
 	cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
 	cm.remote_id_table = RB_ROOT;
-- 
2.30.2



* [PATCH rdma-next v1 2/5] IB/cm: Simplify ib_cancel_mad() and ib_modify_mad() calls
  2021-04-11 12:21 [PATCH rdma-next v1 0/5] Fix memory corruption in CM Leon Romanovsky
  2021-04-11 12:21 ` [PATCH rdma-next v1 1/5] Revert "IB/cm: Mark stale CM id's whenever the mad agent was unregistered" Leon Romanovsky
@ 2021-04-11 12:21 ` Leon Romanovsky
  2021-04-13 16:26   ` Jason Gunthorpe
  2021-04-11 12:21 ` [PATCH rdma-next v1 3/5] IB/cm: Clear all associated AV's ports when removing a cm device Leon Romanovsky
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 7+ messages in thread
From: Leon Romanovsky @ 2021-04-11 12:21 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: Mark Zhang, linux-rdma

From: Mark Zhang <markzhang@nvidia.com>

The mad_agent parameter is redundant since struct ib_mad_send_buf
already holds a pointer to it.

Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
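A minimal userspace sketch of the idea, using stand-in struct and function
names rather than the kernel definitions: the send buffer carries a
back-pointer to its agent, so callers no longer pass the agent separately.

/*
 * Userspace sketch only; the names are stand-ins for ib_mad_send_buf,
 * ib_modify_mad() and ib_cancel_mad(), not the real definitions.
 */
#include <stdio.h>

struct mad_agent { int hi_tid; };

struct mad_send_buf {
	struct mad_agent *mad_agent;	/* back-pointer, as in ib_mad_send_buf */
	int timeout_ms;
};

/* New-style modify: derive the agent from the buffer itself. */
static int modify_mad(struct mad_send_buf *send_buf, int timeout_ms)
{
	if (!send_buf)
		return -1;
	send_buf->timeout_ms = timeout_ms;
	printf("agent %d: timeout set to %d ms\n",
	       send_buf->mad_agent->hi_tid, timeout_ms);
	return 0;
}

/* Cancel becomes a trivial wrapper, as in the ib_mad.h hunk below. */
static void cancel_mad(struct mad_send_buf *send_buf)
{
	modify_mad(send_buf, 0);
}

int main(void)
{
	struct mad_agent agent = { .hi_tid = 7 };
	struct mad_send_buf buf = { .mad_agent = &agent, .timeout_ms = 300 };

	cancel_mad(&buf);
	return 0;
}
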
 drivers/infiniband/core/cm.c       | 101 ++++++++++++++++++-----------
 drivers/infiniband/core/mad.c      |  17 ++---
 drivers/infiniband/core/sa_query.c |   4 +-
 include/rdma/ib_mad.h              |  27 ++++----
 4 files changed, 84 insertions(+), 65 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index e33b730107b4..f7f094861f79 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1023,7 +1023,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		break;
 	case IB_CM_SIDR_REQ_SENT:
 		cm_id->state = IB_CM_IDLE;
-		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		ib_cancel_mad(cm_id_priv->msg);
 		break;
 	case IB_CM_SIDR_REQ_RCVD:
 		cm_send_sidr_rep_locked(cm_id_priv,
@@ -1034,7 +1034,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		break;
 	case IB_CM_REQ_SENT:
 	case IB_CM_MRA_REQ_RCVD:
-		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		ib_cancel_mad(cm_id_priv->msg);
 		cm_send_rej_locked(cm_id_priv, IB_CM_REJ_TIMEOUT,
 				   &cm_id_priv->id.device->node_guid,
 				   sizeof(cm_id_priv->id.device->node_guid),
@@ -1052,7 +1052,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		break;
 	case IB_CM_REP_SENT:
 	case IB_CM_MRA_REP_RCVD:
-		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		ib_cancel_mad(cm_id_priv->msg);
 		cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL,
 				   0, NULL, 0);
 		goto retest;
@@ -1070,7 +1070,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		cm_send_dreq_locked(cm_id_priv, NULL, 0);
 		goto retest;
 	case IB_CM_DREQ_SENT:
-		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		ib_cancel_mad(cm_id_priv->msg);
 		cm_enter_timewait(cm_id_priv);
 		goto retest;
 	case IB_CM_DREQ_RCVD:
@@ -1473,6 +1473,8 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
 		if (ret)
 			goto out;
 	}
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	cm_id->service_id = param->service_id;
 	cm_id->service_mask = ~cpu_to_be64(0);
 	cm_id_priv->timeout_ms = cm_convert_to_ms(
@@ -1489,7 +1491,7 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
 
 	ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg);
 	if (ret)
-		goto out;
+		goto error_alloc;
 
 	req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad;
 	cm_format_req(req_msg, cm_id_priv, param);
@@ -1501,19 +1503,21 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
 	cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg));
 
 	trace_icm_send_req(&cm_id_priv->id);
-	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	ret = ib_post_send_mad(cm_id_priv->msg, NULL);
-	if (ret) {
-		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-		goto error2;
-	}
-	BUG_ON(cm_id->state != IB_CM_IDLE);
+	if (ret)
+		goto error_post_send_mad;
+
 	cm_id->state = IB_CM_REQ_SENT;
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 	return 0;
 
-error2:	cm_free_msg(cm_id_priv->msg);
-out:	return ret;
+error_post_send_mad:
+	cm_free_msg(cm_id_priv->msg);
+	cm_id_priv->msg = NULL;
+error_alloc:
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+out:
+	return ret;
 }
 EXPORT_SYMBOL(ib_send_cm_req);
 
@@ -2491,7 +2495,7 @@ static int cm_rep_handler(struct cm_work *work)
 			cm_ack_timeout(cm_id_priv->target_ack_delay,
 				       cm_id_priv->alt_av.timeout - 1);
 
-	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ib_cancel_mad(cm_id_priv->msg);
 	cm_queue_work_unlock(cm_id_priv, work);
 	return 0;
 
@@ -2515,7 +2519,7 @@ static int cm_establish_handler(struct cm_work *work)
 		goto out;
 	}
 
-	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ib_cancel_mad(cm_id_priv->msg);
 	cm_queue_work_unlock(cm_id_priv, work);
 	return 0;
 out:
@@ -2548,7 +2552,7 @@ static int cm_rtu_handler(struct cm_work *work)
 	}
 	cm_id_priv->id.state = IB_CM_ESTABLISHED;
 
-	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ib_cancel_mad(cm_id_priv->msg);
 	cm_queue_work_unlock(cm_id_priv, work);
 	return 0;
 out:
@@ -2593,7 +2597,7 @@ static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
 
 	if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
 	    cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
-		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		ib_cancel_mad(cm_id_priv->msg);
 
 	ret = cm_alloc_msg(cm_id_priv, &msg);
 	if (ret) {
@@ -2768,12 +2772,12 @@ static int cm_dreq_handler(struct cm_work *work)
 	switch (cm_id_priv->id.state) {
 	case IB_CM_REP_SENT:
 	case IB_CM_DREQ_SENT:
-		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		ib_cancel_mad(cm_id_priv->msg);
 		break;
 	case IB_CM_ESTABLISHED:
 		if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
 		    cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
-			ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+			ib_cancel_mad(cm_id_priv->msg);
 		break;
 	case IB_CM_MRA_REP_RCVD:
 		break;
@@ -2834,7 +2838,7 @@ static int cm_drep_handler(struct cm_work *work)
 	}
 	cm_enter_timewait(cm_id_priv);
 
-	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ib_cancel_mad(cm_id_priv->msg);
 	cm_queue_work_unlock(cm_id_priv, work);
 	return 0;
 out:
@@ -2970,7 +2974,7 @@ static int cm_rej_handler(struct cm_work *work)
 	case IB_CM_MRA_REQ_RCVD:
 	case IB_CM_REP_SENT:
 	case IB_CM_MRA_REP_RCVD:
-		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		ib_cancel_mad(cm_id_priv->msg);
 		fallthrough;
 	case IB_CM_REQ_RCVD:
 	case IB_CM_MRA_REQ_SENT:
@@ -2980,7 +2984,7 @@ static int cm_rej_handler(struct cm_work *work)
 			cm_reset_to_idle(cm_id_priv);
 		break;
 	case IB_CM_DREQ_SENT:
-		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		ib_cancel_mad(cm_id_priv->msg);
 		fallthrough;
 	case IB_CM_REP_RCVD:
 	case IB_CM_MRA_REP_SENT:
@@ -2990,8 +2994,7 @@ static int cm_rej_handler(struct cm_work *work)
 		if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT ||
 		    cm_id_priv->id.lap_state == IB_CM_LAP_SENT) {
 			if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT)
-				ib_cancel_mad(cm_id_priv->av.port->mad_agent,
-					      cm_id_priv->msg);
+				ib_cancel_mad(cm_id_priv->msg);
 			cm_enter_timewait(cm_id_priv);
 			break;
 		}
@@ -3130,16 +3133,14 @@ static int cm_mra_handler(struct cm_work *work)
 	case IB_CM_REQ_SENT:
 		if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) !=
 			    CM_MSG_RESPONSE_REQ ||
-		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
-				  cm_id_priv->msg, timeout))
+		    ib_modify_mad(cm_id_priv->msg, timeout))
 			goto out;
 		cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD;
 		break;
 	case IB_CM_REP_SENT:
 		if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) !=
 			    CM_MSG_RESPONSE_REP ||
-		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
-				  cm_id_priv->msg, timeout))
+		    ib_modify_mad(cm_id_priv->msg, timeout))
 			goto out;
 		cm_id_priv->id.state = IB_CM_MRA_REP_RCVD;
 		break;
@@ -3147,8 +3148,7 @@ static int cm_mra_handler(struct cm_work *work)
 		if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) !=
 			    CM_MSG_RESPONSE_OTHER ||
 		    cm_id_priv->id.lap_state != IB_CM_LAP_SENT ||
-		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
-				  cm_id_priv->msg, timeout)) {
+		    ib_modify_mad(cm_id_priv->msg, timeout)) {
 			if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
 				atomic_long_inc(&work->port->
 						counter_group[CM_RECV_DUPLICATES].
@@ -3348,7 +3348,7 @@ static int cm_apr_handler(struct cm_work *work)
 		goto out;
 	}
 	cm_id_priv->id.lap_state = IB_CM_LAP_IDLE;
-	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ib_cancel_mad(cm_id_priv->msg);
 	cm_id_priv->msg = NULL;
 	cm_queue_work_unlock(cm_id_priv, work);
 	return 0;
@@ -3674,7 +3674,7 @@ static int cm_sidr_rep_handler(struct cm_work *work)
 		goto out;
 	}
 	cm_id_priv->id.state = IB_CM_IDLE;
-	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ib_cancel_mad(cm_id_priv->msg);
 	spin_unlock_irq(&cm_id_priv->lock);
 
 	cm_format_sidr_rep_event(work, cm_id_priv);
@@ -3730,17 +3730,23 @@ static void cm_process_send_error(struct ib_mad_send_buf *msg,
 
 	/* No other events can occur on the cm_id at this point. */
 	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event);
+	spin_lock_irq(&cm_id_priv->lock);
 	cm_free_msg(msg);
+	cm_id_priv->msg = NULL;
+	spin_unlock_irq(&cm_id_priv->lock);
 	if (ret)
 		ib_destroy_cm_id(&cm_id_priv->id);
 	return;
 discard:
-	spin_unlock_irq(&cm_id_priv->lock);
+	if (msg == cm_id_priv->msg)
+		cm_id_priv->msg = NULL;
 	cm_free_msg(msg);
+	spin_unlock_irq(&cm_id_priv->lock);
 }
 
-static void cm_send_handler(struct ib_mad_agent *mad_agent,
-			    struct ib_mad_send_wc *mad_send_wc)
+static void __cm_send_handler(struct ib_mad_agent *mad_agent,
+			      struct ib_mad_send_wc *mad_send_wc,
+			      struct cm_id_private *cm_id_priv)
 {
 	struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
 	struct cm_port *port;
@@ -3769,16 +3775,37 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent,
 	case IB_WC_SUCCESS:
 	case IB_WC_WR_FLUSH_ERR:
 		cm_free_msg(msg);
+		if (cm_id_priv)
+			cm_id_priv->msg = NULL;
 		break;
 	default:
-		if (msg->context[0] && msg->context[1])
+		if (msg->context[0] && msg->context[1]) {
 			cm_process_send_error(msg, mad_send_wc->status);
-		else
+		} else {
 			cm_free_msg(msg);
+			if (cm_id_priv)
+				cm_id_priv->msg = NULL;
+		}
 		break;
 	}
 }
 
+static void cm_send_handler(struct ib_mad_agent *mad_agent,
+			    struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+	struct cm_id_private *cm_id_priv;
+
+	cm_id_priv = msg->context[0];
+	if (!cm_id_priv || cm_id_priv->msg != msg) {
+		__cm_send_handler(mad_agent, mad_send_wc, NULL);
+	} else {
+		spin_lock_irq(&cm_id_priv->lock);
+		__cm_send_handler(mad_agent, mad_send_wc, cm_id_priv);
+		spin_unlock_irq(&cm_id_priv->lock);
+	}
+}
+
 static void cm_work_handler(struct work_struct *_work)
 {
 	struct cm_work *work = container_of(_work, struct cm_work, work.work);
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index ce0397fd4b7d..e7ff4420777e 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -2464,16 +2464,18 @@ find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
 	return NULL;
 }
 
-int ib_modify_mad(struct ib_mad_agent *mad_agent,
-		  struct ib_mad_send_buf *send_buf, u32 timeout_ms)
+int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms)
 {
 	struct ib_mad_agent_private *mad_agent_priv;
 	struct ib_mad_send_wr_private *mad_send_wr;
 	unsigned long flags;
 	int active;
 
-	mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
-				      agent);
+	if (!send_buf)
+		return -EINVAL;
+
+	mad_agent_priv = container_of(send_buf->mad_agent,
+				      struct ib_mad_agent_private, agent);
 	spin_lock_irqsave(&mad_agent_priv->lock, flags);
 	mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
 	if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
@@ -2498,13 +2500,6 @@ int ib_modify_mad(struct ib_mad_agent *mad_agent,
 }
 EXPORT_SYMBOL(ib_modify_mad);
 
-void ib_cancel_mad(struct ib_mad_agent *mad_agent,
-		   struct ib_mad_send_buf *send_buf)
-{
-	ib_modify_mad(mad_agent, send_buf, 0);
-}
-EXPORT_SYMBOL(ib_cancel_mad);
-
 static void local_completions(struct work_struct *work)
 {
 	struct ib_mad_agent_private *mad_agent_priv;
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 8f1705c403b4..9a4a49c37922 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -1172,7 +1172,6 @@ EXPORT_SYMBOL(ib_sa_unregister_client);
 void ib_sa_cancel_query(int id, struct ib_sa_query *query)
 {
 	unsigned long flags;
-	struct ib_mad_agent *agent;
 	struct ib_mad_send_buf *mad_buf;
 
 	xa_lock_irqsave(&queries, flags);
@@ -1180,7 +1179,6 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
 		xa_unlock_irqrestore(&queries, flags);
 		return;
 	}
-	agent = query->port->agent;
 	mad_buf = query->mad_buf;
 	xa_unlock_irqrestore(&queries, flags);
 
@@ -1190,7 +1188,7 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
 	 * sent to the MAD layer and has to be cancelled from there.
 	 */
 	if (!ib_nl_cancel_request(query))
-		ib_cancel_mad(agent, mad_buf);
+		ib_cancel_mad(mad_buf);
 }
 EXPORT_SYMBOL(ib_sa_cancel_query);
 
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index f1d34f06a68b..465b0d0bdaf8 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -717,28 +717,27 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
  */
 void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc);
 
-/**
- * ib_cancel_mad - Cancels an outstanding send MAD operation.
- * @mad_agent: Specifies the registration associated with sent MAD.
- * @send_buf: Indicates the MAD to cancel.
- *
- * MADs will be returned to the user through the corresponding
- * ib_mad_send_handler.
- */
-void ib_cancel_mad(struct ib_mad_agent *mad_agent,
-		   struct ib_mad_send_buf *send_buf);
-
 /**
  * ib_modify_mad - Modifies an outstanding send MAD operation.
- * @mad_agent: Specifies the registration associated with sent MAD.
  * @send_buf: Indicates the MAD to modify.
  * @timeout_ms: New timeout value for sent MAD.
  *
  * This call will reset the timeout value for a sent MAD to the specified
  * value.
  */
-int ib_modify_mad(struct ib_mad_agent *mad_agent,
-		  struct ib_mad_send_buf *send_buf, u32 timeout_ms);
+int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms);
+
+/**
+ * ib_cancel_mad - Cancels an outstanding send MAD operation.
+ * @send_buf: Indicates the MAD to cancel.
+ *
+ * MADs will be returned to the user through the corresponding
+ * ib_mad_send_handler.
+ */
+static inline void ib_cancel_mad(struct ib_mad_send_buf *send_buf)
+{
+	ib_modify_mad(send_buf, 0);
+}
 
 /**
  * ib_create_send_mad - Allocate and initialize a data buffer and work request
-- 
2.30.2



* [PATCH rdma-next v1 3/5] IB/cm: Clear all associated AV's ports when removing a cm device
  2021-04-11 12:21 [PATCH rdma-next v1 0/5] Fix memory corruption in CM Leon Romanovsky
  2021-04-11 12:21 ` [PATCH rdma-next v1 1/5] Revert "IB/cm: Mark stale CM id's whenever the mad agent was unregistered" Leon Romanovsky
  2021-04-11 12:21 ` [PATCH rdma-next v1 2/5] IB/cm: Simplify ib_cancel_mad() and ib_modify_mad() calls Leon Romanovsky
@ 2021-04-11 12:21 ` Leon Romanovsky
  2021-04-11 12:21 ` [PATCH rdma-next v1 4/5] IB/cm: Add lock protection when accessing av/alt_av's port of a cm_id Leon Romanovsky
  2021-04-11 12:21 ` [PATCH rdma-next v1 5/5] IB/cm: Initialize av before acquiring the spin lock in cm_lap_handler Leon Romanovsky
  4 siblings, 0 replies; 7+ messages in thread
From: Leon Romanovsky @ 2021-04-11 12:21 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: Mark Zhang, linux-rdma

From: Mark Zhang <markzhang@nvidia.com>

When a cm device is removed, all of its ports are removed as well, so all
AV's port pointers need to be cleared.
This patch adds a cm_id_priv list to each cm_device. A cm_id is added to
this list when its primary AV is initialized, so when the device is removed
all cm_id's on this list are removed from the list and have their av->port
and alt_av->port pointers cleared.

Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
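A minimal userspace sketch of the list handling described above, with a
hand-rolled singly linked list and stand-in names rather than the kernel
list_head and cm.c types:

/*
 * Userspace sketch only; names are stand-ins for cm_device, cm_id_private
 * and cm_port, and the open-coded list replaces list_head.
 */
#include <stdio.h>

struct port { int num; };

struct id_priv {
	struct port *av_port;		/* like cm_id_priv->av.port */
	struct port *alt_av_port;	/* like cm_id_priv->alt_av.port */
	struct id_priv *next;		/* like cm_id_priv->cm_dev_list */
};

struct device {
	struct id_priv *id_list;	/* like cm_dev->cm_id_priv_list */
};

/* Called when an id's primary AV is initialized against this device. */
static void track_id(struct device *dev, struct id_priv *id, struct port *p)
{
	id->av_port = p;
	id->next = dev->id_list;
	dev->id_list = id;
}

/* Called on device removal: every tracked id forgets its port pointers. */
static void remove_device(struct device *dev)
{
	struct id_priv *id;

	for (id = dev->id_list; id; id = id->next) {
		id->av_port = NULL;
		id->alt_av_port = NULL;
	}
	dev->id_list = NULL;
}

int main(void)
{
	struct port p = { .num = 1 };
	struct device dev = { 0 };
	struct id_priv id = { 0 };

	track_id(&dev, &id, &p);
	remove_device(&dev);
	printf("av.port after removal: %p\n", (void *)id.av_port);
	return 0;
}
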
 drivers/infiniband/core/cm.c | 75 ++++++++++++++++++++++++++++++------
 1 file changed, 63 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index f7f094861f79..b4f4a569c0b9 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -209,6 +209,7 @@ struct cm_device {
 	struct ib_device *ib_device;
 	u8 ack_delay;
 	int going_down;
+	struct list_head cm_id_priv_list;
 	struct cm_port *port[];
 };
 
@@ -284,6 +285,8 @@ struct cm_id_private {
 	atomic_t work_count;
 
 	struct rdma_ucm_ece ece;
+
+	struct list_head cm_dev_list;
 };
 
 static void cm_work_handler(struct work_struct *work);
@@ -405,9 +408,28 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv,
 	cm_id_priv->private_data_len = private_data_len;
 }
 
+static void add_cm_id_to_cm_dev_list(struct cm_id_private *cm_id_priv,
+				     struct cm_device *cm_dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cm.lock, flags);
+	if (cm_dev->going_down)
+		goto out;
+
+	if (!list_empty(&cm_id_priv->cm_dev_list))
+		list_del(&cm_id_priv->cm_dev_list);
+	list_add_tail(&cm_id_priv->cm_dev_list, &cm_dev->cm_id_priv_list);
+
+out:
+	spin_unlock_irqrestore(&cm.lock, flags);
+}
+
 static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
-			      struct ib_grh *grh, struct cm_av *av)
+			      struct ib_grh *grh,
+			      struct cm_id_private *cm_id_priv)
 {
+	struct cm_av *av = &cm_id_priv->av;
 	struct rdma_ah_attr new_ah_attr;
 	int ret;
 
@@ -427,14 +449,20 @@ static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
 	if (ret)
 		return ret;
 
+	add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev);
+
 	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
 	return 0;
 }
 
 static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
-				   struct ib_grh *grh, struct cm_av *av)
+				   struct ib_grh *grh,
+				   struct cm_id_private *cm_id_priv)
 {
+	struct cm_av *av = &cm_id_priv->av;
+
 	av->port = port;
+	add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev);
 	av->pkey_index = wc->pkey_index;
 	return ib_init_ah_attr_from_wc(port->cm_dev->ib_device,
 				       port->port_num, wc,
@@ -484,11 +512,13 @@ get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr)
 
 static int cm_init_av_by_path(struct sa_path_rec *path,
 			      const struct ib_gid_attr *sgid_attr,
-			      struct cm_av *av)
+			      struct cm_id_private *cm_id_priv,
+			      bool is_priv_av)
 {
 	struct rdma_ah_attr new_ah_attr;
 	struct cm_device *cm_dev;
 	struct cm_port *port;
+	struct cm_av *av;
 	int ret;
 
 	port = get_cm_port_from_path(path, sgid_attr);
@@ -496,6 +526,11 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
 		return -EINVAL;
 	cm_dev = port->cm_dev;
 
+	if (!is_priv_av && cm_dev != cm_id_priv->av.port->cm_dev)
+		return -EINVAL;
+
+	av = is_priv_av ? &cm_id_priv->av : &cm_id_priv->alt_av;
+
 	ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num,
 				  be16_to_cpu(path->pkey), &av->pkey_index);
 	if (ret)
@@ -519,6 +554,9 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
 
 	av->timeout = path->packet_life_time + 1;
 	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
+	if (is_priv_av)
+		add_cm_id_to_cm_dev_list(cm_id_priv, cm_dev);
+
 	return 0;
 }
 
@@ -797,6 +835,7 @@ static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device,
 	spin_lock_init(&cm_id_priv->lock);
 	init_completion(&cm_id_priv->comp);
 	INIT_LIST_HEAD(&cm_id_priv->work_list);
+	INIT_LIST_HEAD(&cm_id_priv->cm_dev_list);
 	atomic_set(&cm_id_priv->work_count, -1);
 	refcount_set(&cm_id_priv->refcount, 1);
 
@@ -1098,6 +1137,8 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		cm_id_priv->timewait_info = NULL;
 	}
 
+	if (!list_empty(&cm_id_priv->cm_dev_list))
+		list_del(&cm_id_priv->cm_dev_list);
 	WARN_ON(cm_id_priv->listen_sharecount);
 	WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node));
 	if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
@@ -1464,12 +1505,12 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
 	}
 
 	ret = cm_init_av_by_path(param->primary_path,
-				 param->ppath_sgid_attr, &cm_id_priv->av);
+				 param->ppath_sgid_attr, cm_id_priv, true);
 	if (ret)
 		goto out;
 	if (param->alternate_path) {
 		ret = cm_init_av_by_path(param->alternate_path, NULL,
-					 &cm_id_priv->alt_av);
+					 cm_id_priv, false);
 		if (ret)
 			goto out;
 	}
@@ -2048,7 +2089,7 @@ static int cm_req_handler(struct cm_work *work)
 
 	ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
 				      work->mad_recv_wc->recv_buf.grh,
-				      &cm_id_priv->av);
+				      cm_id_priv);
 	if (ret)
 		goto destroy;
 	cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
@@ -2106,7 +2147,7 @@ static int cm_req_handler(struct cm_work *work)
 		sa_path_set_dmac(&work->path[0],
 				 cm_id_priv->av.ah_attr.roce.dmac);
 	work->path[0].hop_limit = grh->hop_limit;
-	ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av);
+	ret = cm_init_av_by_path(&work->path[0], gid_attr, cm_id_priv, true);
 	if (ret) {
 		int err;
 
@@ -2125,7 +2166,7 @@ static int cm_req_handler(struct cm_work *work)
 	}
 	if (cm_req_has_alt_path(req_msg)) {
 		ret = cm_init_av_by_path(&work->path[1], NULL,
-					 &cm_id_priv->alt_av);
+					 cm_id_priv, false);
 		if (ret) {
 			ib_send_cm_rej(&cm_id_priv->id,
 				       IB_CM_REJ_INVALID_ALT_GID,
@@ -3293,12 +3334,12 @@ static int cm_lap_handler(struct cm_work *work)
 
 	ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc,
 				 work->mad_recv_wc->recv_buf.grh,
-				 &cm_id_priv->av);
+				 cm_id_priv);
 	if (ret)
 		goto unlock;
 
 	ret = cm_init_av_by_path(param->alternate_path, NULL,
-				 &cm_id_priv->alt_av);
+				 cm_id_priv, false);
 	if (ret)
 		goto unlock;
 
@@ -3418,7 +3459,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
 
 	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
 	ret = cm_init_av_by_path(param->path, param->sgid_attr,
-				 &cm_id_priv->av);
+				 cm_id_priv, true);
 	if (ret)
 		goto out;
 
@@ -3505,7 +3546,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
 	cm_id_priv->av.dgid.global.interface_id = 0;
 	ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
 				      work->mad_recv_wc->recv_buf.grh,
-				      &cm_id_priv->av);
+				      cm_id_priv);
 	if (ret)
 		goto out;
 
@@ -4296,6 +4337,7 @@ static int cm_add_one(struct ib_device *ib_device)
 	cm_dev->ib_device = ib_device;
 	cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
 	cm_dev->going_down = 0;
+	INIT_LIST_HEAD(&cm_dev->cm_id_priv_list);
 
 	set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
 	rdma_for_each_port (ib_device, i) {
@@ -4374,6 +4416,7 @@ static int cm_add_one(struct ib_device *ib_device)
 static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 {
 	struct cm_device *cm_dev = client_data;
+	struct cm_id_private *cm_id_priv, *tmp;
 	struct cm_port *port;
 	struct ib_port_modify port_modify = {
 		.clr_port_cap_mask = IB_PORT_CM_SUP
@@ -4389,6 +4432,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 	cm_dev->going_down = 1;
 	spin_unlock_irq(&cm.lock);
 
+	list_for_each_entry_safe(cm_id_priv, tmp,
+				 &cm_dev->cm_id_priv_list, cm_dev_list) {
+		if (!list_empty(&cm_id_priv->cm_dev_list))
+			list_del(&cm_id_priv->cm_dev_list);
+		cm_id_priv->av.port = NULL;
+		cm_id_priv->alt_av.port = NULL;
+	}
+
 	rdma_for_each_port (ib_device, i) {
 		if (!rdma_cap_ib_cm(ib_device, i))
 			continue;
-- 
2.30.2



* [PATCH rdma-next v1 4/5] IB/cm: Add lock protection when accessing av/alt_av's port of a cm_id
  2021-04-11 12:21 [PATCH rdma-next v1 0/5] Fix memory corruption in CM Leon Romanovsky
                   ` (2 preceding siblings ...)
  2021-04-11 12:21 ` [PATCH rdma-next v1 3/5] IB/cm: Clear all associated AV's ports when removing a cm device Leon Romanovsky
@ 2021-04-11 12:21 ` Leon Romanovsky
  2021-04-11 12:21 ` [PATCH rdma-next v1 5/5] IB/cm: Initialize av before acquiring the spin lock in cm_lap_handler Leon Romanovsky
  4 siblings, 0 replies; 7+ messages in thread
From: Leon Romanovsky @ 2021-04-11 12:21 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: Mark Zhang, linux-rdma

From: Mark Zhang <markzhang@nvidia.com>

Add rwlock protection when accessing the av/alt_av's port pointer.

Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
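The locking discipline described above can be modelled in userspace with
POSIX rwlocks; the sketch below uses stand-in names and only illustrates the
reader/writer pattern, it is not the cm.c code:

/*
 * Userspace model only; pthread rwlock stands in for av_rwlock and the
 * struct names are illustrative, not the cm.c definitions.
 */
#include <pthread.h>
#include <stdio.h>

struct port { int num; };

struct id_priv {
	pthread_rwlock_t av_rwlock;
	struct port *av_port;		/* may be cleared by device removal */
};

/* Reader side, e.g. filling qp_attr->port_num: check under the read lock. */
static int read_port_num(struct id_priv *id)
{
	int num;

	pthread_rwlock_rdlock(&id->av_rwlock);
	num = id->av_port ? id->av_port->num : 0;
	pthread_rwlock_unlock(&id->av_rwlock);
	return num;
}

/* Writer side, e.g. device removal: clear the pointer under the write lock. */
static void clear_port(struct id_priv *id)
{
	pthread_rwlock_wrlock(&id->av_rwlock);
	id->av_port = NULL;
	pthread_rwlock_unlock(&id->av_rwlock);
}

int main(void)
{
	struct port p = { .num = 2 };
	struct id_priv id = { .av_rwlock = PTHREAD_RWLOCK_INITIALIZER,
			      .av_port = &p };

	printf("before removal: port %d\n", read_port_num(&id));
	clear_port(&id);
	printf("after removal:  port %d\n", read_port_num(&id));
	return 0;
}
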
 drivers/infiniband/core/cm.c | 131 +++++++++++++++++++++++++++--------
 1 file changed, 104 insertions(+), 27 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index b4f4a569c0b9..d8368d71ac7c 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -261,6 +261,7 @@ struct cm_id_private {
 	/* todo: use alternate port on send failure */
 	struct cm_av av;
 	struct cm_av alt_av;
+	rwlock_t av_rwlock;	/* Do not acquire inside cm.lock */
 
 	void *private_data;
 	__be64 tid;
@@ -303,17 +304,33 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
 	struct ib_mad_agent *mad_agent;
 	struct ib_mad_send_buf *m;
 	struct ib_ah *ah;
+	int ret;
+
+	read_lock(&cm_id_priv->av_rwlock);
+	if (!cm_id_priv->av.port) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	mad_agent = cm_id_priv->av.port->mad_agent;
+	if (!mad_agent) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	ah = rdma_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr, 0);
-	if (IS_ERR(ah))
-		return PTR_ERR(ah);
+	if (IS_ERR(ah)) {
+		ret = PTR_ERR(ah);
+		goto out;
+	}
 
 	m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn,
 			       cm_id_priv->av.pkey_index,
 			       0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
 			       GFP_ATOMIC,
 			       IB_MGMT_BASE_VERSION);
+
+	read_unlock(&cm_id_priv->av_rwlock);
 	if (IS_ERR(m)) {
 		rdma_destroy_ah(ah, 0);
 		return PTR_ERR(m);
@@ -327,6 +344,10 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
 	m->context[0] = cm_id_priv;
 	*msg = m;
 	return 0;
+
+out:
+	read_unlock(&cm_id_priv->av_rwlock);
+	return ret;
 }
 
 static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port,
@@ -420,7 +441,6 @@ static void add_cm_id_to_cm_dev_list(struct cm_id_private *cm_id_priv,
 	if (!list_empty(&cm_id_priv->cm_dev_list))
 		list_del(&cm_id_priv->cm_dev_list);
 	list_add_tail(&cm_id_priv->cm_dev_list, &cm_dev->cm_id_priv_list);
-
 out:
 	spin_unlock_irqrestore(&cm.lock, flags);
 }
@@ -433,8 +453,8 @@ static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
 	struct rdma_ah_attr new_ah_attr;
 	int ret;
 
-	av->port = port;
-	av->pkey_index = wc->pkey_index;
+	if (!port)
+		return -EINVAL;
 
 	/*
 	 * av->ah_attr might be initialized based on past wc during incoming
@@ -449,7 +469,11 @@ static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
 	if (ret)
 		return ret;
 
+	write_lock(&cm_id_priv->av_rwlock);
+	av->port = port;
+	av->pkey_index = wc->pkey_index;
 	add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev);
+	write_unlock(&cm_id_priv->av_rwlock);
 
 	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
 	return 0;
@@ -461,8 +485,10 @@ static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
 {
 	struct cm_av *av = &cm_id_priv->av;
 
+	write_lock(&cm_id_priv->av_rwlock);
 	av->port = port;
 	add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev);
+	write_unlock(&cm_id_priv->av_rwlock);
 	av->pkey_index = wc->pkey_index;
 	return ib_init_ah_attr_from_wc(port->cm_dev->ib_device,
 				       port->port_num, wc,
@@ -519,15 +545,21 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
 	struct cm_device *cm_dev;
 	struct cm_port *port;
 	struct cm_av *av;
-	int ret;
+	int ret = 0;
 
 	port = get_cm_port_from_path(path, sgid_attr);
 	if (!port)
 		return -EINVAL;
 	cm_dev = port->cm_dev;
 
-	if (!is_priv_av && cm_dev != cm_id_priv->av.port->cm_dev)
-		return -EINVAL;
+	read_lock(&cm_id_priv->av_rwlock);
+	if (!is_priv_av &&
+	    (!cm_id_priv->av.port || cm_dev != cm_id_priv->av.port->cm_dev))
+		ret = -EINVAL;
+
+	read_unlock(&cm_id_priv->av_rwlock);
+	if (ret)
+		return ret;
 
 	av = is_priv_av ? &cm_id_priv->av : &cm_id_priv->alt_av;
 
@@ -536,8 +568,6 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
 	if (ret)
 		return ret;
 
-	av->port = port;
-
 	/*
 	 * av->ah_attr might be initialized based on wc or during
 	 * request processing time which might have reference to sgid_attr.
@@ -552,11 +582,15 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
 	if (ret)
 		return ret;
 
+	write_lock(&cm_id_priv->av_rwlock);
+	av->port = port;
 	av->timeout = path->packet_life_time + 1;
-	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
 	if (is_priv_av)
 		add_cm_id_to_cm_dev_list(cm_id_priv, cm_dev);
 
+	write_unlock(&cm_id_priv->av_rwlock);
+
+	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
 	return 0;
 }
 
@@ -838,6 +872,7 @@ static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device,
 	INIT_LIST_HEAD(&cm_id_priv->cm_dev_list);
 	atomic_set(&cm_id_priv->work_count, -1);
 	refcount_set(&cm_id_priv->refcount, 1);
+	rwlock_init(&cm_id_priv->av_rwlock);
 
 	ret = xa_alloc_cyclic(&cm.local_id_table, &id, NULL, xa_limit_32b,
 			      &cm.local_id_next, GFP_KERNEL);
@@ -951,6 +986,26 @@ static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time)
 	return min(31, ack_timeout);
 }
 
+static u8 cm_ack_timeout_req(struct cm_id_private *cm_id_priv,
+			     u8 packet_life_time)
+{
+	u8 ack_delay = 0;
+
+	read_lock(&cm_id_priv->av_rwlock);
+	if (cm_id_priv->av.port && cm_id_priv->av.port->cm_dev)
+		ack_delay = cm_id_priv->av.port->cm_dev->ack_delay;
+	read_unlock(&cm_id_priv->av_rwlock);
+
+	return cm_ack_timeout(ack_delay, packet_life_time);
+}
+
+static u8 cm_ack_timeout_rep(struct cm_id_private *cm_id_priv,
+			     u8 packet_life_time)
+{
+	return cm_ack_timeout(cm_id_priv->target_ack_delay,
+			      packet_life_time);
+}
+
 static void cm_remove_remote(struct cm_id_private *cm_id_priv)
 {
 	struct cm_timewait_info *timewait_info = cm_id_priv->timewait_info;
@@ -1285,9 +1340,13 @@ EXPORT_SYMBOL(ib_cm_insert_listen);
 
 static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
 {
-	u64 hi_tid, low_tid;
+	u64 hi_tid = 0, low_tid;
+
+	read_lock(&cm_id_priv->av_rwlock);
+	if (cm_id_priv->av.port && cm_id_priv->av.port->mad_agent)
+		hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32;
+	read_unlock(&cm_id_priv->av_rwlock);
 
-	hi_tid   = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32;
 	low_tid  = (u64)cm_id_priv->id.local_id;
 	return cpu_to_be64(hi_tid | low_tid);
 }
@@ -1391,8 +1450,7 @@ static void cm_format_req(struct cm_req_msg *req_msg,
 	IBA_SET(CM_REQ_PRIMARY_SUBNET_LOCAL, req_msg,
 		(pri_path->hop_limit <= 1));
 	IBA_SET(CM_REQ_PRIMARY_LOCAL_ACK_TIMEOUT, req_msg,
-		cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
-			       pri_path->packet_life_time));
+		cm_ack_timeout_req(cm_id_priv, pri_path->packet_life_time));
 
 	if (alt_path) {
 		bool alt_ext = false;
@@ -1443,8 +1501,8 @@ static void cm_format_req(struct cm_req_msg *req_msg,
 		IBA_SET(CM_REQ_ALTERNATE_SUBNET_LOCAL, req_msg,
 			(alt_path->hop_limit <= 1));
 		IBA_SET(CM_REQ_ALTERNATE_LOCAL_ACK_TIMEOUT, req_msg,
-			cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
-				       alt_path->packet_life_time));
+			cm_ack_timeout_req(cm_id_priv,
+					   alt_path->packet_life_time));
 	}
 	IBA_SET(CM_REQ_VENDOR_ID, req_msg, param->ece.vendor_id);
 
@@ -1785,7 +1843,12 @@ static void cm_format_req_event(struct cm_work *work,
 	param = &work->cm_event.param.req_rcvd;
 	param->listen_id = listen_id;
 	param->bth_pkey = cm_get_bth_pkey(work);
-	param->port = cm_id_priv->av.port->port_num;
+	read_lock(&cm_id_priv->av_rwlock);
+	if (cm_id_priv->av.port)
+		param->port = cm_id_priv->av.port->port_num;
+	else
+		param->port = 0;
+	read_unlock(&cm_id_priv->av_rwlock);
 	param->primary_path = &work->path[0];
 	cm_opa_to_ib_sgid(work, param->primary_path);
 	if (cm_req_has_alt_path(req_msg)) {
@@ -2216,8 +2279,13 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg,
 	IBA_SET(CM_REP_STARTING_PSN, rep_msg, param->starting_psn);
 	IBA_SET(CM_REP_RESPONDER_RESOURCES, rep_msg,
 		param->responder_resources);
-	IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg,
-		cm_id_priv->av.port->cm_dev->ack_delay);
+	read_lock(&cm_id_priv->av_rwlock);
+	if (cm_id_priv->av.port && cm_id_priv->av.port->cm_dev)
+		IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg,
+			cm_id_priv->av.port->cm_dev->ack_delay);
+	else
+		IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg, 0);
+	read_unlock(&cm_id_priv->av_rwlock);
 	IBA_SET(CM_REP_FAILOVER_ACCEPTED, rep_msg, param->failover_accepted);
 	IBA_SET(CM_REP_RNR_RETRY_COUNT, rep_msg, param->rnr_retry_count);
 	IBA_SET(CM_REP_LOCAL_CA_GUID, rep_msg,
@@ -2530,11 +2598,9 @@ static int cm_rep_handler(struct cm_work *work)
 	cm_id_priv->target_ack_delay =
 		IBA_GET(CM_REP_TARGET_ACK_DELAY, rep_msg);
 	cm_id_priv->av.timeout =
-			cm_ack_timeout(cm_id_priv->target_ack_delay,
-				       cm_id_priv->av.timeout - 1);
+		cm_ack_timeout_rep(cm_id_priv, cm_id_priv->av.timeout - 1);
 	cm_id_priv->alt_av.timeout =
-			cm_ack_timeout(cm_id_priv->target_ack_delay,
-				       cm_id_priv->alt_av.timeout - 1);
+		cm_ack_timeout_rep(cm_id_priv, cm_id_priv->alt_av.timeout - 1);
 
 	ib_cancel_mad(cm_id_priv->msg);
 	cm_queue_work_unlock(cm_id_priv, work);
@@ -4113,7 +4179,10 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
 			qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ |
 						    IB_ACCESS_REMOTE_ATOMIC;
 		qp_attr->pkey_index = cm_id_priv->av.pkey_index;
-		qp_attr->port_num = cm_id_priv->av.port->port_num;
+		read_lock(&cm_id_priv->av_rwlock);
+		qp_attr->port_num = cm_id_priv->av.port ?
+			cm_id_priv->av.port->port_num : 0;
+		read_unlock(&cm_id_priv->av_rwlock);
 		ret = 0;
 		break;
 	default:
@@ -4157,7 +4226,10 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
 		}
 		if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) {
 			*qp_attr_mask |= IB_QP_ALT_PATH;
-			qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+			read_lock(&cm_id_priv->av_rwlock);
+			qp_attr->alt_port_num = cm_id_priv->alt_av.port ?
+				cm_id_priv->alt_av.port->port_num : 0;
+			read_unlock(&cm_id_priv->av_rwlock);
 			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
 			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
 			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
@@ -4216,7 +4288,10 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
 			}
 		} else {
 			*qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
-			qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+			read_lock(&cm_id_priv->av_rwlock);
+			qp_attr->alt_port_num = cm_id_priv->alt_av.port ?
+				cm_id_priv->alt_av.port->port_num : 0;
+			read_unlock(&cm_id_priv->av_rwlock);
 			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
 			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
 			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
@@ -4434,10 +4509,12 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 
 	list_for_each_entry_safe(cm_id_priv, tmp,
 				 &cm_dev->cm_id_priv_list, cm_dev_list) {
+		write_lock(&cm_id_priv->av_rwlock);
 		if (!list_empty(&cm_id_priv->cm_dev_list))
 			list_del(&cm_id_priv->cm_dev_list);
 		cm_id_priv->av.port = NULL;
 		cm_id_priv->alt_av.port = NULL;
+		write_unlock(&cm_id_priv->av_rwlock);
 	}
 
 	rdma_for_each_port (ib_device, i) {
-- 
2.30.2



* [PATCH rdma-next v1 5/5] IB/cm: Initialize av before acquiring the spin lock in cm_lap_handler
  2021-04-11 12:21 [PATCH rdma-next v1 0/5] Fix memory corruption in CM Leon Romanovsky
                   ` (3 preceding siblings ...)
  2021-04-11 12:21 ` [PATCH rdma-next v1 4/5] IB/cm: Add lock protection when accessing av/alt_av's port of a cm_id Leon Romanovsky
  2021-04-11 12:21 ` [PATCH rdma-next v1 5/5] IB/cm: Initialize av before acquiring the spin lock in cm_lap_handler Leon Romanovsky
  4 siblings, 0 replies; 7+ messages in thread
From: Leon Romanovsky @ 2021-04-11 12:21 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: Mark Zhang, linux-rdma

From: Mark Zhang <markzhang@nvidia.com>

Both cm_init_av_for_lap() and cm_init_av_by_path() might sleep and must
not be called while holding a spin lock.

Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
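The reordering follows the usual pattern of doing any work that may sleep
before taking the lock, and keeping only the quick state check and commit
inside the locked section. A rough userspace model with stand-in names (a
mutex standing in for the spinlock, usleep() for the sleeping AV setup):

/*
 * Userspace model only; the names are illustrative, not cm.c code.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
#include <stdbool.h>

static pthread_mutex_t id_lock = PTHREAD_MUTEX_INITIALIZER;
static bool established = true;	/* stands in for IB_CM_ESTABLISHED */
static int alt_path;			/* stands in for the alternate AV */

/* May block, so it must run before the lock is taken. */
static int init_alt_path(void)
{
	usleep(1000);
	return 42;
}

static int lap_handler(void)
{
	int av = init_alt_path();	/* 1. potentially sleeping work first */

	pthread_mutex_lock(&id_lock);	/* 2. quick locked check and commit */
	if (!established) {
		pthread_mutex_unlock(&id_lock);
		return -1;
	}
	alt_path = av;
	pthread_mutex_unlock(&id_lock);
	return 0;
}

int main(void)
{
	int ret = lap_handler();

	printf("lap_handler: %d, alt_path: %d\n", ret, alt_path);
	return 0;
}
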
 drivers/infiniband/core/cm.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index d8368d71ac7c..5ca328f3e34a 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -3364,6 +3364,17 @@ static int cm_lap_handler(struct cm_work *work)
 	work->cm_event.private_data =
 		IBA_GET_MEM_PTR(CM_LAP_PRIVATE_DATA, lap_msg);
 
+	ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc,
+				 work->mad_recv_wc->recv_buf.grh,
+				 cm_id_priv);
+	if (ret)
+		goto deref;
+
+	ret = cm_init_av_by_path(param->alternate_path, NULL,
+				 cm_id_priv, false);
+	if (ret)
+		goto deref;
+
 	spin_lock_irq(&cm_id_priv->lock);
 	if (cm_id_priv->id.state != IB_CM_ESTABLISHED)
 		goto unlock;
@@ -3398,17 +3409,6 @@ static int cm_lap_handler(struct cm_work *work)
 		goto unlock;
 	}
 
-	ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc,
-				 work->mad_recv_wc->recv_buf.grh,
-				 cm_id_priv);
-	if (ret)
-		goto unlock;
-
-	ret = cm_init_av_by_path(param->alternate_path, NULL,
-				 cm_id_priv, false);
-	if (ret)
-		goto unlock;
-
 	cm_id_priv->id.lap_state = IB_CM_LAP_RCVD;
 	cm_id_priv->tid = lap_msg->hdr.tid;
 	cm_queue_work_unlock(cm_id_priv, work);
-- 
2.30.2



* Re: [PATCH rdma-next v1 2/5] IB/cm: Simplify ib_cancel_mad() and ib_modify_mad() calls
  2021-04-11 12:21 ` [PATCH rdma-next v1 2/5] IB/cm: Simplify ib_cancel_mad() and ib_modify_mad() calls Leon Romanovsky
@ 2021-04-13 16:26   ` Jason Gunthorpe
  0 siblings, 0 replies; 7+ messages in thread
From: Jason Gunthorpe @ 2021-04-13 16:26 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: Doug Ledford, Mark Zhang, linux-rdma

On Sun, Apr 11, 2021 at 03:21:49PM +0300, Leon Romanovsky wrote:
> From: Mark Zhang <markzhang@nvidia.com>
> 
> The mad_agent parameter is redundant since struct ib_mad_send_buf
> already holds a pointer to it.
> 
> Signed-off-by: Mark Zhang <markzhang@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
>  drivers/infiniband/core/cm.c       | 101 ++++++++++++++++++-----------
>  drivers/infiniband/core/mad.c      |  17 ++---
>  drivers/infiniband/core/sa_query.c |   4 +-
>  include/rdma/ib_mad.h              |  27 ++++----
>  4 files changed, 84 insertions(+), 65 deletions(-)
> 
> diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
> index e33b730107b4..f7f094861f79 100644
> +++ b/drivers/infiniband/core/cm.c
> @@ -1023,7 +1023,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
>  		break;
>  	case IB_CM_SIDR_REQ_SENT:
>  		cm_id->state = IB_CM_IDLE;
> -		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
> +		ib_cancel_mad(cm_id_priv->msg);
>  		break;
>  	case IB_CM_SIDR_REQ_RCVD:
>  		cm_send_sidr_rep_locked(cm_id_priv,
> @@ -1034,7 +1034,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
>  		break;
>  	case IB_CM_REQ_SENT:
>  	case IB_CM_MRA_REQ_RCVD:
> -		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
> +		ib_cancel_mad(cm_id_priv->msg);
>  		cm_send_rej_locked(cm_id_priv, IB_CM_REJ_TIMEOUT,
>  				   &cm_id_priv->id.device->node_guid,
>  				   sizeof(cm_id_priv->id.device->node_guid),
> @@ -1052,7 +1052,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
>  		break;
>  	case IB_CM_REP_SENT:
>  	case IB_CM_MRA_REP_RCVD:
> -		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
> +		ib_cancel_mad(cm_id_priv->msg);
>  		cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL,
>  				   0, NULL, 0);
>  		goto retest;
> @@ -1070,7 +1070,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
>  		cm_send_dreq_locked(cm_id_priv, NULL, 0);
>  		goto retest;
>  	case IB_CM_DREQ_SENT:
> -		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
> +		ib_cancel_mad(cm_id_priv->msg);
>  		cm_enter_timewait(cm_id_priv);
>  		goto retest;
>  	case IB_CM_DREQ_RCVD:
> @@ -1473,6 +1473,8 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
>  		if (ret)
>  			goto out;
>  	}
> +
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
>  	cm_id->service_id = param->service_id;
>  	cm_id->service_mask = ~cpu_to_be64(0);
>  	cm_id_priv->timeout_ms = cm_convert_to_ms(
> @@ -1489,7 +1491,7 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
>  
>  	ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg);
>  	if (ret)
> -		goto out;
> +		goto error_alloc;
>  
>  	req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad;
>  	cm_format_req(req_msg, cm_id_priv, param);
> @@ -1501,19 +1503,21 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
>  	cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg));
>  
>  	trace_icm_send_req(&cm_id_priv->id);
> -	spin_lock_irqsave(&cm_id_priv->lock, flags);
>  	ret = ib_post_send_mad(cm_id_priv->msg, NULL);
> -	if (ret) {
> -		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> -		goto error2;
> -	}
> -	BUG_ON(cm_id->state != IB_CM_IDLE);
> +	if (ret)
> +		goto error_post_send_mad;
> +
>  	cm_id->state = IB_CM_REQ_SENT;
>  	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
>  	return 0;
>  
> -error2:	cm_free_msg(cm_id_priv->msg);
> -out:	return ret;
> +error_post_send_mad:
> +	cm_free_msg(cm_id_priv->msg);
> +	cm_id_priv->msg = NULL;

No, please use the patch series I made instead of this

https://lore.kernel.org/linux-rdma/20210329124101.GA887238@nvidia.com/

Jason

