From: Jason Gunthorpe <jgg@nvidia.com>
To: Leon Romanovsky <leon@kernel.org>
Cc: Doug Ledford <dledford@redhat.com>,
	Mark Zhang <markzhang@nvidia.com>,
	linux-rdma@vger.kernel.org
Subject: Re: [PATCH rdma-next v2 7/9] IB/cm: Clear all associated AV's ports when remove a cm device
Date: Thu, 22 Apr 2021 16:34:17 -0300
Message-ID: <20210422193417.GA2435405@nvidia.com>
In-Reply-To: <00c97755c41b06af84f621a1b3e0e8adfe0771cc.1619004798.git.leonro@nvidia.com>

On Wed, Apr 21, 2021 at 02:40:37PM +0300, Leon Romanovsky wrote:
> @@ -4396,6 +4439,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
>  	cm_dev->going_down = 1;
>  	spin_unlock_irq(&cm.lock);
>  
> +	list_for_each_entry_safe(cm_id_priv, tmp,
> +				 &cm_dev->cm_id_priv_list, cm_dev_list) {
> +		if (!list_empty(&cm_id_priv->cm_dev_list))
> +			list_del(&cm_id_priv->cm_dev_list);
> +		cm_id_priv->av.port = NULL;
> +		cm_id_priv->alt_av.port = NULL;
> +	}

Ugh, this is in the wrong order, it has to come after the work queue
flush: work items still running on cm.wq can dereference av.port, so
the ports can only be cleared once the queue has been flushed.

Hurm, I didn't see an easy way to fix it up, but I did think of a much
better design!

Generally speaking, all we need is for the memory of the cm_dev and
port to remain alive; we don't need to block or fence with
cm_remove_one(). So just stick a memory kref on this thing and keep
the memory around. The only things that need to serialize with
cm_remove_one() are the paths running on the workqueue or taking a
spinlock (e.g. because they touch mad_agent).
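
The general shape of the idea, as a rough sketch (the example_* names
are made-up stand-ins for illustration, not the final cm.c code):

#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct example_dev {
	struct kref kref;		/* keeps the memory alive */
	spinlock_t unregistration_lock;	/* serializes with unregister */
	struct example_agent *agent;	/* NULL'd under the lock */
};

/* kref_init()/spin_lock_init() happen wherever the dev is allocated */

static void example_dev_release(struct kref *kref)
{
	kfree(container_of(kref, struct example_dev, kref));
}

/* Paths that only need the memory hold a kref and never block
 * unregistration:
 */
static void example_use_memory(struct example_dev *dev)
{
	kref_get(&dev->kref);
	/* ... dev's fields stay valid until the last put ... */
	kref_put(&dev->kref, example_dev_release);
}

/* Paths that touch the agent must serialize with unregistration: */
static void example_use_agent(struct example_dev *dev)
{
	spin_lock(&dev->unregistration_lock);
	if (dev->agent) {
		/* safe: unregister NULLs agent under this lock */
	}
	spin_unlock(&dev->unregistration_lock);
}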

Try this; I didn't finish every detail. It applies on top of your
series, but you'll need to reflow it into new commits:

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 3feff999a5e003..c26367006a4485 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -205,8 +205,11 @@ struct cm_port {
 };
 
 struct cm_device {
+	struct kref kref;
 	struct list_head list;
+	spinlock_t unregistration_lock;
 	struct ib_device *ib_device;
+	unsigned int num_ports;
 	u8 ack_delay;
 	int going_down;
 	struct list_head cm_id_priv_list;
@@ -262,7 +265,6 @@ struct cm_id_private {
 	/* todo: use alternate port on send failure */
 	struct cm_av av;
 	struct cm_av alt_av;
-	rwlock_t av_rwlock;	/* Do not acquire inside cm.lock */
 
 	void *private_data;
 	__be64 tid;
@@ -287,10 +289,23 @@ struct cm_id_private {
 	atomic_t work_count;
 
 	struct rdma_ucm_ece ece;
-
-	struct list_head cm_dev_list;
 };
 
+static void cm_dev_release(struct kref *kref)
+{
+	struct cm_device *cm_dev = container_of(kref, struct cm_device, kref);
+	unsigned int i;
+
+	for (i = 0; i != cm_dev->num_ports; i++)
+		kfree(cm_dev->port[i]);
+	kfree(cm_dev);
+}
+
+static void cm_device_put(struct cm_device *cm_dev)
+{
+	kref_put(&cm_dev->kref, cm_dev_release);
+}
+
 static void cm_work_handler(struct work_struct *work);
 
 static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
@@ -306,12 +321,12 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
 	struct ib_ah *ah;
 	int ret;
 
-	read_lock(&cm_id_priv->av_rwlock);
-	if (!cm_id_priv->av.port) {
-		ret = -EINVAL;
-		goto out;
-	}
+	/* The out: label now unlocks through av.port, so return
+	 * directly when there is no port */
+	if (!cm_id_priv->av.port)
+		return ERR_PTR(-EINVAL);
 
+	spin_lock(&cm_id_priv->av.port->cm_dev->unregistration_lock);
 	mad_agent = cm_id_priv->av.port->mad_agent;
 	if (!mad_agent) {
 		ret = -EINVAL;
@@ -330,7 +345,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
 			       GFP_ATOMIC,
 			       IB_MGMT_BASE_VERSION);
 
-	read_unlock(&cm_id_priv->av_rwlock);
+	spin_unlock(&cm_id_priv->av.port->cm_dev->unregistration_lock);
 	if (IS_ERR(m)) {
 		rdma_destroy_ah(ah, 0);
 		ret = PTR_ERR(m);
@@ -346,7 +360,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
 	return m;
 
 out:
-	read_unlock(&cm_id_priv->av_rwlock);
+	spin_unlock(&cm_id_priv->av.port->cm_dev->unregistration_lock);
 	return ERR_PTR(ret);
 }
 
@@ -465,20 +479,20 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv,
 	cm_id_priv->private_data_len = private_data_len;
 }
 
-static void add_cm_id_to_cm_dev_list(struct cm_id_private *cm_id_priv,
-				     struct cm_device *cm_dev)
+static void cm_set_av_port(struct cm_av *av, struct cm_port *port)
 {
-	unsigned long flags;
+	struct cm_port *old_port = av->port;
 
-	spin_lock_irqsave(&cm.lock, flags);
-	if (cm_dev->going_down)
-		goto out;
+	if (old_port == port)
+		return;
 
-	if (!list_empty(&cm_id_priv->cm_dev_list))
-		list_del(&cm_id_priv->cm_dev_list);
-	list_add_tail(&cm_id_priv->cm_dev_list, &cm_dev->cm_id_priv_list);
-out:
-	spin_unlock_irqrestore(&cm.lock, flags);
+	av->port = port;
+	/* Take the new ref before dropping the old one in case both
+	 * ports share the same cm_dev */
+	if (port)
+		kref_get(&port->cm_dev->kref);
+	if (old_port)
+		cm_device_put(old_port->cm_dev);
 }
 
 static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
@@ -505,11 +517,8 @@ static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
 	if (ret)
 		return ret;
 
-	write_lock(&cm_id_priv->av_rwlock);
-	av->port = port;
+	cm_set_av_port(av, port);
 	av->pkey_index = wc->pkey_index;
-	add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev);
-	write_unlock(&cm_id_priv->av_rwlock);
 
 	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
 	return 0;
@@ -521,10 +530,7 @@ static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
 {
 	struct cm_av *av = &cm_id_priv->av;
 
-	write_lock(&cm_id_priv->av_rwlock);
-	av->port = port;
-	add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev);
-	write_unlock(&cm_id_priv->av_rwlock);
+	cm_set_av_port(av, port);
 	av->pkey_index = wc->pkey_index;
 	return ib_init_ah_attr_from_wc(port->cm_dev->ib_device,
 				       port->port_num, wc,
@@ -588,12 +594,9 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
 		return -EINVAL;
 	cm_dev = port->cm_dev;
 
-	read_lock(&cm_id_priv->av_rwlock);
 	if (!is_priv_av &&
 	    (!cm_id_priv->av.port || cm_dev != cm_id_priv->av.port->cm_dev))
 		ret = -EINVAL;
-
-	read_unlock(&cm_id_priv->av_rwlock);
 	if (ret)
 		return ret;
 
@@ -618,13 +621,8 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
 	if (ret)
 		return ret;
 
-	write_lock(&cm_id_priv->av_rwlock);
-	av->port = port;
+	cm_set_av_port(av, port);
 	av->timeout = path->packet_life_time + 1;
-	if (is_priv_av)
-		add_cm_id_to_cm_dev_list(cm_id_priv, cm_dev);
-
-	write_unlock(&cm_id_priv->av_rwlock);
 
 	rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
 	return 0;
@@ -905,10 +903,8 @@ static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device,
 	spin_lock_init(&cm_id_priv->lock);
 	init_completion(&cm_id_priv->comp);
 	INIT_LIST_HEAD(&cm_id_priv->work_list);
-	INIT_LIST_HEAD(&cm_id_priv->cm_dev_list);
 	atomic_set(&cm_id_priv->work_count, -1);
 	refcount_set(&cm_id_priv->refcount, 1);
-	rwlock_init(&cm_id_priv->av_rwlock);
 
 	ret = xa_alloc_cyclic(&cm.local_id_table, &id, NULL, xa_limit_32b,
 			      &cm.local_id_next, GFP_KERNEL);
@@ -1027,10 +1023,8 @@ static u8 cm_ack_timeout_req(struct cm_id_private *cm_id_priv,
 {
 	u8 ack_delay = 0;
 
-	read_lock(&cm_id_priv->av_rwlock);
-	if (cm_id_priv->av.port && cm_id_priv->av.port->cm_dev)
+	if (cm_id_priv->av.port)
 		ack_delay = cm_id_priv->av.port->cm_dev->ack_delay;
-	read_unlock(&cm_id_priv->av_rwlock);
 
 	return cm_ack_timeout(ack_delay, packet_life_time);
 }
@@ -1228,8 +1222,6 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		cm_id_priv->timewait_info = NULL;
 	}
 
-	if (!list_empty(&cm_id_priv->cm_dev_list))
-		list_del(&cm_id_priv->cm_dev_list);
 	WARN_ON(cm_id_priv->listen_sharecount);
 	WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node));
 	if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
@@ -1246,6 +1238,8 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 	rdma_destroy_ah_attr(&cm_id_priv->av.ah_attr);
 	rdma_destroy_ah_attr(&cm_id_priv->alt_av.ah_attr);
 	kfree(cm_id_priv->private_data);
+	cm_set_av_port(&cm_id_priv->av, NULL);
+	cm_set_av_port(&cm_id_priv->alt_av, NULL);
 	kfree_rcu(cm_id_priv, rcu);
 }
 
@@ -1378,10 +1372,13 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
 {
 	u64 hi_tid = 0, low_tid;
 
-	read_lock(&cm_id_priv->av_rwlock);
-	if (cm_id_priv->av.port && cm_id_priv->av.port->mad_agent)
-		hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32;
-	read_unlock(&cm_id_priv->av_rwlock);
+	if (cm_id_priv->av.port) {
+		spin_lock(&cm_id_priv->av.port->cm_dev->unregistration_lock);
+		if (cm_id_priv->av.port->mad_agent)
+			hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid)
+				 << 32;
+		spin_unlock(&cm_id_priv->av.port->cm_dev->unregistration_lock);
+	}
 
 	low_tid  = (u64)cm_id_priv->id.local_id;
 	return cpu_to_be64(hi_tid | low_tid);
@@ -1879,12 +1876,10 @@ static void cm_format_req_event(struct cm_work *work,
 	param = &work->cm_event.param.req_rcvd;
 	param->listen_id = listen_id;
 	param->bth_pkey = cm_get_bth_pkey(work);
-	read_lock(&cm_id_priv->av_rwlock);
 	if (cm_id_priv->av.port)
 		param->port = cm_id_priv->av.port->port_num;
 	else
 		param->port = 0;
-	read_unlock(&cm_id_priv->av_rwlock);
 	param->primary_path = &work->path[0];
 	cm_opa_to_ib_sgid(work, param->primary_path);
 	if (cm_req_has_alt_path(req_msg)) {
@@ -2311,13 +2306,11 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg,
 	IBA_SET(CM_REP_STARTING_PSN, rep_msg, param->starting_psn);
 	IBA_SET(CM_REP_RESPONDER_RESOURCES, rep_msg,
 		param->responder_resources);
-	read_lock(&cm_id_priv->av_rwlock);
 	if (cm_id_priv->av.port && cm_id_priv->av.port->cm_dev)
 		IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg,
 			cm_id_priv->av.port->cm_dev->ack_delay);
 	else
 		IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg, 0);
-	read_unlock(&cm_id_priv->av_rwlock);
 	IBA_SET(CM_REP_FAILOVER_ACCEPTED, rep_msg, param->failover_accepted);
 	IBA_SET(CM_REP_RNR_RETRY_COUNT, rep_msg, param->rnr_retry_count);
 	IBA_SET(CM_REP_LOCAL_CA_GUID, rep_msg,
@@ -4187,10 +4180,8 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
 			qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ |
 						    IB_ACCESS_REMOTE_ATOMIC;
 		qp_attr->pkey_index = cm_id_priv->av.pkey_index;
-		read_lock(&cm_id_priv->av_rwlock);
 		qp_attr->port_num = cm_id_priv->av.port ?
 			cm_id_priv->av.port->port_num : 0;
-		read_unlock(&cm_id_priv->av_rwlock);
 		ret = 0;
 		break;
 	default:
@@ -4234,10 +4225,8 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
 		}
 		if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) {
 			*qp_attr_mask |= IB_QP_ALT_PATH;
-			read_lock(&cm_id_priv->av_rwlock);
 			qp_attr->alt_port_num = cm_id_priv->alt_av.port ?
 				cm_id_priv->alt_av.port->port_num : 0;
-			read_unlock(&cm_id_priv->av_rwlock);
 			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
 			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
 			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
@@ -4296,10 +4285,8 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
 			}
 		} else {
 			*qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
-			read_lock(&cm_id_priv->av_rwlock);
 			qp_attr->alt_port_num = cm_id_priv->alt_av.port ?
 				cm_id_priv->alt_av.port->port_num : 0;
-			read_unlock(&cm_id_priv->av_rwlock);
 			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
 			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
 			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
@@ -4417,9 +4404,12 @@ static int cm_add_one(struct ib_device *ib_device)
 	if (!cm_dev)
 		return -ENOMEM;
 
+	kref_init(&cm_dev->kref);
+	spin_lock_init(&cm_dev->unregistration_lock);
 	cm_dev->ib_device = ib_device;
 	cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
 	cm_dev->going_down = 0;
+	cm_dev->num_ports = ib_device->phys_port_cnt;
 	INIT_LIST_HEAD(&cm_dev->cm_id_priv_list);
 
 	set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
@@ -4489,10 +4478,9 @@ static int cm_add_one(struct ib_device *ib_device)
 		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
 		ib_unregister_mad_agent(port->mad_agent);
 		cm_remove_port_fs(port);
-		kfree(port);
 	}
 free:
-	kfree(cm_dev);
+	cm_device_put(cm_dev);
 	return ret;
 }
 
@@ -4515,21 +4503,15 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 	cm_dev->going_down = 1;
 	spin_unlock_irq(&cm.lock);
 
-	list_for_each_entry_safe(cm_id_priv, tmp,
-				 &cm_dev->cm_id_priv_list, cm_dev_list) {
-		write_lock(&cm_id_priv->av_rwlock);
-		if (!list_empty(&cm_id_priv->cm_dev_list))
-			list_del(&cm_id_priv->cm_dev_list);
-		cm_id_priv->av.port = NULL;
-		cm_id_priv->alt_av.port = NULL;
-		write_unlock(&cm_id_priv->av_rwlock);
-	}
-
 	rdma_for_each_port (ib_device, i) {
+		struct ib_mad_agent *mad_agent;
+
 		if (!rdma_cap_ib_cm(ib_device, i))
 			continue;
 
 		port = cm_dev->port[i-1];
+		mad_agent = port->mad_agent;
+
 		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
 		/*
 		 * We flush the queue here after the going_down set, this
@@ -4537,12 +4519,20 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 		 * after that we can call the unregister_mad_agent
 		 */
 		flush_workqueue(cm.wq);
-		ib_unregister_mad_agent(port->mad_agent);
+		/*
+		 * The above ensures no call paths from the work are running,
+		 * the remaining paths all take the unregistration lock
+		 */
+		spin_lock(&cm_dev->unregistration_lock);
+		port->mad_agent = NULL;
+		spin_unlock(&cm_dev->unregistration_lock);
+		ib_unregister_mad_agent(mad_agent);
 		cm_remove_port_fs(port);
-		kfree(port);
 	}
 
-	kfree(cm_dev);
+	/* All remaining touches can only be on a call path from the work */
+	cm_dev->ib_device = NULL;
+	cm_device_put(cm_dev);
 }
 
 static int __init ib_cm_init(void)

