All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH rdma-rc v2] RDMA/core: Simplify addition of restrack object
@ 2021-06-08  5:23 Leon Romanovsky
  2021-06-24 17:48 ` Jason Gunthorpe
  0 siblings, 1 reply; 7+ messages in thread
From: Leon Romanovsky @ 2021-06-08  5:23 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, linux-kernel, linux-rdma, Pavel Skripkin, Shay Drory

From: Leon Romanovsky <leonro@nvidia.com>

Change location of rdma_restrack_add() callers to be near attachment
to device logic. Such improvement fixes the bug where task_struct was
acquired but not released, causing to resource leak.

  ucma_create_id() {
    ucma_alloc_ctx();
    rdma_create_user_id() {
      rdma_restrack_new();
      rdma_restrack_set_name() {
        rdma_restrack_attach_task.part.0(); <--- task_struct was gotten
      }
    }
    ucma_destroy_private_ctx() {
      ucma_put_ctx();
      rdma_destroy_id() {
        _destroy_id()                       <--- id_priv was freed
      }
    }
  }

Fixes: cb5cd0ea4eb3 ("RDMA/core: Add CM to restrack after successful attachment to a device")
Reported-by: Pavel Skripkin <paskripkin@gmail.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
Changelog
v2:
 * Added bug report analysis
v1: https://lore.kernel.org/linux-rdma/f72e27d5c82cd9beec7670141afa62786836c569.1622956637.git.leonro@nvidia.com/T/#u
---
 drivers/infiniband/core/cma.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index ab148a696c0c..e6b81cd4775a 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -462,6 +462,7 @@ static void cma_attach_to_dev(struct rdma_id_private *id_priv,
 			      struct cma_device *cma_dev)
 {
 	_cma_attach_to_dev(id_priv, cma_dev);
+	rdma_restrack_add(&id_priv->res);
 	id_priv->gid_type =
 		cma_dev->default_gid_type[id_priv->id.port_num -
 					  rdma_start_port(cma_dev->device)];
@@ -691,7 +692,6 @@ static int cma_ib_acquire_dev(struct rdma_id_private *id_priv,
 	mutex_lock(&lock);
 	cma_attach_to_dev(id_priv, listen_id_priv->cma_dev);
 	mutex_unlock(&lock);
-	rdma_restrack_add(&id_priv->res);
 	return 0;
 }
 
@@ -746,10 +746,8 @@ static int cma_iw_acquire_dev(struct rdma_id_private *id_priv,
 	}
 
 out:
-	if (!ret) {
+	if (!ret)
 		cma_attach_to_dev(id_priv, cma_dev);
-		rdma_restrack_add(&id_priv->res);
-	}
 
 	mutex_unlock(&lock);
 	return ret;
@@ -810,7 +808,6 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
 
 found:
 	cma_attach_to_dev(id_priv, cma_dev);
-	rdma_restrack_add(&id_priv->res);
 	mutex_unlock(&lock);
 	addr = (struct sockaddr_ib *)cma_src_addr(id_priv);
 	memcpy(&addr->sib_addr, &sgid, sizeof(sgid));
@@ -1852,6 +1849,7 @@ static void _destroy_id(struct rdma_id_private *id_priv,
 {
 	cma_cancel_operation(id_priv, state);
 
+	rdma_restrack_del(&id_priv->res);
 	if (id_priv->cma_dev) {
 		if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
 			if (id_priv->cm_id.ib)
@@ -1861,7 +1859,6 @@ static void _destroy_id(struct rdma_id_private *id_priv,
 				iw_destroy_cm_id(id_priv->cm_id.iw);
 		}
 		cma_leave_mc_groups(id_priv);
-		rdma_restrack_del(&id_priv->res);
 		cma_release_dev(id_priv);
 	}
 
@@ -3208,7 +3205,6 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv)
 	ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
 	id_priv->id.port_num = p;
 	cma_attach_to_dev(id_priv, cma_dev);
-	rdma_restrack_add(&id_priv->res);
 	cma_set_loopback(cma_src_addr(id_priv));
 out:
 	mutex_unlock(&lock);
@@ -3241,7 +3237,6 @@ static void addr_handler(int status, struct sockaddr *src_addr,
 		if (status)
 			pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n",
 					     status);
-		rdma_restrack_add(&id_priv->res);
 	} else if (status) {
 		pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status);
 	}
@@ -3853,12 +3848,12 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
 	if (ret)
 		goto err2;
 
-	if (!cma_any_addr(addr))
-		rdma_restrack_add(&id_priv->res);
 	return 0;
 err2:
 	if (id_priv->cma_dev)
 		cma_release_dev(id_priv);
+	if (!cma_any_addr(addr))
+		rdma_restrack_del(&id_priv->res);
 err1:
 	cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
 	return ret;
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH rdma-rc v2] RDMA/core: Simplify addition of restrack object
  2021-06-08  5:23 [PATCH rdma-rc v2] RDMA/core: Simplify addition of restrack object Leon Romanovsky
@ 2021-06-24 17:48 ` Jason Gunthorpe
  2021-06-27  8:07   ` Leon Romanovsky
  0 siblings, 1 reply; 7+ messages in thread
From: Jason Gunthorpe @ 2021-06-24 17:48 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, Leon Romanovsky, linux-kernel, linux-rdma,
	Pavel Skripkin, Shay Drory

On Tue, Jun 08, 2021 at 08:23:48AM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> Change location of rdma_restrack_add() callers to be near attachment
> to device logic. Such improvement fixes the bug where task_struct was
> acquired but not released, causing to resource leak.
> 
>   ucma_create_id() {
>     ucma_alloc_ctx();
>     rdma_create_user_id() {
>       rdma_restrack_new();
>       rdma_restrack_set_name() {
>         rdma_restrack_attach_task.part.0(); <--- task_struct was gotten
>       }
>     }
>     ucma_destroy_private_ctx() {
>       ucma_put_ctx();
>       rdma_destroy_id() {
>         _destroy_id()                       <--- id_priv was freed
>       }
>     }
>   }

I still don't understand this patch

> @@ -1852,6 +1849,7 @@ static void _destroy_id(struct rdma_id_private *id_priv,
>  {
>  	cma_cancel_operation(id_priv, state);
>  
> +	rdma_restrack_del(&id_priv->res);
>  	if (id_priv->cma_dev) {
>  		if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
>  			if (id_priv->cm_id.ib)
> @@ -1861,7 +1859,6 @@ static void _destroy_id(struct rdma_id_private *id_priv,
>  				iw_destroy_cm_id(id_priv->cm_id.iw);
>  		}
>  		cma_leave_mc_groups(id_priv);
> -		rdma_restrack_del(&id_priv->res);
>  		cma_release_dev(id_priv);

This seems to be the only hunk that is actually necessary, ensuring a
non-added ID is always cleaned up is the necessary step to fixing the
trace above.

What is the rest of this doing?? It looks wrong:

int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
{
[..]
	ret = cma_get_port(id_priv);
	if (ret)
		goto err2;
err2:
[..]
	if (!cma_any_addr(addr))
		rdma_restrack_del(&id_priv->res);

Which means if rdma_bind_addr() fails then restrack will discard the
task, even though the cm_id is still valid! The ucma is free to try
bind again and keep using the ID.

Jason

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH rdma-rc v2] RDMA/core: Simplify addition of restrack object
  2021-06-24 17:48 ` Jason Gunthorpe
@ 2021-06-27  8:07   ` Leon Romanovsky
  2021-06-27 23:15     ` Jason Gunthorpe
  0 siblings, 1 reply; 7+ messages in thread
From: Leon Romanovsky @ 2021-06-27  8:07 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Doug Ledford, linux-kernel, linux-rdma, Pavel Skripkin, Shay Drory

On Thu, Jun 24, 2021 at 02:48:41PM -0300, Jason Gunthorpe wrote:
> On Tue, Jun 08, 2021 at 08:23:48AM +0300, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> > 
> > Change location of rdma_restrack_add() callers to be near attachment
> > to device logic. Such improvement fixes the bug where task_struct was
> > acquired but not released, causing to resource leak.
> > 
> >   ucma_create_id() {
> >     ucma_alloc_ctx();
> >     rdma_create_user_id() {
> >       rdma_restrack_new();
> >       rdma_restrack_set_name() {
> >         rdma_restrack_attach_task.part.0(); <--- task_struct was gotten
> >       }
> >     }
> >     ucma_destroy_private_ctx() {
> >       ucma_put_ctx();
> >       rdma_destroy_id() {
> >         _destroy_id()                       <--- id_priv was freed
> >       }
> >     }
> >   }
> 
> I still don't understand this patch
> 
> > @@ -1852,6 +1849,7 @@ static void _destroy_id(struct rdma_id_private *id_priv,
> >  {
> >  	cma_cancel_operation(id_priv, state);
> >  
> > +	rdma_restrack_del(&id_priv->res);
> >  	if (id_priv->cma_dev) {
> >  		if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
> >  			if (id_priv->cm_id.ib)
> > @@ -1861,7 +1859,6 @@ static void _destroy_id(struct rdma_id_private *id_priv,
> >  				iw_destroy_cm_id(id_priv->cm_id.iw);
> >  		}
> >  		cma_leave_mc_groups(id_priv);
> > -		rdma_restrack_del(&id_priv->res);
> >  		cma_release_dev(id_priv);
> 
> This seems to be the only hunk that is actually necessary, ensuring a
> non-added ID is always cleaned up is the necessary step to fixing the
> trace above.
> 
> What is the rest of this doing?? It looks wrong:
> 
> int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
> {
> [..]
> 	ret = cma_get_port(id_priv);
> 	if (ret)
> 		goto err2;
> err2:
> [..]
> 	if (!cma_any_addr(addr))
> 		rdma_restrack_del(&id_priv->res);
> 
> Which means if rdma_bind_addr() fails then restrack will discard the
> task, even though the cm_id is still valid! The ucma is free to try
> bind again and keep using the ID.

"Failure to bind" means that cma_attach_to_dev() needs to be unwind.

It is the same if rdma_restrack_add() inside that function like in this
patch or in the line before rdma_bind_addr() returns as it was in
previous code.

Thanks

> 
> Jason

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH rdma-rc v2] RDMA/core: Simplify addition of restrack object
  2021-06-27  8:07   ` Leon Romanovsky
@ 2021-06-27 23:15     ` Jason Gunthorpe
  2021-06-28  5:22       ` Leon Romanovsky
  0 siblings, 1 reply; 7+ messages in thread
From: Jason Gunthorpe @ 2021-06-27 23:15 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, linux-kernel, linux-rdma, Pavel Skripkin, Shay Drory

On Sun, Jun 27, 2021 at 11:07:33AM +0300, Leon Romanovsky wrote:
> On Thu, Jun 24, 2021 at 02:48:41PM -0300, Jason Gunthorpe wrote:
> > On Tue, Jun 08, 2021 at 08:23:48AM +0300, Leon Romanovsky wrote:
> > > From: Leon Romanovsky <leonro@nvidia.com>
> > > 
> > > Change location of rdma_restrack_add() callers to be near attachment
> > > to device logic. Such improvement fixes the bug where task_struct was
> > > acquired but not released, causing to resource leak.
> > > 
> > >   ucma_create_id() {
> > >     ucma_alloc_ctx();
> > >     rdma_create_user_id() {
> > >       rdma_restrack_new();
> > >       rdma_restrack_set_name() {
> > >         rdma_restrack_attach_task.part.0(); <--- task_struct was gotten
> > >       }
> > >     }
> > >     ucma_destroy_private_ctx() {
> > >       ucma_put_ctx();
> > >       rdma_destroy_id() {
> > >         _destroy_id()                       <--- id_priv was freed
> > >       }
> > >     }
> > >   }
> > 
> > I still don't understand this patch
> > 
> > > @@ -1852,6 +1849,7 @@ static void _destroy_id(struct rdma_id_private *id_priv,
> > >  {
> > >  	cma_cancel_operation(id_priv, state);
> > >  
> > > +	rdma_restrack_del(&id_priv->res);
> > >  	if (id_priv->cma_dev) {
> > >  		if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
> > >  			if (id_priv->cm_id.ib)
> > > @@ -1861,7 +1859,6 @@ static void _destroy_id(struct rdma_id_private *id_priv,
> > >  				iw_destroy_cm_id(id_priv->cm_id.iw);
> > >  		}
> > >  		cma_leave_mc_groups(id_priv);
> > > -		rdma_restrack_del(&id_priv->res);
> > >  		cma_release_dev(id_priv);
> > 
> > This seems to be the only hunk that is actually necessary, ensuring a
> > non-added ID is always cleaned up is the necessary step to fixing the
> > trace above.
> > 
> > What is the rest of this doing?? It looks wrong:
> > 
> > int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
> > {
> > [..]
> > 	ret = cma_get_port(id_priv);
> > 	if (ret)
> > 		goto err2;
> > err2:
> > [..]
> > 	if (!cma_any_addr(addr))
> > 		rdma_restrack_del(&id_priv->res);
> > 
> > Which means if rdma_bind_addr() fails then restrack will discard the
> > task, even though the cm_id is still valid! The ucma is free to try
> > bind again and keep using the ID.
> 
> "Failure to bind" means that cma_attach_to_dev() needs to be unwind.
> 
> It is the same if rdma_restrack_add() inside that function like in this
> patch or in the line before rdma_bind_addr() returns as it was in
> previous code.

The previous code didn't call restrack_del. restrack_del undoes the
restrack_set_name stuff, not just add - so it does not leave things
back the way it found them

Jason

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH rdma-rc v2] RDMA/core: Simplify addition of restrack object
  2021-06-27 23:15     ` Jason Gunthorpe
@ 2021-06-28  5:22       ` Leon Romanovsky
  2021-06-28 11:38         ` Jason Gunthorpe
  0 siblings, 1 reply; 7+ messages in thread
From: Leon Romanovsky @ 2021-06-28  5:22 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Doug Ledford, linux-kernel, linux-rdma, Pavel Skripkin, Shay Drory

On Sun, Jun 27, 2021 at 08:15:28PM -0300, Jason Gunthorpe wrote:
> On Sun, Jun 27, 2021 at 11:07:33AM +0300, Leon Romanovsky wrote:
> > On Thu, Jun 24, 2021 at 02:48:41PM -0300, Jason Gunthorpe wrote:
> > > On Tue, Jun 08, 2021 at 08:23:48AM +0300, Leon Romanovsky wrote:
> > > > From: Leon Romanovsky <leonro@nvidia.com>
> > > > 
> > > > Change location of rdma_restrack_add() callers to be near attachment
> > > > to device logic. Such improvement fixes the bug where task_struct was
> > > > acquired but not released, causing to resource leak.
> > > > 
> > > >   ucma_create_id() {
> > > >     ucma_alloc_ctx();
> > > >     rdma_create_user_id() {
> > > >       rdma_restrack_new();
> > > >       rdma_restrack_set_name() {
> > > >         rdma_restrack_attach_task.part.0(); <--- task_struct was gotten
> > > >       }
> > > >     }
> > > >     ucma_destroy_private_ctx() {
> > > >       ucma_put_ctx();
> > > >       rdma_destroy_id() {
> > > >         _destroy_id()                       <--- id_priv was freed
> > > >       }
> > > >     }
> > > >   }
> > > 
> > > I still don't understand this patch
> > > 
> > > > @@ -1852,6 +1849,7 @@ static void _destroy_id(struct rdma_id_private *id_priv,
> > > >  {
> > > >  	cma_cancel_operation(id_priv, state);
> > > >  
> > > > +	rdma_restrack_del(&id_priv->res);
> > > >  	if (id_priv->cma_dev) {
> > > >  		if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
> > > >  			if (id_priv->cm_id.ib)
> > > > @@ -1861,7 +1859,6 @@ static void _destroy_id(struct rdma_id_private *id_priv,
> > > >  				iw_destroy_cm_id(id_prgtiv->cm_id.iw);
> > > >  		}
> > > >  		cma_leave_mc_groups(id_priv);
> > > > -		rdma_restrack_del(&id_priv->res);
> > > >  		cma_release_dev(id_priv);
> > > 
> > > This seems to be the only hunk that is actually necessary, ensuring a
> > > non-added ID is always cleaned up is the necessary step to fixing the
> > > trace above.
> > > 
> > > What is the rest of this doing?? It looks wrong:
> > > 
> > > int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
> > > {
> > > [..]
> > > 	ret = cma_get_port(id_priv);
> > > 	if (ret)
> > > 		goto err2;
> > > err2:
> > > [..]
> > > 	if (!cma_any_addr(addr))
> > > 		rdma_restrack_del(&id_priv->res);
> > > 
> > > Which means if rdma_bind_addr() fails then restrack will discard the
> > > task, even though the cm_id is still valid! The ucma is free to try
> > > bind again and keep using the ID.
> > 
> > "Failure to bind" means that cma_attach_to_dev() needs to be unwind.
> > 
> > It is the same if rdma_restrack_add() inside that function like in this
> > patch or in the line before rdma_bind_addr() returns as it was in
> > previous code.
> 
> The previous code didn't call restrack_del. restrack_del undoes the
> restrack_set_name stuff, not just add - so it does not leave things
> back the way it found them

The previous code didn't call to restrack_add and this is why it didn't
call to restrack_del later. In old and new code, we are still calling to
acquire and release dev (cma_acquire_dev_by_src_ip/cma_release_dev) and
this is where the CM_ID is actually attached.

Thanks

> 
> Jason

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH rdma-rc v2] RDMA/core: Simplify addition of restrack object
  2021-06-28  5:22       ` Leon Romanovsky
@ 2021-06-28 11:38         ` Jason Gunthorpe
  2021-06-29  6:40           ` Leon Romanovsky
  0 siblings, 1 reply; 7+ messages in thread
From: Jason Gunthorpe @ 2021-06-28 11:38 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Doug Ledford, linux-kernel, linux-rdma, Pavel Skripkin, Shay Drory

On Mon, Jun 28, 2021 at 08:22:45AM +0300, Leon Romanovsky wrote:
> > The previous code didn't call restrack_del. restrack_del undoes the
> > restrack_set_name stuff, not just add - so it does not leave things
> > back the way it found them
> 
> The previous code didn't call to restrack_add and this is why it didn't
> call to restrack_del later. In old and new code, we are still calling to
> acquire and release dev (cma_acquire_dev_by_src_ip/cma_release_dev) and
> this is where the CM_ID is actually attached.

Which is my point, you can't call restrack_del anyplace except the
final destroy. It cannot be used for error unwinding in these kinds of
functions

Jason

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH rdma-rc v2] RDMA/core: Simplify addition of restrack object
  2021-06-28 11:38         ` Jason Gunthorpe
@ 2021-06-29  6:40           ` Leon Romanovsky
  0 siblings, 0 replies; 7+ messages in thread
From: Leon Romanovsky @ 2021-06-29  6:40 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Doug Ledford, linux-kernel, linux-rdma, Pavel Skripkin, Shay Drory

On Mon, Jun 28, 2021 at 08:38:13AM -0300, Jason Gunthorpe wrote:
> On Mon, Jun 28, 2021 at 08:22:45AM +0300, Leon Romanovsky wrote:
> > > The previous code didn't call restrack_del. restrack_del undoes the
> > > restrack_set_name stuff, not just add - so it does not leave things
> > > back the way it found them
> > 
> > The previous code didn't call to restrack_add and this is why it didn't
> > call to restrack_del later. In old and new code, we are still calling to
> > acquire and release dev (cma_acquire_dev_by_src_ip/cma_release_dev) and
> > this is where the CM_ID is actually attached.
> 
> Which is my point, you can't call restrack_del anyplace except the
> final destroy. It cannot be used for error unwinding in these kinds of
> functions

ok, let's remove the controversial hunks.

> 
> Jason

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-06-29  6:40 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-08  5:23 [PATCH rdma-rc v2] RDMA/core: Simplify addition of restrack object Leon Romanovsky
2021-06-24 17:48 ` Jason Gunthorpe
2021-06-27  8:07   ` Leon Romanovsky
2021-06-27 23:15     ` Jason Gunthorpe
2021-06-28  5:22       ` Leon Romanovsky
2021-06-28 11:38         ` Jason Gunthorpe
2021-06-29  6:40           ` Leon Romanovsky

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.