linux-rdma.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH RFC] rdma_rxe: Stop passing AV from user space
@ 2020-10-16 17:01 Bob Pearson
  2020-10-19 18:53 ` Jason Gunthorpe
  0 siblings, 1 reply; 4+ messages in thread
From: Bob Pearson @ 2020-10-16 17:01 UTC (permalink / raw)
  To: jgg, linux-rdma; +Cc: Bob Pearson

This patch shows it is possible to replace the current
method used by rdma_rxe to handle UD transport where
user space creates and then passes an rxe_av *av to the
kernel driver as part of each rxe_send_wqe.

Here user space passes the handle created by the call to
ibv_create_ah in the send wqe. The kernel driver uses that
handle to get a pointer to the 'real' rxe_av that was
created in the kernel by the create ah verb.

To do this requires executing code in the driver that mimics
what is done in rdma_core to convert handles to objects.
This is not ideal but gets the job done. It would probably be
better for rdma_core to provide a service that can do this
for software drivers that have to do this.

The alternative (used by the MW code) is to create a driver
private index and pass that back from the create verb to
user space and then have user space use that in the send wqe.

I would like to avoid replicating work already being done by
rdma core by using the handle that already exists. I don't
know the relative performance of the lookup in rdma_core and
the red black tree used by rxe currently.

There is a matching change to provider/rxe.

Signed-off-by: Bob Pearson <rpearson@hpe.com>
---

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index ffe11b03724c..faa70c4f14bb 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -425,6 +425,7 @@ struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
 	uverbs_uobject_put(uobj);
 	return ERR_PTR(ret);
 }
+EXPORT_SYMBOL(rdma_lookup_get_uobject);
 
 static struct ib_uobject *
 alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
@@ -726,6 +727,7 @@ void rdma_lookup_put_uobject(struct ib_uobject *uobj,
 	/* Pairs with the kref obtained by type->lookup_get */
 	uverbs_uobject_put(uobj);
 }
+EXPORT_SYMBOL(rdma_lookup_put_uobject);
 
 void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile)
 {
diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c
index 38021e2c8688..db83e1cde38e 100644
--- a/drivers/infiniband/sw/rxe/rxe_av.c
+++ b/drivers/infiniband/sw/rxe/rxe_av.c
@@ -6,6 +6,8 @@
 
 #include "rxe.h"
 #include "rxe_loc.h"
+#include "../../core/uverbs.h"
+#include "../../core/rdma_core.h"
 
 void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av)
 {
@@ -72,13 +74,50 @@ void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr)
 	av->network_type = rdma_gid_attr_network_type(sgid_attr);
 }
 
+static struct ib_ah *get_ah_from_handle(struct rxe_qp *qp, u32 handle)
+{
+	struct ib_uverbs_file *ufile;
+	struct uverbs_api *uapi;
+	const struct uverbs_api_object *type;
+	struct ib_uobject *uobj;
+
+	ufile = qp->ibqp.uobject->uevent.uobject.ufile;
+	uapi = ufile->device->uapi;
+	type = uapi_get_object(uapi, UVERBS_OBJECT_AH);
+	if (IS_ERR(type))
+		return NULL;
+	uobj = rdma_lookup_get_uobject(type, ufile, (s64)handle,
+				       UVERBS_LOOKUP_READ, NULL);
+	if (IS_ERR(uobj)) {
+		pr_warn("unable to lookup ah handle\n");
+		return NULL;
+	}
+
+	rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ);
+	return uobj->object;
+}
+
 struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt)
 {
-	if (!pkt || !pkt->qp)
+	struct rxe_qp *qp = pkt->qp;
+	struct rxe_send_wqe *wqe = pkt->wqe;
+	u32 handle;
+	struct ib_ah *ibah;
+	struct rxe_ah *ah;
+
+	if (!pkt || !qp)
 		return NULL;
 
-	if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC)
-		return &pkt->qp->pri_av;
+	if (qp_type(qp) == IB_QPT_RC || qp_type(qp) == IB_QPT_UC)
+		return &qp->pri_av;
+
+	if (qp->is_user) {
+		handle = wqe->wr.wr.ud.ah_handle;
+		ibah = get_ah_from_handle(qp, handle);
+	} else {
+		ibah = wqe->wr.wr.ud.ah;
+	}
 
-	return (pkt->wqe) ? &pkt->wqe->av : NULL;
+	ah = to_rah(ibah);
+	return &ah->av;
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index d4917646641a..2a887dca9a28 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -380,12 +380,16 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp,
 
 	/* init skb */
 	av = rxe_get_av(pkt);
+	if (!av) {
+		pr_warn("unable to get av\n");
+		return NULL;
+	}
 	skb = rxe_init_packet(rxe, av, paylen, pkt);
 	if (unlikely(!skb))
 		return NULL;
 
 	/* init bth */
-	solicited = (ibwr->send_flags & IB_SEND_SOLICITED) &&
+	solicited = (ibwr->send_flags & IB_SEND_SOLICITED) &&
 			(pkt->mask & RXE_END_MASK) &&
 			((pkt->mask & (RXE_SEND_MASK)) ||
 			(pkt->mask & (RXE_WRITE_MASK | RXE_IMMDT_MASK)) ==
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 0a7d7c55d8d6..4d4ded90f793 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -506,6 +506,7 @@ static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr,
 	if (qp_type(qp) == IB_QPT_UD ||
 	    qp_type(qp) == IB_QPT_SMI ||
 	    qp_type(qp) == IB_QPT_GSI) {
+		wr->wr.ud.ah = ud_wr(ibwr)->ah;
 		wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn;
 		wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey;
 		if (qp_type(qp) == IB_QPT_GSI)
@@ -562,11 +563,6 @@ static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr,
 
 	init_send_wr(qp, &wqe->wr, ibwr);
 
-	if (qp_type(qp) == IB_QPT_UD ||
-	    qp_type(qp) == IB_QPT_SMI ||
-	    qp_type(qp) == IB_QPT_GSI)
-		memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av));
-
 	if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) {
 		p = wqe->dma.inline_data;
 
diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h
index d8f2e0e46dab..d57271451052 100644
--- a/include/uapi/rdma/rdma_user_rxe.h
+++ b/include/uapi/rdma/rdma_user_rxe.h
@@ -89,6 +89,10 @@ struct rxe_send_wr {
 			__u32	reserved;
 		} atomic;
 		struct {
+			union {
+				__aligned_u64 ah_handle;
+				struct ib_ah *ah;
+			};
 			__u32	remote_qpn;
 			__u32	remote_qkey;
 			__u16	pkey_index;
@@ -132,7 +136,6 @@ struct rxe_dma_info {
 
 struct rxe_send_wqe {
 	struct rxe_send_wr	wr;
-	struct rxe_av		av;
 	__u32			status;
 	__u32			state;
 	__aligned_u64		iova;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH RFC] rdma_rxe: Stop passing AV from user space
  2020-10-16 17:01 [PATCH RFC] rdma_rxe: Stop passing AV from user space Bob Pearson
@ 2020-10-19 18:53 ` Jason Gunthorpe
  2020-10-19 19:06   ` Bob Pearson
  0 siblings, 1 reply; 4+ messages in thread
From: Jason Gunthorpe @ 2020-10-19 18:53 UTC (permalink / raw)
  To: Bob Pearson; +Cc: linux-rdma, Bob Pearson

On Fri, Oct 16, 2020 at 12:01:48PM -0500, Bob Pearson wrote:
>  
> +static struct ib_ah *get_ah_from_handle(struct rxe_qp *qp, u32 handle)
> +{
> +	struct ib_uverbs_file *ufile;
> +	struct uverbs_api *uapi;
> +	const struct uverbs_api_object *type;
> +	struct ib_uobject *uobj;
> +
> +	ufile = qp->ibqp.uobject->uevent.uobject.ufile;
> +	uapi = ufile->device->uapi;
> +	type = uapi_get_object(uapi, UVERBS_OBJECT_AH);
> +	if (IS_ERR(type))
> +		return NULL;
> +	uobj = rdma_lookup_get_uobject(type, ufile, (s64)handle,
> +				       UVERBS_LOOKUP_READ, NULL);
> +	if (IS_ERR(uobj)) {
> +		pr_warn("unable to lookup ah handle\n");
> +		return NULL;
> +	}
> +
> +	rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ);

It can't be put and then return the data pointer, it is a use after free:

> +	return uobj->object;

> @@ -562,11 +563,6 @@ static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr,
>  
>  	init_send_wr(qp, &wqe->wr, ibwr);
>  
> -	if (qp_type(qp) == IB_QPT_UD ||
> -	    qp_type(qp) == IB_QPT_SMI ||
> -	    qp_type(qp) == IB_QPT_GSI)
> -		memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av));

It needs some kind of negotiated compat, can't just break userspace
like this

Jason

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH RFC] rdma_rxe: Stop passing AV from user space
  2020-10-19 18:53 ` Jason Gunthorpe
@ 2020-10-19 19:06   ` Bob Pearson
  2020-10-19 23:00     ` Jason Gunthorpe
  0 siblings, 1 reply; 4+ messages in thread
From: Bob Pearson @ 2020-10-19 19:06 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: linux-rdma, Bob Pearson

On 10/19/20 1:53 PM, Jason Gunthorpe wrote:
> On Fri, Oct 16, 2020 at 12:01:48PM -0500, Bob Pearson wrote:
>>  
>> +static struct ib_ah *get_ah_from_handle(struct rxe_qp *qp, u32 handle)
>> +{
>> +	struct ib_uverbs_file *ufile;
>> +	struct uverbs_api *uapi;
>> +	const struct uverbs_api_object *type;
>> +	struct ib_uobject *uobj;
>> +
>> +	ufile = qp->ibqp.uobject->uevent.uobject.ufile;
>> +	uapi = ufile->device->uapi;
>> +	type = uapi_get_object(uapi, UVERBS_OBJECT_AH);
>> +	if (IS_ERR(type))
>> +		return NULL;
>> +	uobj = rdma_lookup_get_uobject(type, ufile, (s64)handle,
>> +				       UVERBS_LOOKUP_READ, NULL);
>> +	if (IS_ERR(uobj)) {
>> +		pr_warn("unable to lookup ah handle\n");
>> +		return NULL;
>> +	}
>> +
>> +	rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ);
> 
> It can't be put and then return the data pointer, it is a use after free:
> 
>> +	return uobj->object;
> 
>> @@ -562,11 +563,6 @@ static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr,
>>  
>>  	init_send_wr(qp, &wqe->wr, ibwr);
>>  
>> -	if (qp_type(qp) == IB_QPT_UD ||
>> -	    qp_type(qp) == IB_QPT_SMI ||
>> -	    qp_type(qp) == IB_QPT_GSI)
>> -		memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av));
> 
> It needs some kind of negotiated compat, can't just break userspace
> like this
> 
> Jason
> 

1st point. I get it. uobj->object contains the address of one of the ib_xxx verbs objects.
Normally the driver never looks at this level but presumably has a kref on that object so it makes
sense to look it up. Perhaps better would be:

	void *object;

	...

	uobj = rdma_lookup_get_uobject(...);

	object = uobj->object;

	rdma_lookup_put_uobject(...);

	return (struct ib_ah *)object;

Here the caller has created the ib_ah but has not yet destroyed it so it must hold a kref on it.


2nd point. I also get. This suggestion imagines that there will come a day when we can change the user API.
May be a rare day but must happen occasionally. The current design is just plain wrong and needs to get fixed
eventually.

Thanks for looking at this.

Bob

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH RFC] rdma_rxe: Stop passing AV from user space
  2020-10-19 19:06   ` Bob Pearson
@ 2020-10-19 23:00     ` Jason Gunthorpe
  0 siblings, 0 replies; 4+ messages in thread
From: Jason Gunthorpe @ 2020-10-19 23:00 UTC (permalink / raw)
  To: Bob Pearson; +Cc: linux-rdma, Bob Pearson

On Mon, Oct 19, 2020 at 02:06:30PM -0500, Bob Pearson wrote:
> On 10/19/20 1:53 PM, Jason Gunthorpe wrote:
> > On Fri, Oct 16, 2020 at 12:01:48PM -0500, Bob Pearson wrote:
> >>  
> >> +static struct ib_ah *get_ah_from_handle(struct rxe_qp *qp, u32 handle)
> >> +{
> >> +	struct ib_uverbs_file *ufile;
> >> +	struct uverbs_api *uapi;
> >> +	const struct uverbs_api_object *type;
> >> +	struct ib_uobject *uobj;
> >> +
> >> +	ufile = qp->ibqp.uobject->uevent.uobject.ufile;
> >> +	uapi = ufile->device->uapi;
> >> +	type = uapi_get_object(uapi, UVERBS_OBJECT_AH);
> >> +	if (IS_ERR(type))
> >> +		return NULL;
> >> +	uobj = rdma_lookup_get_uobject(type, ufile, (s64)handle,
> >> +				       UVERBS_LOOKUP_READ, NULL);
> >> +	if (IS_ERR(uobj)) {
> >> +		pr_warn("unable to lookup ah handle\n");
> >> +		return NULL;
> >> +	}
> >> +
> >> +	rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ);
> > 
> > It can't be put and then return the data pointer, it is a use after free:
> > 
> >> +	return uobj->object;
> > 
> >> @@ -562,11 +563,6 @@ static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr,
> >>  
> >>  	init_send_wr(qp, &wqe->wr, ibwr);
> >>  
> >> -	if (qp_type(qp) == IB_QPT_UD ||
> >> -	    qp_type(qp) == IB_QPT_SMI ||
> >> -	    qp_type(qp) == IB_QPT_GSI)
> >> -		memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av));
> > 
> > It needs some kind of negotiated compat, can't just break userspace
> > like this
> > 
> > Jason
> > 
> 
> 1st point. I get it. uobj->object contains the address of one of the ib_xxx verbs objects.
> Normally the driver never looks at this level but presumably has a kref on that object so it makes
> sense to look it up. Perhaps better would be:
> 
> 	void *object;
> 
> 	...
> 
> 	uobj = rdma_lookup_get_uobject(...);
> 
> 	object = uobj->object;
> 
> 	rdma_lookup_put_uobject(...);
> 
> 	return (struct ib_ah *)object;
> 
> Here the caller has created the ib_ah but has not yet destroyed it so it must hold a kref on it.

Drivers are not supposed to keep using object after it has been
destroyed, so some kind of interlock is needed to prevent/defer
destruction here.

The uobject layer does not provide something usable to drivers

> 2nd point. I also get. This suggestion imagines that there will come
> a day when we can change the user API.  May be a rare day but must
> happen occasionally. The current design is just plain wrong and
> needs to get fixed eventually.

You can have some cap negotiation to switch the mode AH mode in the
WQEs - 'Use WQE format 2' for instance. Most of the HW drivers have
multiple WQE formats the userspace selects.

Jason

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2020-10-19 23:00 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-16 17:01 [PATCH RFC] rdma_rxe: Stop passing AV from user space Bob Pearson
2020-10-19 18:53 ` Jason Gunthorpe
2020-10-19 19:06   ` Bob Pearson
2020-10-19 23:00     ` Jason Gunthorpe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).