Currently the rdma_rxe driver and its user space provider exchange addressing information for UD sends by having the provider compute an address vector (AV) and send it with each WQE. This is not the way that the RDMA verbs API was intended to operate. This series of patches modifies the way UD send WQEs work by exchanging an index identifying the AH replacing the 88 byte AV by a 4 byte AH index. In order to not break compatibility with the existing API the rdma_rxe driver will recognise when an older version of the provider is not sending an index (i.e. it is 0) and will use the AV instead. v3: Split up commits into smaller steps. v2: Rearranged AV in rxe_send_wqe to be in the ud struct but padded to the same offset as the original preserving ABI compatibility. Bob Pearson (6): RDMA/rxe: Move AV from rxe_send_wqe to rxe_send_wr RDMA/rxe: Change AH objects to indexed RDMA/rxe: Create AH index and return to user space RDMA/rxe: Replace ah->pd by ah->ibah.pd RDMA/rxe: Lookup kernel AH from ah index in UD WQEs RDMA/rxe: Convert kernel UD post send to use ah_num drivers/infiniband/sw/rxe/rxe_av.c | 20 +++++++++++++- drivers/infiniband/sw/rxe/rxe_param.h | 4 ++- drivers/infiniband/sw/rxe/rxe_pool.c | 4 ++- drivers/infiniband/sw/rxe/rxe_req.c | 8 +++--- drivers/infiniband/sw/rxe/rxe_verbs.c | 39 ++++++++++++++++++++++----- drivers/infiniband/sw/rxe/rxe_verbs.h | 8 +++++- include/uapi/rdma/rdma_user_rxe.h | 10 ++++++- 7 files changed, 79 insertions(+), 14 deletions(-) -- 2.30.2
Move the struct rxe_av av from rxe_send_wqe to rxe_send_wr placing it in wr.ud at the same offset as it was previously. This has the effect of increasing the size of struct rxe_send_wr and keeping the size of struct rxe_send_wqe the same. This better reflects the use of this field which is only used for UD sends. This change has no effect on ABI compatibility so the modified rxe driver will operate with older versions of rdma-core. Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> --- v3: Per Jason's suggestion split up this patch into two pieces one of which just moves AV and combine adding the ah_num changes to the third patch. drivers/infiniband/sw/rxe/rxe_av.c | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.c | 3 ++- include/uapi/rdma/rdma_user_rxe.h | 4 +++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c index da2e867a1ed9..85580ea5eed0 100644 --- a/drivers/infiniband/sw/rxe/rxe_av.c +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -107,5 +107,5 @@ struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt) if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC) return &pkt->qp->pri_av; - return (pkt->wqe) ? &pkt->wqe->av : NULL; + return (pkt->wqe) ? &pkt->wqe->wr.wr.ud.av : NULL; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f7b1a1f64c13..725015a2e84c 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -605,7 +605,8 @@ static void init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, if (qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_SMI || qp_type(qp) == IB_QPT_GSI) - memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av)); + memcpy(&wqe->wr.wr.ud.av, &to_rah(ud_wr(ibwr)->ah)->av, + sizeof(struct rxe_av)); if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) copy_inline_data_to_wqe(wqe, ibwr); diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index e283c2220aba..2f1ebbe96434 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -98,6 +98,9 @@ struct rxe_send_wr { __u32 remote_qpn; __u32 remote_qkey; __u16 pkey_index; + __u16 reserved; + __u32 pad[5]; + struct rxe_av av; } ud; struct { __aligned_u64 addr; @@ -148,7 +151,6 @@ struct rxe_dma_info { struct rxe_send_wqe { struct rxe_send_wr wr; - struct rxe_av av; __u32 status; __u32 state; __aligned_u64 iova; -- 2.30.2
Make changes to rxe_param.h and rxe_pool.c to allow indexing of AH objects. Valid indices are non-zero so older providers can be detected. Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> --- drivers/infiniband/sw/rxe/rxe_param.h | 4 +++- drivers/infiniband/sw/rxe/rxe_pool.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 742e6ec93686..ec5c6331bee8 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -67,7 +67,9 @@ enum rxe_device_param { RXE_MAX_MCAST_GRP = 8192, RXE_MAX_MCAST_QP_ATTACH = 56, RXE_MAX_TOT_MCAST_QP_ATTACH = 0x70000, - RXE_MAX_AH = 100, + RXE_MAX_AH = 16383, + RXE_MIN_AH_INDEX = 1, + RXE_MAX_AH_INDEX = 16383, RXE_MAX_SRQ_WR = 0x4000, RXE_MIN_SRQ_WR = 1, RXE_MAX_SRQ_SGE = 27, diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 0b8e7c6255a2..342f090152d1 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -26,7 +26,9 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { .name = "rxe-ah", .size = sizeof(struct rxe_ah), .elem_offset = offsetof(struct rxe_ah, pelem), - .flags = RXE_POOL_NO_ALLOC, + .flags = RXE_POOL_INDEX | RXE_POOL_NO_ALLOC, + .min_index = RXE_MIN_AH_INDEX, + .max_index = RXE_MAX_AH_INDEX, }, [RXE_TYPE_SRQ] = { .name = "rxe-srq", -- 2.30.2
Make changes to rdma_user_rxe.h to allow indexing AH objects, passing the index in UD send WRs to the driver and returning the index to the rxe provider. Modify rxe_create_ah() to add an index to AH when created and if called from a new user provider return it to user space. If called from an old provider mark the AH as not having a useful index. Modify rxe_destroy_ah to drop the index before deleting the object. Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> --- drivers/infiniband/sw/rxe/rxe_verbs.c | 31 ++++++++++++++++++++++++++- drivers/infiniband/sw/rxe/rxe_verbs.h | 2 ++ include/uapi/rdma/rdma_user_rxe.h | 8 ++++++- 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 725015a2e84c..7181e21f0c55 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -161,9 +161,19 @@ static int rxe_create_ah(struct ib_ah *ibah, struct ib_udata *udata) { - int err; struct rxe_dev *rxe = to_rdev(ibah->device); struct rxe_ah *ah = to_rah(ibah); + struct rxe_create_ah_resp __user *uresp = NULL; + int err; + + if (udata) { + /* test if new user provider */ + if (udata->outlen >= sizeof(*uresp)) + uresp = udata->outbuf; + ah->is_user = true; + } else { + ah->is_user = false; + } err = rxe_av_chk_attr(rxe, init_attr->ah_attr); if (err) @@ -173,6 +183,24 @@ static int rxe_create_ah(struct ib_ah *ibah, if (err) return err; + /* create index > 0 */ + rxe_add_index(ah); + ah->ah_num = ah->pelem.index; + + if (uresp) { + /* only if new user provider */ + err = copy_to_user(&uresp->ah_num, &ah->ah_num, + sizeof(uresp->ah_num)); + if (err) { + rxe_drop_index(ah); + rxe_drop_ref(ah); + return -EFAULT; + } + } else if (ah->is_user) { + /* only if old user provider */ + ah->ah_num = 0; + } + rxe_init_av(init_attr->ah_attr, &ah->av); return 0; } @@ -205,6 +233,7 @@ static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) { struct rxe_ah *ah = to_rah(ibah); + rxe_drop_index(ah); rxe_drop_ref(ah); return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 959a3260fcab..500c47d84500 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -48,6 +48,8 @@ struct rxe_ah { struct rxe_pool_entry pelem; struct rxe_pd *pd; struct rxe_av av; + bool is_user; + int ah_num; }; struct rxe_cqe { diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index 2f1ebbe96434..dc9f7a5e203a 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -99,7 +99,8 @@ struct rxe_send_wr { __u32 remote_qkey; __u16 pkey_index; __u16 reserved; - __u32 pad[5]; + __u32 ah_num; + __u32 pad[4]; struct rxe_av av; } ud; struct { @@ -170,6 +171,11 @@ struct rxe_recv_wqe { struct rxe_dma_info dma; }; +struct rxe_create_ah_resp { + __u32 ah_num; + __u32 reserved; +}; + struct rxe_create_cq_resp { struct mminfo mi; }; -- 2.30.2
The pd field in struct rxe_ah is redundant with the pd field in the rdma-core's ib_ah. Eliminate the pd field in rxe_ah and add an inline to extract the pd from the ibah field. Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> --- drivers/infiniband/sw/rxe/rxe_verbs.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 500c47d84500..6e48dcd21c5f 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -46,7 +46,6 @@ struct rxe_pd { struct rxe_ah { struct ib_ah ibah; struct rxe_pool_entry pelem; - struct rxe_pd *pd; struct rxe_av av; bool is_user; int ah_num; @@ -486,6 +485,11 @@ static inline u32 mr_rkey(struct rxe_mr *mr) return mr->ibmr.rkey; } +static inline struct rxe_pd *rxe_ah_pd(struct rxe_ah *ah) +{ + return to_rpd(ah->ibah.pd); +} + static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw) { return to_rpd(mw->ibmw.pd); -- 2.30.2
Add code to rxe_get_av in rxe_av.c to use the AH index in UD send WQEs to lookup the kernel AH. For old user providers continue to use the AV passed in WQEs. Move setting pkt->rxe to before the call to rxe_get_av() to get access to the AH pool. Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> --- drivers/infiniband/sw/rxe/rxe_av.c | 20 +++++++++++++++++++- drivers/infiniband/sw/rxe/rxe_req.c | 8 +++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c index 85580ea5eed0..38c7b6fb39d7 100644 --- a/drivers/infiniband/sw/rxe/rxe_av.c +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -101,11 +101,29 @@ void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr) struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt) { + struct rxe_ah *ah; + u32 ah_num; + if (!pkt || !pkt->qp) return NULL; if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC) return &pkt->qp->pri_av; - return (pkt->wqe) ? &pkt->wqe->wr.wr.ud.av : NULL; + if (!pkt->wqe) + return NULL; + + ah_num = pkt->wqe->wr.wr.ud.ah_num; + if (ah_num) { + /* only new user provider or kernel client */ + ah = rxe_pool_get_index(&pkt->rxe->ah_pool, ah_num); + if (!ah || ah->ah_num != ah_num || rxe_ah_pd(ah) != pkt->qp->pd) { + pr_warn("Unable to find AH matching ah_num\n"); + return NULL; + } + return &ah->av; + } + + /* only old user provider for UD sends*/ + return &pkt->wqe->wr.wr.ud.av; } diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 3894197a82f6..86979a9e7afc 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -390,9 +390,8 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, /* length from start of bth to end of icrc */ paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; - /* pkt->hdr, rxe, port_num and mask are initialized in ifc - * layer - */ + /* pkt->hdr, port_num and mask are initialized in ifc layer */ + pkt->rxe = rxe; pkt->opcode = opcode; pkt->qp = qp; pkt->psn = qp->req.psn; @@ -402,6 +401,9 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, /* init skb */ av = rxe_get_av(pkt); + if (!av) + return NULL; + skb = rxe_init_packet(rxe, av, paylen, pkt); if (unlikely(!skb)) return NULL; -- 2.30.2
Modify ib_post_send for kernel UD sends to put the AH index into the WQE instead of the address vector. Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> --- drivers/infiniband/sw/rxe/rxe_verbs.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 7181e21f0c55..ee3f70c8809a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -558,8 +558,11 @@ static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, if (qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_SMI || qp_type(qp) == IB_QPT_GSI) { + struct ib_ah *ibah = ud_wr(ibwr)->ah; + wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn; wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey; + wr->wr.ud.ah_num = to_rah(ibah)->ah_num; if (qp_type(qp) == IB_QPT_GSI) wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index; if (wr->opcode == IB_WR_SEND_WITH_IMM) @@ -631,12 +634,6 @@ static void init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, return; } - if (qp_type(qp) == IB_QPT_UD || - qp_type(qp) == IB_QPT_SMI || - qp_type(qp) == IB_QPT_GSI) - memcpy(&wqe->wr.wr.ud.av, &to_rah(ud_wr(ibwr)->ah)->av, - sizeof(struct rxe_av)); - if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) copy_inline_data_to_wqe(wqe, ibwr); else -- 2.30.2
On Thu, Jul 22, 2021 at 04:22:42PM -0500, Bob Pearson wrote: > Make changes to rdma_user_rxe.h to allow indexing AH objects, passing > the index in UD send WRs to the driver and returning the index to the rxe > provider. > > Modify rxe_create_ah() to add an index to AH when created and if > called from a new user provider return it to user space. If called > from an old provider mark the AH as not having a useful index. > Modify rxe_destroy_ah to drop the index before deleting the object. > > Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com> > drivers/infiniband/sw/rxe/rxe_verbs.c | 31 ++++++++++++++++++++++++++- > drivers/infiniband/sw/rxe/rxe_verbs.h | 2 ++ > include/uapi/rdma/rdma_user_rxe.h | 8 ++++++- > 3 files changed, 39 insertions(+), 2 deletions(-) > > diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c > index 725015a2e84c..7181e21f0c55 100644 > +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c > @@ -161,9 +161,19 @@ static int rxe_create_ah(struct ib_ah *ibah, > struct ib_udata *udata) > > { > - int err; > struct rxe_dev *rxe = to_rdev(ibah->device); > struct rxe_ah *ah = to_rah(ibah); > + struct rxe_create_ah_resp __user *uresp = NULL; > + int err; > + > + if (udata) { > + /* test if new user provider */ > + if (udata->outlen >= sizeof(*uresp)) > + uresp = udata->outbuf; > + ah->is_user = true; > + } else { > + ah->is_user = false; > + } > > err = rxe_av_chk_attr(rxe, init_attr->ah_attr); > if (err) > @@ -173,6 +183,24 @@ static int rxe_create_ah(struct ib_ah *ibah, > if (err) > return err; > > + /* create index > 0 */ > + rxe_add_index(ah); > + ah->ah_num = ah->pelem.index; > + > + if (uresp) { > + /* only if new user provider */ > + err = copy_to_user(&uresp->ah_num, &ah->ah_num, > + sizeof(uresp->ah_num)); This should just a using a min_t(size_t udata->outlen, sizeof(*uresp)) And this needs to create the uresp on the stack and copy the whole thing, not try to copy single values at a time > diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h > index 959a3260fcab..500c47d84500 100644 > +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h > @@ -48,6 +48,8 @@ struct rxe_ah { > struct rxe_pool_entry pelem; > struct rxe_pd *pd; > struct rxe_av av; > + bool is_user; > + int ah_num; unsigned int? Jason
On Thu, Jul 22, 2021 at 04:22:44PM -0500, Bob Pearson wrote:
> Add code to rxe_get_av in rxe_av.c to use the AH index in UD send WQEs
> to lookup the kernel AH. For old user providers continue to use the AV
> passed in WQEs. Move setting pkt->rxe to before the call to rxe_get_av()
> to get access to the AH pool.
>
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> drivers/infiniband/sw/rxe/rxe_av.c | 20 +++++++++++++++++++-
> drivers/infiniband/sw/rxe/rxe_req.c | 8 +++++---
> 2 files changed, 24 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c
> index 85580ea5eed0..38c7b6fb39d7 100644
> +++ b/drivers/infiniband/sw/rxe/rxe_av.c
> @@ -101,11 +101,29 @@ void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr)
>
> struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt)
> {
> + struct rxe_ah *ah;
> + u32 ah_num;
> +
> if (!pkt || !pkt->qp)
> return NULL;
>
> if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC)
> return &pkt->qp->pri_av;
>
> - return (pkt->wqe) ? &pkt->wqe->wr.wr.ud.av : NULL;
> + if (!pkt->wqe)
> + return NULL;
> +
> + ah_num = pkt->wqe->wr.wr.ud.ah_num;
> + if (ah_num) {
> + /* only new user provider or kernel client */
> + ah = rxe_pool_get_index(&pkt->rxe->ah_pool, ah_num);
rxe_pool_get_index() incr's a kref, but I don't see any put of that
kref in this code?
Jason