All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH for-next v3] RDMA/rxe: Fix ib_device reference counting (again)
@ 2021-03-04 19:20 Bob Pearson
  2021-03-05  3:10 ` Zhu Yanjun
  2021-03-05 18:20 ` Jason Gunthorpe
  0 siblings, 2 replies; 5+ messages in thread
From: Bob Pearson @ 2021-03-04 19:20 UTC (permalink / raw)
  To: jgg, zyjzyj2000, linux-rdma; +Cc: Bob Pearson

From: Bob Pearson <rpearsonhpe@gmail.com>

Three errors occurred in the fix referenced below.

1) rxe_rcv_mcast_pkt() dropped a reference to ib_device when
no error occurred causing an underflow on the reference counter.
This code is cleaned up to be clearer and easier to read.

2) Extending the reference taken by rxe_get_dev_from_net() in
rxe_udp_encap_recv() until each skb is freed was not matched by
a reference in the loopback path resulting in underflows.

3) In rxe_comp.c in rxe_completer() the function free_pkt() did
not clear skb which triggered a warning at done: and could possibly
at exit: . The WARN_ONCE() calls are not actually needed.
The call to free_pkt() is moved to the end to clearly show that
all skbs are freed.

This patch fixes these errors.

Fixes: 899aba891cab ("RDMA/rxe: Fix FIXME in rxe_udp_encap_recv()")
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
Version 3:
V2 of this patch had spelling errors and style issues which are
fixed in this version.

Version 2:
v1 of this patch incorrectly added a WARN_ON_ONCE in rxe_completer
where it could be triggered for normal traffic. This version
replaced that with a pr_warn located correctly.

v1 of this patch placed a call to kfree_skb in an if statement
that could trigger style warnings. This version cleans that up.

 drivers/infiniband/sw/rxe/rxe_comp.c | 55 +++++++++++---------------
 drivers/infiniband/sw/rxe/rxe_net.c  | 10 ++++-
 drivers/infiniband/sw/rxe/rxe_recv.c | 59 +++++++++++++++++-----------
 3 files changed, 67 insertions(+), 57 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index a8ac791a1bb9..17a361b8dbb1 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -547,6 +547,7 @@ int rxe_completer(void *arg)
 	struct sk_buff *skb = NULL;
 	struct rxe_pkt_info *pkt = NULL;
 	enum comp_state state;
+	int ret = 0;
 
 	rxe_add_ref(qp);
 
@@ -554,7 +555,8 @@ int rxe_completer(void *arg)
 	    qp->req.state == QP_STATE_RESET) {
 		rxe_drain_resp_pkts(qp, qp->valid &&
 				    qp->req.state == QP_STATE_ERROR);
-		goto exit;
+		ret = -EAGAIN;
+		goto done;
 	}
 
 	if (qp->comp.timeout) {
@@ -564,8 +566,10 @@ int rxe_completer(void *arg)
 		qp->comp.timeout_retry = 0;
 	}
 
-	if (qp->req.need_retry)
-		goto exit;
+	if (qp->req.need_retry) {
+		ret = -EAGAIN;
+		goto done;
+	}
 
 	state = COMPST_GET_ACK;
 
@@ -636,8 +640,6 @@ int rxe_completer(void *arg)
 			break;
 
 		case COMPST_DONE:
-			if (pkt)
-				free_pkt(pkt);
 			goto done;
 
 		case COMPST_EXIT:
@@ -660,7 +662,8 @@ int rxe_completer(void *arg)
 			    qp->qp_timeout_jiffies)
 				mod_timer(&qp->retrans_timer,
 					  jiffies + qp->qp_timeout_jiffies);
-			goto exit;
+			ret = -EAGAIN;
+			goto done;
 
 		case COMPST_ERROR_RETRY:
 			/* we come here if the retry timer fired and we did
@@ -672,18 +675,18 @@ int rxe_completer(void *arg)
 			 */
 
 			/* there is nothing to retry in this case */
-			if (!wqe || (wqe->state == wqe_state_posted))
-				goto exit;
+			if (!wqe || (wqe->state == wqe_state_posted)) {
+				pr_warn("Retry attempted without a valid wqe\n");
+				ret = -EAGAIN;
+				goto done;
+			}
 
 			/* if we've started a retry, don't start another
 			 * retry sequence, unless this is a timeout.
 			 */
 			if (qp->comp.started_retry &&
-			    !qp->comp.timeout_retry) {
-				if (pkt)
-					free_pkt(pkt);
+			    !qp->comp.timeout_retry)
 				goto done;
-			}
 
 			if (qp->comp.retry_cnt > 0) {
 				if (qp->comp.retry_cnt != 7)
@@ -704,8 +707,6 @@ int rxe_completer(void *arg)
 					qp->comp.started_retry = 1;
 					rxe_run_task(&qp->req.task, 0);
 				}
-				if (pkt)
-					free_pkt(pkt);
 				goto done;
 
 			} else {
@@ -726,8 +727,8 @@ int rxe_completer(void *arg)
 				mod_timer(&qp->rnr_nak_timer,
 					  jiffies + rnrnak_jiffies(aeth_syn(pkt)
 						& ~AETH_TYPE_MASK));
-				free_pkt(pkt);
-				goto exit;
+				ret = -EAGAIN;
+				goto done;
 			} else {
 				rxe_counter_inc(rxe,
 						RXE_CNT_RNR_RETRY_EXCEEDED);
@@ -740,25 +741,15 @@ int rxe_completer(void *arg)
 			WARN_ON_ONCE(wqe->status == IB_WC_SUCCESS);
 			do_complete(qp, wqe);
 			rxe_qp_error(qp);
-			if (pkt)
-				free_pkt(pkt);
-			goto exit;
+			ret = -EAGAIN;
+			goto done;
 		}
 	}
 
-exit:
-	/* we come here if we are done with processing and want the task to
-	 * exit from the loop calling us
-	 */
-	WARN_ON_ONCE(skb);
-	rxe_drop_ref(qp);
-	return -EAGAIN;
-
 done:
-	/* we come here if we have processed a packet we want the task to call
-	 * us again to see if there is anything else to do
-	 */
-	WARN_ON_ONCE(skb);
+	if (pkt)
+		free_pkt(pkt);
 	rxe_drop_ref(qp);
-	return 0;
+
+	return ret;
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 0701bd1ffd1a..01662727dca0 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -407,14 +407,22 @@ int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb)
 	return 0;
 }
 
+/* fix up a send packet to match the packets
+ * received from UDP before looping them back
+ */
 void rxe_loopback(struct sk_buff *skb)
 {
+	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+
 	if (skb->protocol == htons(ETH_P_IP))
 		skb_pull(skb, sizeof(struct iphdr));
 	else
 		skb_pull(skb, sizeof(struct ipv6hdr));
 
-	rxe_rcv(skb);
+	if (WARN_ON(!ib_device_try_get(&pkt->rxe->ib_dev)))
+		kfree_skb(skb);
+	else
+		rxe_rcv(skb);
 }
 
 struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c
index 45d2f711bce2..7a49e27da23a 100644
--- a/drivers/infiniband/sw/rxe/rxe_recv.c
+++ b/drivers/infiniband/sw/rxe/rxe_recv.c
@@ -237,8 +237,6 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
 	struct rxe_mc_elem *mce;
 	struct rxe_qp *qp;
 	union ib_gid dgid;
-	struct sk_buff *per_qp_skb;
-	struct rxe_pkt_info *per_qp_pkt;
 	int err;
 
 	if (skb->protocol == htons(ETH_P_IP))
@@ -250,10 +248,15 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
 	/* lookup mcast group corresponding to mgid, takes a ref */
 	mcg = rxe_pool_get_key(&rxe->mc_grp_pool, &dgid);
 	if (!mcg)
-		goto err1;	/* mcast group not registered */
+		goto drop;	/* mcast group not registered */
 
 	spin_lock_bh(&mcg->mcg_lock);
 
+	/* this is unreliable datagram service so we let
+	 * failures to deliver a multicast packet to a
+	 * single QP happen and just move on and try
+	 * the rest of them on the list
+	 */
 	list_for_each_entry(mce, &mcg->qp_list, qp_list) {
 		qp = mce->qp;
 
@@ -266,39 +269,47 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
 		if (err)
 			continue;
 
-		/* for all but the last qp create a new clone of the
-		 * skb and pass to the qp. If an error occurs in the
-		 * checks for the last qp in the list we need to
-		 * free the skb since it hasn't been passed on to
-		 * rxe_rcv_pkt() which would free it later.
+		/* for all but the last QP create a new clone of the
+		 * skb and pass to the QP. Pass the original skb to
+		 * the last QP in the list.
 		 */
 		if (mce->qp_list.next != &mcg->qp_list) {
-			per_qp_skb = skb_clone(skb, GFP_ATOMIC);
-			if (WARN_ON(!ib_device_try_get(&rxe->ib_dev))) {
-				kfree_skb(per_qp_skb);
+			struct sk_buff *cskb;
+			struct rxe_pkt_info *cpkt;
+
+			cskb = skb_clone(skb, GFP_ATOMIC);
+			if (unlikely(!cskb))
 				continue;
+
+			if (WARN_ON(!ib_device_try_get(&rxe->ib_dev))) {
+				kfree_skb(cskb);
+				break;
 			}
+
+			cpkt = SKB_TO_PKT(cskb);
+			cpkt->qp = qp;
+			rxe_add_ref(qp);
+			rxe_rcv_pkt(cpkt, cskb);
 		} else {
-			per_qp_skb = skb;
-			/* show we have consumed the skb */
-			skb = NULL;
+			pkt->qp = qp;
+			rxe_add_ref(qp);
+			rxe_rcv_pkt(pkt, skb);
+			skb = NULL;	/* mark consumed */
 		}
-
-		if (unlikely(!per_qp_skb))
-			continue;
-
-		per_qp_pkt = SKB_TO_PKT(per_qp_skb);
-		per_qp_pkt->qp = qp;
-		rxe_add_ref(qp);
-		rxe_rcv_pkt(per_qp_pkt, per_qp_skb);
 	}
 
 	spin_unlock_bh(&mcg->mcg_lock);
 
 	rxe_drop_ref(mcg);	/* drop ref from rxe_pool_get_key. */
 
-err1:
-	/* free skb if not consumed */
+	if (likely(!skb))
+		return;
+
+	/* This only occurs if one of the checks fails on the last
+	 * QP in the list above
+	 */
+
+drop:
 	kfree_skb(skb);
 	ib_device_put(&rxe->ib_dev);
 }
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH for-next v3] RDMA/rxe: Fix ib_device reference counting (again)
  2021-03-04 19:20 [PATCH for-next v3] RDMA/rxe: Fix ib_device reference counting (again) Bob Pearson
@ 2021-03-05  3:10 ` Zhu Yanjun
  2021-03-05 18:02   ` Bob Pearson
  2021-03-05 18:20 ` Jason Gunthorpe
  1 sibling, 1 reply; 5+ messages in thread
From: Zhu Yanjun @ 2021-03-05  3:10 UTC (permalink / raw)
  To: Bob Pearson; +Cc: Jason Gunthorpe, RDMA mailing list

On Fri, Mar 5, 2021 at 3:23 AM Bob Pearson <rpearsonhpe@gmail.com> wrote:
>
> From: Bob Pearson <rpearsonhpe@gmail.com>
>
> Three errors occurred in the fix referenced below.
>
> 1) rxe_rcv_mcast_pkt() dropped a reference to ib_device when
> no error occurred causing an underflow on the reference counter.
> This code is cleaned up to be clearer and easier to read.
>
> 2) Extending the reference taken by rxe_get_dev_from_net() in
> rxe_udp_encap_recv() until each skb is freed was not matched by
> a reference in the loopback path resulting in underflows.
>
> 3) In rxe_comp.c in rxe_completer() the function free_pkt() did
> not clear skb which triggered a warning at done: and could possibly
> at exit: . The WARN_ONCE() calls are not actually needed.
> The call to free_pkt() is moved to the end to clearly show that
> all skbs are freed.

Where do these not-freed skb come from? Except these skb, are other
resources freed correctly?

IMHO, the root cause should be found and other resources should be also handled.
WARN_ON_ONCE() should be kept on exit and done.

So if some skb are not freed incorrectly, we can find these skb in the loop end.

Zhu Yanjun

>
> This patch fixes these errors.
>
> Fixes: 899aba891cab ("RDMA/rxe: Fix FIXME in rxe_udp_encap_recv()")
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> ---
> Version 3:
> V2 of this patch had spelling errors and style issues which are
> fixed in this version.
>
> Version 2:
> v1 of this patch incorrectly added a WARN_ON_ONCE in rxe_completer
> where it could be triggered for normal traffic. This version
> replaced that with a pr_warn located correctly.
>
> v1 of this patch placed a call to kfree_skb in an if statement
> that could trigger style warnings. This version cleans that up.
>
>  drivers/infiniband/sw/rxe/rxe_comp.c | 55 +++++++++++---------------
>  drivers/infiniband/sw/rxe/rxe_net.c  | 10 ++++-
>  drivers/infiniband/sw/rxe/rxe_recv.c | 59 +++++++++++++++++-----------
>  3 files changed, 67 insertions(+), 57 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
> index a8ac791a1bb9..17a361b8dbb1 100644
> --- a/drivers/infiniband/sw/rxe/rxe_comp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_comp.c
> @@ -547,6 +547,7 @@ int rxe_completer(void *arg)
>         struct sk_buff *skb = NULL;
>         struct rxe_pkt_info *pkt = NULL;
>         enum comp_state state;
> +       int ret = 0;
>
>         rxe_add_ref(qp);
>
> @@ -554,7 +555,8 @@ int rxe_completer(void *arg)
>             qp->req.state == QP_STATE_RESET) {
>                 rxe_drain_resp_pkts(qp, qp->valid &&
>                                     qp->req.state == QP_STATE_ERROR);
> -               goto exit;
> +               ret = -EAGAIN;
> +               goto done;
>         }
>
>         if (qp->comp.timeout) {
> @@ -564,8 +566,10 @@ int rxe_completer(void *arg)
>                 qp->comp.timeout_retry = 0;
>         }
>
> -       if (qp->req.need_retry)
> -               goto exit;
> +       if (qp->req.need_retry) {
> +               ret = -EAGAIN;
> +               goto done;
> +       }
>
>         state = COMPST_GET_ACK;
>
> @@ -636,8 +640,6 @@ int rxe_completer(void *arg)
>                         break;
>
>                 case COMPST_DONE:
> -                       if (pkt)
> -                               free_pkt(pkt);
>                         goto done;
>
>                 case COMPST_EXIT:
> @@ -660,7 +662,8 @@ int rxe_completer(void *arg)
>                             qp->qp_timeout_jiffies)
>                                 mod_timer(&qp->retrans_timer,
>                                           jiffies + qp->qp_timeout_jiffies);
> -                       goto exit;
> +                       ret = -EAGAIN;
> +                       goto done;
>
>                 case COMPST_ERROR_RETRY:
>                         /* we come here if the retry timer fired and we did
> @@ -672,18 +675,18 @@ int rxe_completer(void *arg)
>                          */
>
>                         /* there is nothing to retry in this case */
> -                       if (!wqe || (wqe->state == wqe_state_posted))
> -                               goto exit;
> +                       if (!wqe || (wqe->state == wqe_state_posted)) {
> +                               pr_warn("Retry attempted without a valid wqe\n");
> +                               ret = -EAGAIN;
> +                               goto done;
> +                       }
>
>                         /* if we've started a retry, don't start another
>                          * retry sequence, unless this is a timeout.
>                          */
>                         if (qp->comp.started_retry &&
> -                           !qp->comp.timeout_retry) {
> -                               if (pkt)
> -                                       free_pkt(pkt);
> +                           !qp->comp.timeout_retry)
>                                 goto done;
> -                       }
>
>                         if (qp->comp.retry_cnt > 0) {
>                                 if (qp->comp.retry_cnt != 7)
> @@ -704,8 +707,6 @@ int rxe_completer(void *arg)
>                                         qp->comp.started_retry = 1;
>                                         rxe_run_task(&qp->req.task, 0);
>                                 }
> -                               if (pkt)
> -                                       free_pkt(pkt);
>                                 goto done;
>
>                         } else {
> @@ -726,8 +727,8 @@ int rxe_completer(void *arg)
>                                 mod_timer(&qp->rnr_nak_timer,
>                                           jiffies + rnrnak_jiffies(aeth_syn(pkt)
>                                                 & ~AETH_TYPE_MASK));
> -                               free_pkt(pkt);
> -                               goto exit;
> +                               ret = -EAGAIN;
> +                               goto done;
>                         } else {
>                                 rxe_counter_inc(rxe,
>                                                 RXE_CNT_RNR_RETRY_EXCEEDED);
> @@ -740,25 +741,15 @@ int rxe_completer(void *arg)
>                         WARN_ON_ONCE(wqe->status == IB_WC_SUCCESS);
>                         do_complete(qp, wqe);
>                         rxe_qp_error(qp);
> -                       if (pkt)
> -                               free_pkt(pkt);
> -                       goto exit;
> +                       ret = -EAGAIN;
> +                       goto done;
>                 }
>         }
>
> -exit:
> -       /* we come here if we are done with processing and want the task to
> -        * exit from the loop calling us
> -        */
> -       WARN_ON_ONCE(skb);
> -       rxe_drop_ref(qp);
> -       return -EAGAIN;
> -
>  done:
> -       /* we come here if we have processed a packet we want the task to call
> -        * us again to see if there is anything else to do
> -        */
> -       WARN_ON_ONCE(skb);
> +       if (pkt)
> +               free_pkt(pkt);
>         rxe_drop_ref(qp);
> -       return 0;
> +
> +       return ret;
>  }
> diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
> index 0701bd1ffd1a..01662727dca0 100644
> --- a/drivers/infiniband/sw/rxe/rxe_net.c
> +++ b/drivers/infiniband/sw/rxe/rxe_net.c
> @@ -407,14 +407,22 @@ int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb)
>         return 0;
>  }
>
> +/* fix up a send packet to match the packets
> + * received from UDP before looping them back
> + */
>  void rxe_loopback(struct sk_buff *skb)
>  {
> +       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
> +
>         if (skb->protocol == htons(ETH_P_IP))
>                 skb_pull(skb, sizeof(struct iphdr));
>         else
>                 skb_pull(skb, sizeof(struct ipv6hdr));
>
> -       rxe_rcv(skb);
> +       if (WARN_ON(!ib_device_try_get(&pkt->rxe->ib_dev)))
> +               kfree_skb(skb);
> +       else
> +               rxe_rcv(skb);
>  }
>
>  struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
> diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c
> index 45d2f711bce2..7a49e27da23a 100644
> --- a/drivers/infiniband/sw/rxe/rxe_recv.c
> +++ b/drivers/infiniband/sw/rxe/rxe_recv.c
> @@ -237,8 +237,6 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
>         struct rxe_mc_elem *mce;
>         struct rxe_qp *qp;
>         union ib_gid dgid;
> -       struct sk_buff *per_qp_skb;
> -       struct rxe_pkt_info *per_qp_pkt;
>         int err;
>
>         if (skb->protocol == htons(ETH_P_IP))
> @@ -250,10 +248,15 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
>         /* lookup mcast group corresponding to mgid, takes a ref */
>         mcg = rxe_pool_get_key(&rxe->mc_grp_pool, &dgid);
>         if (!mcg)
> -               goto err1;      /* mcast group not registered */
> +               goto drop;      /* mcast group not registered */
>
>         spin_lock_bh(&mcg->mcg_lock);
>
> +       /* this is unreliable datagram service so we let
> +        * failures to deliver a multicast packet to a
> +        * single QP happen and just move on and try
> +        * the rest of them on the list
> +        */
>         list_for_each_entry(mce, &mcg->qp_list, qp_list) {
>                 qp = mce->qp;
>
> @@ -266,39 +269,47 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
>                 if (err)
>                         continue;
>
> -               /* for all but the last qp create a new clone of the
> -                * skb and pass to the qp. If an error occurs in the
> -                * checks for the last qp in the list we need to
> -                * free the skb since it hasn't been passed on to
> -                * rxe_rcv_pkt() which would free it later.
> +               /* for all but the last QP create a new clone of the
> +                * skb and pass to the QP. Pass the original skb to
> +                * the last QP in the list.
>                  */
>                 if (mce->qp_list.next != &mcg->qp_list) {
> -                       per_qp_skb = skb_clone(skb, GFP_ATOMIC);
> -                       if (WARN_ON(!ib_device_try_get(&rxe->ib_dev))) {
> -                               kfree_skb(per_qp_skb);
> +                       struct sk_buff *cskb;
> +                       struct rxe_pkt_info *cpkt;
> +
> +                       cskb = skb_clone(skb, GFP_ATOMIC);
> +                       if (unlikely(!cskb))
>                                 continue;
> +
> +                       if (WARN_ON(!ib_device_try_get(&rxe->ib_dev))) {
> +                               kfree_skb(cskb);
> +                               break;
>                         }
> +
> +                       cpkt = SKB_TO_PKT(cskb);
> +                       cpkt->qp = qp;
> +                       rxe_add_ref(qp);
> +                       rxe_rcv_pkt(cpkt, cskb);
>                 } else {
> -                       per_qp_skb = skb;
> -                       /* show we have consumed the skb */
> -                       skb = NULL;
> +                       pkt->qp = qp;
> +                       rxe_add_ref(qp);
> +                       rxe_rcv_pkt(pkt, skb);
> +                       skb = NULL;     /* mark consumed */
>                 }
> -
> -               if (unlikely(!per_qp_skb))
> -                       continue;
> -
> -               per_qp_pkt = SKB_TO_PKT(per_qp_skb);
> -               per_qp_pkt->qp = qp;
> -               rxe_add_ref(qp);
> -               rxe_rcv_pkt(per_qp_pkt, per_qp_skb);
>         }
>
>         spin_unlock_bh(&mcg->mcg_lock);
>
>         rxe_drop_ref(mcg);      /* drop ref from rxe_pool_get_key. */
>
> -err1:
> -       /* free skb if not consumed */
> +       if (likely(!skb))
> +               return;
> +
> +       /* This only occurs if one of the checks fails on the last
> +        * QP in the list above
> +        */
> +
> +drop:
>         kfree_skb(skb);
>         ib_device_put(&rxe->ib_dev);
>  }
> --
> 2.27.0
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH for-next v3] RDMA/rxe: Fix ib_device reference counting (again)
  2021-03-05  3:10 ` Zhu Yanjun
@ 2021-03-05 18:02   ` Bob Pearson
  0 siblings, 0 replies; 5+ messages in thread
From: Bob Pearson @ 2021-03-05 18:02 UTC (permalink / raw)
  To: Zhu Yanjun, linux-rdma; +Cc: Jason Gunthorpe


> 
> Where do these not-freed skb come from? Except these skb, are other
> resources freed correctly?
> 
> IMHO, the root cause should be found and other resources should be also handled.
> WARN_ON_ONCE() should be kept on exit and done.
> 
> So if some skb are not freed incorrectly, we can find these skb in the loop end.
> 
> Zhu Yanjun
> 

In the original code (before the patch that this patch is trying to fix) there was one path
through rxe_completer() that *could* have leaked an skb. We haven't been seeing that one though.
Every other path through the state machine exited after calling kfree_skb(skb). You can check
but you have to look at an older kernel.

The earlier "Fix FIXME" patch, which is upstream, collected the code related to freeing the skbs
into a subroutine called free_pkt(). This was an improvement but even though it calls
kfree_skb(skb) it didn't set the local variable skb to NULL. This allowed some bogus warnings
to show up as you have been demonstrating.

This patch "fix reference counting (again)" fixes both of these issues by getting rid of the
warning and moving the free_pkt() to the end of rxe_completer(). Now, after applying the
patch, the end of the subroutine reads

...
done:
	if (pkt)
		free_pkt(pkt);
	rxe_drop_ref(qp);

	return ret;
}

If you check, you can see there are no other return statements in the routine. (skb and pkt are
pointers to the same packet. the macro SKB_TO_PKT sets pkt = &skb->cb and PKT_TO_SKB sets
skb = container_of(...) so passing pkt is equivalent to passing skb.)

The only way out, therefore, frees a packet if there is one and we cannot be leaking skbs. The
free_pkt() routine handles all the other stuff that needs to be done when the skb is freed.
(The rxe_drop_ref(qp) drops the reference to QP taken at the top of the rxe_completer() routine.)

The default behavior when receiving an ack packet that isn't expected or perfectly correct is to
drop it. This includes ack packets received when the sender is retrying a request. The comment
after case COMPST_ERROR_RETRY: is misleading because you do have an ack packet when you get there
but it could be unexpected if no wqe is waiting for it. This was the path that before could have
lead to leaking an skb but that is now fixed.

As of now there are no remaining paths that can leak a packet and no reason to check that
skb == NULL. If it makes you happy you can clear it like this

	if (pkt)
		free_pkt(pkt);
	skb = NULL;
	WARN_ON_ONCE(skb);

but that would be a pointless waste of time.

Bob

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH for-next v3] RDMA/rxe: Fix ib_device reference counting (again)
  2021-03-04 19:20 [PATCH for-next v3] RDMA/rxe: Fix ib_device reference counting (again) Bob Pearson
  2021-03-05  3:10 ` Zhu Yanjun
@ 2021-03-05 18:20 ` Jason Gunthorpe
  2021-03-05 18:59   ` Bob Pearson
  1 sibling, 1 reply; 5+ messages in thread
From: Jason Gunthorpe @ 2021-03-05 18:20 UTC (permalink / raw)
  To: Bob Pearson; +Cc: zyjzyj2000, linux-rdma

On Thu, Mar 04, 2021 at 01:20:49PM -0600, Bob Pearson wrote:
> From: Bob Pearson <rpearsonhpe@gmail.com>
> 
> Three errors occurred in the fix referenced below.
> 
> 1) rxe_rcv_mcast_pkt() dropped a reference to ib_device when
> no error occurred causing an underflow on the reference counter.
> This code is cleaned up to be clearer and easier to read.
> 
> 2) Extending the reference taken by rxe_get_dev_from_net() in
> rxe_udp_encap_recv() until each skb is freed was not matched by
> a reference in the loopback path resulting in underflows.
> 
> 3) In rxe_comp.c in rxe_completer() the function free_pkt() did
> not clear skb which triggered a warning at done: and could possibly
> at exit: . The WARN_ONCE() calls are not actually needed.
> The call to free_pkt() is moved to the end to clearly show that
> all skbs are freed.
> 
> This patch fixes these errors.
> 
> Fixes: 899aba891cab ("RDMA/rxe: Fix FIXME in rxe_udp_encap_recv()")
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> Version 3:
> V2 of this patch had spelling errors and style issues which are
> fixed in this version.
> 
> Version 2:
> v1 of this patch incorrectly added a WARN_ON_ONCE in rxe_completer
> where it could be triggered for normal traffic. This version
> replaced that with a pr_warn located correctly.
> 
> v1 of this patch placed a call to kfree_skb in an if statement
> that could trigger style warnings. This version cleans that up.
> 
>  drivers/infiniband/sw/rxe/rxe_comp.c | 55 +++++++++++---------------
>  drivers/infiniband/sw/rxe/rxe_net.c  | 10 ++++-
>  drivers/infiniband/sw/rxe/rxe_recv.c | 59 +++++++++++++++++-----------
>  3 files changed, 67 insertions(+), 57 deletions(-)

I split this into three patches for-rc as is required by Linus.

It looks reasonable to me and the reorganizing make sense

Thanks,
Jason

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH for-next v3] RDMA/rxe: Fix ib_device reference counting (again)
  2021-03-05 18:20 ` Jason Gunthorpe
@ 2021-03-05 18:59   ` Bob Pearson
  0 siblings, 0 replies; 5+ messages in thread
From: Bob Pearson @ 2021-03-05 18:59 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: zyjzyj2000, linux-rdma

On 3/5/21 12:20 PM, Jason Gunthorpe wrote:
> On Thu, Mar 04, 2021 at 01:20:49PM -0600, Bob Pearson wrote:
>> From: Bob Pearson <rpearsonhpe@gmail.com>
>>
>> Three errors occurred in the fix referenced below.
>>
>> 1) rxe_rcv_mcast_pkt() dropped a reference to ib_device when
>> no error occurred causing an underflow on the reference counter.
>> This code is cleaned up to be clearer and easier to read.
>>
>> 2) Extending the reference taken by rxe_get_dev_from_net() in
>> rxe_udp_encap_recv() until each skb is freed was not matched by
>> a reference in the loopback path resulting in underflows.
>>
>> 3) In rxe_comp.c in rxe_completer() the function free_pkt() did
>> not clear skb which triggered a warning at done: and could possibly
>> at exit: . The WARN_ONCE() calls are not actually needed.
>> The call to free_pkt() is moved to the end to clearly show that
>> all skbs are freed.
>>
>> This patch fixes these errors.
>>
>> Fixes: 899aba891cab ("RDMA/rxe: Fix FIXME in rxe_udp_encap_recv()")
>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>> Version 3:
>> V2 of this patch had spelling errors and style issues which are
>> fixed in this version.
>>
>> Version 2:
>> v1 of this patch incorrectly added a WARN_ON_ONCE in rxe_completer
>> where it could be triggered for normal traffic. This version
>> replaced that with a pr_warn located correctly.
>>
>> v1 of this patch placed a call to kfree_skb in an if statement
>> that could trigger style warnings. This version cleans that up.
>>
>>  drivers/infiniband/sw/rxe/rxe_comp.c | 55 +++++++++++---------------
>>  drivers/infiniband/sw/rxe/rxe_net.c  | 10 ++++-
>>  drivers/infiniband/sw/rxe/rxe_recv.c | 59 +++++++++++++++++-----------
>>  3 files changed, 67 insertions(+), 57 deletions(-)
> 
> I split this into three patches for-rc as is required by Linus.
> 
> It looks reasonable to me and the reorganizing make sense
> 
> Thanks,
> Jason
> 
Thanks. Sorry this took so long to resolve.

bob

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-03-05 19:00 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-04 19:20 [PATCH for-next v3] RDMA/rxe: Fix ib_device reference counting (again) Bob Pearson
2021-03-05  3:10 ` Zhu Yanjun
2021-03-05 18:02   ` Bob Pearson
2021-03-05 18:20 ` Jason Gunthorpe
2021-03-05 18:59   ` Bob Pearson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.