linux-rdma.vger.kernel.org archive mirror
* [PATCH] avoid race condition between start_xmit and cm_rep_handler
@ 2018-08-16  1:11 Aaron Knister
  2018-08-16  1:18 ` Aaron Knister
  2018-08-16  4:54 ` Leon Romanovsky
  0 siblings, 2 replies; 7+ messages in thread
From: Aaron Knister @ 2018-08-16  1:11 UTC (permalink / raw)
  To: linux-rdma; +Cc: stable, Ira Weiny, John Fleck

Inside start_xmit(), the check that the connection is up and the
queueing of packets for later transmission are not atomic. This leaves
a window in which cm_rep_handler can run, bring the connection up and
dequeue the pending packets, leaving packets subsequently queued by
start_xmit() sitting on neigh->queue until they're dropped when the
connection is torn down. This only applies to connected mode. These
dropped packets can really upset TCP, for example, and cause
multi-minute delays in transmission for open connections.

I've got a reproducer available if it's needed.

Here's the code in start_xmit where we check to see if the connection
is up:

       if (ipoib_cm_get(neigh)) {
               if (ipoib_cm_up(neigh)) {
                       ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
                       goto unref;
               }
       }

The race occurs if cm_rep_handler execution occurs after the above
connection check (specifically if it gets to the point where it acquires
priv->lock to dequeue pending skb's) but before the below code snippet
in start_xmit where packets are queued.

       if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
               push_pseudo_header(skb, phdr->hwaddr);
               spin_lock_irqsave(&priv->lock, flags);
               __skb_queue_tail(&neigh->queue, skb);
               spin_unlock_irqrestore(&priv->lock, flags);
       } else {
               ++dev->stats.tx_dropped;
               dev_kfree_skb_any(skb);
       }
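
For context, the dequeue path in cm_rep_handler (ipoib_cm.c) that
races with this looks roughly like the following (trimmed and lightly
paraphrased, so the identifiers may be slightly off):

       spin_lock_irq(&priv->lock);
       set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
       if (p->neigh)
               while ((skb = __skb_dequeue(&p->neigh->queue)))
                       __skb_queue_tail(&skqueue, skb);
       spin_unlock_irq(&priv->lock);

Once this has run, nothing dequeues from neigh->queue again, so
anything start_xmit() queues afterwards just sits there until the
connection is torn down.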

The patch re-checks ipoib_cm_up with priv->lock held to avoid this
race condition. Since odds are the connection will be up most of the
time, we don't hold the lock for the first check, which avoids a
slowdown from unnecessary locking for the majority of the packets
transmitted during the connection's life.

Cc: stable@vger.kernel.org
Tested-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 53 +++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 26cde95b..529dbeab 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1093,6 +1093,34 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
+static void defer_neigh_skb(struct sk_buff *skb,  struct net_device *dev,
+			    struct ipoib_neigh *neigh,
+			    struct ipoib_pseudo_header *phdr,
+			    unsigned long *flags)
+{
+	struct ipoib_dev_priv *priv = ipoib_priv(dev);
+	unsigned long local_flags;
+	int acquire_priv_lock = 0;
+
+	/* Passing in pointer to spin_lock flags indicates spin lock
+	 * already acquired so we don't need to acquire the priv lock */
+	if (flags == NULL) {
+		flags = &local_flags;
+		acquire_priv_lock = 1;
+	}
+
+	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+		push_pseudo_header(skb, phdr->hwaddr);
+		if (acquire_priv_lock)
+			spin_lock_irqsave(&priv->lock, *flags);
+		__skb_queue_tail(&neigh->queue, skb);
+		spin_unlock_irqrestore(&priv->lock, *flags);
+	} else {
+		++dev->stats.tx_dropped;
+		dev_kfree_skb_any(skb);
+	}
+}
+
 static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -1160,6 +1188,21 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
 			goto unref;
 		}
+		/*
+		 * Re-check ipoib_cm_up with priv->lock held to avoid
+		 * race condition between start_xmit and skb_dequeue in
+		 * cm_rep_handler. Since odds are the conn should be up
+		 * most of the time, we don't hold the lock for the
+		 * first check above
+		 */
+		spin_lock_irqsave(&priv->lock, flags);
+		if (ipoib_cm_up(neigh)) {
+			spin_unlock_irqrestore(&priv->lock, flags);
+			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+		} else  {
+			defer_neigh_skb(skb, dev, neigh, phdr, &flags);
+		}
+		goto unref;
 	} else if (neigh->ah && neigh->ah->valid) {
 		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
 						IPOIB_QPN(phdr->hwaddr));
@@ -1168,15 +1211,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		neigh_refresh_path(neigh, phdr->hwaddr, dev);
 	}
 
-	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
-		push_pseudo_header(skb, phdr->hwaddr);
-		spin_lock_irqsave(&priv->lock, flags);
-		__skb_queue_tail(&neigh->queue, skb);
-		spin_unlock_irqrestore(&priv->lock, flags);
-	} else {
-		++dev->stats.tx_dropped;
-		dev_kfree_skb_any(skb);
-	}
+	defer_neigh_skb(skb, dev, neigh, phdr, NULL);
 
 unref:
 	ipoib_neigh_put(neigh);
-- 
2.12.3


* Re: [PATCH] avoid race condition between start_xmit and cm_rep_handler
  2018-08-16  1:11 [PATCH] avoid race condition between start_xmit and cm_rep_handler Aaron Knister
@ 2018-08-16  1:18 ` Aaron Knister
  2018-08-16  4:54 ` Leon Romanovsky
  1 sibling, 0 replies; 7+ messages in thread
From: Aaron Knister @ 2018-08-16  1:18 UTC (permalink / raw)
  To: linux-rdma; +Cc: stable, Ira Weiny, John Fleck

Sorry about the noise. I'm a total n00b when it comes to git send-email, and it took me several attempts to get the correct From address. I also missed the Cc to stable in the sign-off section (and somewhere in there was a test message to myself where git send-email picked up the Cc to stable after I added it). The e-mail I'm replying to is what I'd actually like to submit as a patch. Please disregard the others.

-Aaron

On 8/15/18 9:11 PM, Aaron Knister wrote:
> Inside start_xmit(), the check that the connection is up and the
> queueing of packets for later transmission are not atomic. This leaves
> a window in which cm_rep_handler can run, bring the connection up and
> dequeue the pending packets, leaving packets subsequently queued by
> start_xmit() sitting on neigh->queue until they're dropped when the
> connection is torn down. This only applies to connected mode. These
> dropped packets can really upset TCP, for example, and cause
> multi-minute delays in transmission for open connections.
> 
> I've got a reproducer available if it's needed.
> 
> Here's the code in start_xmit where we check to see if the connection
> is up:
> 
>         if (ipoib_cm_get(neigh)) {
>                 if (ipoib_cm_up(neigh)) {
>                         ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>                         goto unref;
>                 }
>         }
> 
> The race occurs if cm_rep_handler execution occurs after the above
> connection check (specifically if it gets to the point where it acquires
> priv->lock to dequeue pending skb's) but before the below code snippet
> in start_xmit where packets are queued.
> 
>         if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
>                 push_pseudo_header(skb, phdr->hwaddr);
>                 spin_lock_irqsave(&priv->lock, flags);
>                 __skb_queue_tail(&neigh->queue, skb);
>                 spin_unlock_irqrestore(&priv->lock, flags);
>         } else {
>                 ++dev->stats.tx_dropped;
>                 dev_kfree_skb_any(skb);
>         }
> 
> The patch re-checks ipoib_cm_up with priv->lock held to avoid this
> race condition. Since odds are the connection will be up most of the
> time, we don't hold the lock for the first check, which avoids a
> slowdown from unnecessary locking for the majority of the packets
> transmitted during the connection's life.
> 
> Cc: stable@vger.kernel.org
> Tested-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
> ---
>   drivers/infiniband/ulp/ipoib/ipoib_main.c | 53 +++++++++++++++++++++++++------
>   1 file changed, 44 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> index 26cde95b..529dbeab 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -1093,6 +1093,34 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
>   	spin_unlock_irqrestore(&priv->lock, flags);
>   }
>   
> +static void defer_neigh_skb(struct sk_buff *skb,  struct net_device *dev,
> +			    struct ipoib_neigh *neigh,
> +			    struct ipoib_pseudo_header *phdr,
> +			    unsigned long *flags)
> +{
> +	struct ipoib_dev_priv *priv = ipoib_priv(dev);
> +	unsigned long local_flags;
> +	int acquire_priv_lock = 0;
> +
> +	/* Passing in pointer to spin_lock flags indicates spin lock
> +	 * already acquired so we don't need to acquire the priv lock */
> +	if (flags == NULL) {
> +		flags = &local_flags;
> +		acquire_priv_lock = 1;
> +	}
> +
> +	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
> +		push_pseudo_header(skb, phdr->hwaddr);
> +		if (acquire_priv_lock)
> +			spin_lock_irqsave(&priv->lock, *flags);
> +		__skb_queue_tail(&neigh->queue, skb);
> +		spin_unlock_irqrestore(&priv->lock, *flags);
> +	} else {
> +		++dev->stats.tx_dropped;
> +		dev_kfree_skb_any(skb);
> +	}
> +}
> +
>   static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>   {
>   	struct ipoib_dev_priv *priv = ipoib_priv(dev);
> @@ -1160,6 +1188,21 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>   			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>   			goto unref;
>   		}
> +		/*
> +		 * Re-check ipoib_cm_up with priv->lock held to avoid
> +		 * race condition between start_xmit and skb_dequeue in
> +		 * cm_rep_handler. Since odds are the conn should be up
> +		 * most of the time, we don't hold the lock for the
> +		 * first check above
> +		 */
> +		spin_lock_irqsave(&priv->lock, flags);
> +		if (ipoib_cm_up(neigh)) {
> +			spin_unlock_irqrestore(&priv->lock, flags);
> +			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
> +		} else  {
> +			defer_neigh_skb(skb, dev, neigh, phdr, &flags);
> +		}
> +		goto unref;
>   	} else if (neigh->ah && neigh->ah->valid) {
>   		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
>   						IPOIB_QPN(phdr->hwaddr));
> @@ -1168,15 +1211,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>   		neigh_refresh_path(neigh, phdr->hwaddr, dev);
>   	}
>   
> -	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
> -		push_pseudo_header(skb, phdr->hwaddr);
> -		spin_lock_irqsave(&priv->lock, flags);
> -		__skb_queue_tail(&neigh->queue, skb);
> -		spin_unlock_irqrestore(&priv->lock, flags);
> -	} else {
> -		++dev->stats.tx_dropped;
> -		dev_kfree_skb_any(skb);
> -	}
> +	defer_neigh_skb(skb, dev, neigh, phdr, NULL);
>   
>   unref:
>   	ipoib_neigh_put(neigh);
> 

-- 
Aaron Knister
NASA Center for Climate Simulation (Code 606.2)
Goddard Space Flight Center
(301) 286-2776


* Re: [PATCH] avoid race condition between start_xmit and cm_rep_handler
  2018-08-16  1:11 [PATCH] avoid race condition between start_xmit and cm_rep_handler Aaron Knister
  2018-08-16  1:18 ` Aaron Knister
@ 2018-08-16  4:54 ` Leon Romanovsky
  2018-08-16 13:04   ` Aaron Knister
  1 sibling, 1 reply; 7+ messages in thread
From: Leon Romanovsky @ 2018-08-16  4:54 UTC (permalink / raw)
  To: Aaron Knister; +Cc: linux-rdma, stable, Ira Weiny, John Fleck

On Wed, Aug 15, 2018 at 09:11:49PM -0400, Aaron Knister wrote:
> Inside start_xmit(), the check that the connection is up and the
> queueing of packets for later transmission are not atomic. This leaves
> a window in which cm_rep_handler can run, bring the connection up and
> dequeue the pending packets, leaving packets subsequently queued by
> start_xmit() sitting on neigh->queue until they're dropped when the
> connection is torn down. This only applies to connected mode. These
> dropped packets can really upset TCP, for example, and cause
> multi-minute delays in transmission for open connections.
>
> I've got a reproducer available if it's needed.
>
> Here's the code in start_xmit where we check to see if the connection
> is up:
>
>        if (ipoib_cm_get(neigh)) {
>                if (ipoib_cm_up(neigh)) {
>                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>                        goto unref;
>                }
>        }
>
> The race occurs if cm_rep_handler execution occurs after the above
> connection check (specifically if it gets to the point where it acquires
> priv->lock to dequeue pending skb's) but before the below code snippet
> in start_xmit where packets are queued.
>
>        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
>                push_pseudo_header(skb, phdr->hwaddr);
>                spin_lock_irqsave(&priv->lock, flags);
>                __skb_queue_tail(&neigh->queue, skb);
>                spin_unlock_irqrestore(&priv->lock, flags);
>        } else {
>                ++dev->stats.tx_dropped;
>                dev_kfree_skb_any(skb);
>        }
>
> The patch re-checks ipoib_cm_up with priv->lock held to avoid this
> race condition. Since odds are the connection will be up most of the
> time, we don't hold the lock for the first check, which avoids a
> slowdown from unnecessary locking for the majority of the packets
> transmitted during the connection's life.
>
> Cc: stable@vger.kernel.org
> Tested-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
> ---

Sorry, but no, mainly for two reasons:
1. Don't lock/unlock in different functions.
2. Don't create unbalanced number of lock/unlocks.

Thanks


* Re: [PATCH] avoid race condition between start_xmit and cm_rep_handler
  2018-08-16  4:54 ` Leon Romanovsky
@ 2018-08-16 13:04   ` Aaron Knister
  0 siblings, 0 replies; 7+ messages in thread
From: Aaron Knister @ 2018-08-16 13:04 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: linux-rdma, stable, Ira Weiny, John Fleck



On 8/16/18 12:54 AM, Leon Romanovsky wrote:
> On Wed, Aug 15, 2018 at 09:11:49PM -0400, Aaron Knister wrote:
>> Inside start_xmit(), the check that the connection is up and the
>> queueing of packets for later transmission are not atomic. This leaves
>> a window in which cm_rep_handler can run, bring the connection up and
>> dequeue the pending packets, leaving packets subsequently queued by
>> start_xmit() sitting on neigh->queue until they're dropped when the
>> connection is torn down. This only applies to connected mode. These
>> dropped packets can really upset TCP, for example, and cause
>> multi-minute delays in transmission for open connections.
>>
>> I've got a reproducer available if it's needed.
>>
>> Here's the code in start_xmit where we check to see if the connection
>> is up:
>>
>>         if (ipoib_cm_get(neigh)) {
>>                 if (ipoib_cm_up(neigh)) {
>>                         ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>>                         goto unref;
>>                 }
>>         }
>>
>> The race occurs if cm_rep_handler execution occurs after the above
>> connection check (specifically if it gets to the point where it acquires
>> priv->lock to dequeue pending skb's) but before the below code snippet
>> in start_xmit where packets are queued.
>>
>>         if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
>>                 push_pseudo_header(skb, phdr->hwaddr);
>>                 spin_lock_irqsave(&priv->lock, flags);
>>                 __skb_queue_tail(&neigh->queue, skb);
>>                 spin_unlock_irqrestore(&priv->lock, flags);
>>         } else {
>>                 ++dev->stats.tx_dropped;
>>                 dev_kfree_skb_any(skb);
>>         }
>>
>> The patch re-checks ipoib_cm_up with priv->lock held to avoid this
>> race condition. Since odds are the connection will be up most of the
>> time, we don't hold the lock for the first check, which avoids a
>> slowdown from unnecessary locking for the majority of the packets
>> transmitted during the connection's life.
>>
>> Cc: stable@vger.kernel.org
>> Tested-by: Ira Weiny <ira.weiny@intel.com>
>> Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
>> ---
> 
> Sorry, but no mainly for two reasons:
> 1. Don't lock/unlock in different functions.
> 2. Don't create unbalanced number of lock/unlocks.
> 
> Thanks
> 

Thanks, Leon. I'm on board with not locking/unlocking between functions. That felt a little weird, and I think I can work around it. I'm curious, though, about the unbalanced number of lock/unlocks, because I don't see that when I look at the patch. Could you help me understand your concern? That said, this part did strike me as appearing unbalanced:

+		spin_lock_irqsave(&priv->lock, flags);
+		if (ipoib_cm_up(neigh)) {
+			spin_unlock_irqrestore(&priv->lock, flags);
+			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+		} else  {
+			defer_neigh_skb(skb, dev, neigh, phdr, &flags);
+		}

but that call to defer_neigh_skb() does perform the spin_unlock_irqrestore(), because &flags is passed in. It's not obvious, though, which is probably an issue in itself. I'm trying to balance holding priv->lock only where absolutely necessary against not typing out the same chunk of code twice with subtle differences.
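
To make the re-work concrete, here's roughly the shape I have in mind
for the start_xmit() side -- one lock/unlock pair, all within the same
function (untested sketch, not the final patch):

		spin_lock_irqsave(&priv->lock, flags);
		if (ipoib_cm_up(neigh)) {
			spin_unlock_irqrestore(&priv->lock, flags);
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
		} else if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
			push_pseudo_header(skb, phdr->hwaddr);
			__skb_queue_tail(&neigh->queue, skb);
			spin_unlock_irqrestore(&priv->lock, flags);
		} else {
			spin_unlock_irqrestore(&priv->lock, flags);
			++dev->stats.tx_dropped;
			dev_kfree_skb_any(skb);
		}
		goto unref;

Every branch takes the lock once and releases it once, so the pairing
stays local and balanced.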

I'll re-work this and re-submit.

-Aaron

-- 
Aaron Knister
NASA Center for Climate Simulation (Code 606.2)
Goddard Space Flight Center
(301) 286-2776



* Re: [PATCH] avoid race condition between start_xmit and cm_rep_handler
  2018-08-16  0:37 Aaron S. Knister
  2018-08-16  1:24 ` Weiny, Ira
@ 2018-08-16 22:27 ` Jason Gunthorpe
  1 sibling, 0 replies; 7+ messages in thread
From: Jason Gunthorpe @ 2018-08-16 22:27 UTC (permalink / raw)
  To: Aaron S. Knister; +Cc: linux-rdma, stable, Ira Weiny, John Fleck

On Wed, Aug 15, 2018 at 08:37:22PM -0400, Aaron S. Knister wrote:
> Inside start_xmit(), the check that the connection is up and the
> queueing of packets for later transmission are not atomic. This leaves
> a window in which cm_rep_handler can run, bring the connection up and
> dequeue the pending packets, leaving packets subsequently queued by
> start_xmit() sitting on neigh->queue until they're dropped when the
> connection is torn down. This only applies to connected mode. These
> dropped packets can really upset TCP, for example, and cause
> multi-minute delays in transmission for open connections.
> 
> I've got a reproducer available if it's needed.
> 
> Here's the code in start_xmit where we check to see if the connection
> is up:
> 
>        if (ipoib_cm_get(neigh)) {
>                if (ipoib_cm_up(neigh)) {
>                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>                        goto unref;
>                }
>        }

Agree with Leon on the locking. Find a more elegant way to write
this.

> The race occurs if cm_rep_handler execution occurs after the above
> connection check (specifically if it gets to the point where it acquires
> priv->lock to dequeue pending skb's) but before the below code snippet
> in start_xmit where packets are queued.
> 
>        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
>                push_pseudo_header(skb, phdr->hwaddr);
>                spin_lock_irqsave(&priv->lock, flags);
>                __skb_queue_tail(&neigh->queue, skb);
>                spin_unlock_irqrestore(&priv->lock, flags);
>        } else {
>                ++dev->stats.tx_dropped;
>                dev_kfree_skb_any(skb);
>        }

Somehow I think the spinlock should be held across the skb_queue_len as
well. Right?
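
i.e., something more along the lines of (untested, just to illustrate):

	spin_lock_irqsave(&priv->lock, flags);
	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		push_pseudo_header(skb, phdr->hwaddr);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
	} else {
		spin_unlock_irqrestore(&priv->lock, flags);
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}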

I wonder if the 'neigh->ah' has the same racing problem and needs the
same fixing:

	} else if (neigh->ah) {
		neigh_refresh_path(neigh, phdr->hwaddr, dev);
	}

??

Have a feeling that needs a READ_ONCE to be correct...
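
Something along these lines, perhaps (completely untested, just to
show the READ_ONCE idea):

	struct ipoib_ah *ah = READ_ONCE(neigh->ah);

	if (ah && ah->valid) {
		ah->last_send = rn->send(dev, skb, ah->ah,
					 IPOIB_QPN(phdr->hwaddr));
		goto unref;
	}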
 
Jason


* RE: [PATCH] avoid race condition between start_xmit and cm_rep_handler
  2018-08-16  0:37 Aaron S. Knister
@ 2018-08-16  1:24 ` Weiny, Ira
  2018-08-16 22:27 ` Jason Gunthorpe
  1 sibling, 0 replies; 7+ messages in thread
From: Weiny, Ira @ 2018-08-16  1:24 UTC (permalink / raw)
  To: Aaron S. Knister, linux-rdma; +Cc: stable, Fleck, John

Aaron and I were working on this, and it was sent from an incorrect email address and without the stable Cc in the commit message.

I have removed it from Patchwork, and Aaron has resent the patch.

Please disregard. Thanks,
Ira

> -----Original Message-----
> From: Aaron S. Knister [mailto:aknister@discover.nccs.nasa.gov]
> Sent: Wednesday, August 15, 2018 5:37 PM
> To: linux-rdma@vger.kernel.org
> Cc: stable@vger.kernel.org; Weiny, Ira <ira.weiny@intel.com>; Fleck, John
> <john.fleck@intel.com>
> Subject: [PATCH] avoid race condition between start_xmit and
> cm_rep_handler
> 
> Inside start_xmit(), the check that the connection is up and the
> queueing of packets for later transmission are not atomic. This leaves
> a window in which cm_rep_handler can run, bring the connection up and
> dequeue the pending packets, leaving packets subsequently queued by
> start_xmit() sitting on neigh->queue until they're dropped when the
> connection is torn down. This only applies to connected mode. These
> dropped packets can really upset TCP, for example, and cause
> multi-minute delays in transmission for open connections.
> 
> I've got a reproducer available if it's needed.
> 
> Here's the code in start_xmit where we check to see if the connection is up:
> 
>        if (ipoib_cm_get(neigh)) {
>                if (ipoib_cm_up(neigh)) {
>                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>                        goto unref;
>                }
>        }
> 
> The race occurs if cm_rep_handler execution occurs after the above
> connection check (specifically if it gets to the point where it acquires
> priv->lock to dequeue pending skb's) but before the below code snippet
> in start_xmit where packets are queued.
> 
>        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
>                push_pseudo_header(skb, phdr->hwaddr);
>                spin_lock_irqsave(&priv->lock, flags);
>                __skb_queue_tail(&neigh->queue, skb);
>                spin_unlock_irqrestore(&priv->lock, flags);
>        } else {
>                ++dev->stats.tx_dropped;
>                dev_kfree_skb_any(skb);
>        }
> 
> The patch re-checks ipoib_cm_up with priv->lock held to avoid this
> race condition. Since odds are the connection will be up most of the
> time, we don't hold the lock for the first check, which avoids a
> slowdown from unnecessary locking for the majority of the packets
> transmitted during the connection's life.
> 
> Tested-by: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
> ---
>  drivers/infiniband/ulp/ipoib/ipoib_main.c | 53 +++++++++++++++++++++++++------
>  1 file changed, 44 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> index 26cde95b..529dbeab 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -1093,6 +1093,34 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
>  	spin_unlock_irqrestore(&priv->lock, flags);
>  }
> 
> +static void defer_neigh_skb(struct sk_buff *skb,  struct net_device *dev,
> +			    struct ipoib_neigh *neigh,
> +			    struct ipoib_pseudo_header *phdr,
> +			    unsigned long *flags)
> +{
> +	struct ipoib_dev_priv *priv = ipoib_priv(dev);
> +	unsigned long local_flags;
> +	int acquire_priv_lock = 0;
> +
> +	/* Passing in pointer to spin_lock flags indicates spin lock
> +	 * already acquired so we don't need to acquire the priv lock */
> +	if (flags == NULL) {
> +		flags = &local_flags;
> +		acquire_priv_lock = 1;
> +	}
> +
> +	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
> +		push_pseudo_header(skb, phdr->hwaddr);
> +		if (acquire_priv_lock)
> +			spin_lock_irqsave(&priv->lock, *flags);
> +		__skb_queue_tail(&neigh->queue, skb);
> +		spin_unlock_irqrestore(&priv->lock, *flags);
> +	} else {
> +		++dev->stats.tx_dropped;
> +		dev_kfree_skb_any(skb);
> +	}
> +}
> +
>  static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>  	struct ipoib_dev_priv *priv = ipoib_priv(dev);
> @@ -1160,6 +1188,21 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
>  			goto unref;
>  		}
> +		/*
> +		 * Re-check ipoib_cm_up with priv->lock held to avoid
> +		 * race condition between start_xmit and skb_dequeue in
> +		 * cm_rep_handler. Since odds are the conn should be up
> +		 * most of the time, we don't hold the lock for the
> +		 * first check above
> +		 */
> +		spin_lock_irqsave(&priv->lock, flags);
> +		if (ipoib_cm_up(neigh)) {
> +			spin_unlock_irqrestore(&priv->lock, flags);
> +			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
> +		} else  {
> +			defer_neigh_skb(skb, dev, neigh, phdr, &flags);
> +		}
> +		goto unref;
>  	} else if (neigh->ah && neigh->ah->valid) {
>  		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
>  						IPOIB_QPN(phdr->hwaddr));
> @@ -1168,15 +1211,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  		neigh_refresh_path(neigh, phdr->hwaddr, dev);
>  	}
> 
> -	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
> -		push_pseudo_header(skb, phdr->hwaddr);
> -		spin_lock_irqsave(&priv->lock, flags);
> -		__skb_queue_tail(&neigh->queue, skb);
> -		spin_unlock_irqrestore(&priv->lock, flags);
> -	} else {
> -		++dev->stats.tx_dropped;
> -		dev_kfree_skb_any(skb);
> -	}
> +	defer_neigh_skb(skb, dev, neigh, phdr, NULL);
> 
>  unref:
>  	ipoib_neigh_put(neigh);
> --
> 2.12.3


* [PATCH] avoid race condition between start_xmit and cm_rep_handler
@ 2018-08-16  0:37 Aaron S. Knister
  2018-08-16  1:24 ` Weiny, Ira
  2018-08-16 22:27 ` Jason Gunthorpe
  0 siblings, 2 replies; 7+ messages in thread
From: Aaron S. Knister @ 2018-08-16  0:37 UTC (permalink / raw)
  To: linux-rdma; +Cc: stable, Ira Weiny, John Fleck

Inside start_xmit(), the check that the connection is up and the
queueing of packets for later transmission are not atomic. This leaves
a window in which cm_rep_handler can run, bring the connection up and
dequeue the pending packets, leaving packets subsequently queued by
start_xmit() sitting on neigh->queue until they're dropped when the
connection is torn down. This only applies to connected mode. These
dropped packets can really upset TCP, for example, and cause
multi-minute delays in transmission for open connections.

I've got a reproducer available if it's needed.

Here's the code in start_xmit where we check to see if the connection
is up:

       if (ipoib_cm_get(neigh)) {
               if (ipoib_cm_up(neigh)) {
                       ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
                       goto unref;
               }
       }

The race occurs if cm_rep_handler execution occurs after the above
connection check (specifically if it gets to the point where it acquires
priv->lock to dequeue pending skb's) but before the below code snippet
in start_xmit where packets are queued.

       if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
               push_pseudo_header(skb, phdr->hwaddr);
               spin_lock_irqsave(&priv->lock, flags);
               __skb_queue_tail(&neigh->queue, skb);
               spin_unlock_irqrestore(&priv->lock, flags);
       } else {
               ++dev->stats.tx_dropped;
               dev_kfree_skb_any(skb);
       }

The patch re-checks ipoib_cm_up with priv->lock held to avoid this
race condition. Since odds are the connection will be up most of the
time, we don't hold the lock for the first check, which avoids a
slowdown from unnecessary locking for the majority of the packets
transmitted during the connection's life.

Tested-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 53 +++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 26cde95b..529dbeab 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1093,6 +1093,34 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
+static void defer_neigh_skb(struct sk_buff *skb,  struct net_device *dev,
+			    struct ipoib_neigh *neigh,
+			    struct ipoib_pseudo_header *phdr,
+			    unsigned long *flags)
+{
+	struct ipoib_dev_priv *priv = ipoib_priv(dev);
+	unsigned long local_flags;
+	int acquire_priv_lock = 0;
+
+	/* Passing in pointer to spin_lock flags indicates spin lock
+	 * already acquired so we don't need to acquire the priv lock */
+	if (flags == NULL) {
+		flags = &local_flags;
+		acquire_priv_lock = 1;
+	}
+
+	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+		push_pseudo_header(skb, phdr->hwaddr);
+		if (acquire_priv_lock)
+			spin_lock_irqsave(&priv->lock, *flags);
+		__skb_queue_tail(&neigh->queue, skb);
+		spin_unlock_irqrestore(&priv->lock, *flags);
+	} else {
+		++dev->stats.tx_dropped;
+		dev_kfree_skb_any(skb);
+	}
+}
+
 static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -1160,6 +1188,21 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
 			goto unref;
 		}
+		/*
+		 * Re-check ipoib_cm_up with priv->lock held to avoid
+		 * race condition between start_xmit and skb_dequeue in
+		 * cm_rep_handler. Since odds are the conn should be up
+		 * most of the time, we don't hold the lock for the
+		 * first check above
+		 */
+		spin_lock_irqsave(&priv->lock, flags);
+		if (ipoib_cm_up(neigh)) {
+			spin_unlock_irqrestore(&priv->lock, flags);
+			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+		} else  {
+			defer_neigh_skb(skb, dev, neigh, phdr, &flags);
+		}
+		goto unref;
 	} else if (neigh->ah && neigh->ah->valid) {
 		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
 						IPOIB_QPN(phdr->hwaddr));
@@ -1168,15 +1211,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		neigh_refresh_path(neigh, phdr->hwaddr, dev);
 	}
 
-	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
-		push_pseudo_header(skb, phdr->hwaddr);
-		spin_lock_irqsave(&priv->lock, flags);
-		__skb_queue_tail(&neigh->queue, skb);
-		spin_unlock_irqrestore(&priv->lock, flags);
-	} else {
-		++dev->stats.tx_dropped;
-		dev_kfree_skb_any(skb);
-	}
+	defer_neigh_skb(skb, dev, neigh, phdr, NULL);
 
 unref:
 	ipoib_neigh_put(neigh);
-- 
2.12.3

