linux-ppp.vger.kernel.org archive mirror
* [PATCH 1/1] ppp: Fix one deadlock issue of PPP when send frame
@ 2016-08-16 11:05 fgao
  2016-08-16 11:36 ` Feng Gao
  2016-08-19 15:16 ` [PATCH 1/1] ppp: Fix one deadlock issue of PPP when reentrant fgao
  0 siblings, 2 replies; 10+ messages in thread
From: fgao @ 2016-08-16 11:05 UTC (permalink / raw)
  To: paulus, linux-ppp, netdev; +Cc: gfree.wind, Gao Feng

From: Gao Feng <fgao@ikuai8.com>

The PPP channel holds a spinlock before sending a frame. But the skb may
select the same PPP channel again because of a wrong route policy. As a
result, the skb re-enters the same channel path and tries to acquire the
spinlock that is already held, so a deadlock occurs.

Now add a lock owner to avoid it, like the xmit_lock_owner of
netdev_queue. Check the lock owner before trying to acquire the spinlock.
If the current CPU is already the owner, there is no need to lock again.
When the PPP channel takes the spinlock for the first time, it sets the
owner to the current CPU ID.
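
For reference, the core transmit path already uses this pattern: below is
a condensed sketch of the xmit_lock_owner check in __dev_queue_xmit()
(simplified for illustration, not verbatim kernel code):

cpu = smp_processor_id();
if (txq->xmit_lock_owner != cpu) {
	HARD_TX_LOCK(dev, txq, cpu);	/* lock and record the owner */
	/* ... hand the skb to the driver ... */
	HARD_TX_UNLOCK(dev, txq);	/* clear the owner and unlock */
} else {
	/* Same CPU already owns this tx lock: recursion detected,
	 * the packet is dropped instead of deadlocking.
	 */
	kfree_skb(skb);
}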

The following is the panic stack from 3.3.8, but the same issue should
exist upstream too.

[<ffffffff81568131>] ? _raw_spin_lock_bh+0x11/0x40
[<ffffffffa006a2b7>] ppp_unregister_channel+0x1347/0x2170 [ppp_generic]
[<ffffffff810a2827>] ? kmem_cache_free+0xa7/0xc0
[<ffffffffa006ad27>] ppp_unregister_channel+0x1db7/0x2170 [ppp_generic]
[<ffffffffa006afd5>] ppp_unregister_channel+0x2065/0x2170 [ppp_generic]
[<ffffffff8148f1dd>] dev_hard_start_xmit+0x4cd/0x620
[<ffffffff814a6254>] sch_direct_xmit+0x74/0x1d0
[<ffffffff8148f88d>] dev_queue_xmit+0x1d/0x30
[<ffffffff81496a4c>] neigh_direct_output+0xc/0x10
[<ffffffff814d9dae>] ip_finish_output+0x25e/0x2b0
[<ffffffff814da688>] ip_output+0x88/0x90
[<ffffffff814d9e9f>] ? __ip_local_out+0x9f/0xb0
[<ffffffff814d9ed4>] ip_local_out+0x24/0x30
[<ffffffffa00b9745>] 0xffffffffa00b9744
[<ffffffffa006b068>] ppp_unregister_channel+0x20f8/0x2170 [ppp_generic]
[<ffffffffa006b202>] ppp_output_wakeup+0x122/0x11d0 [ppp_generic]
[<ffffffff810a7978>] vfs_write+0xb8/0x160
[<ffffffff810a7c55>] sys_write+0x45/0x90
[<ffffffff815689e2>] system_call_fastpath+0x16/0x1b

The call flow is like this:
ppp_write -> ppp_channel_push -> start_xmit -> select inappropriate route
.... -> dev_hard_start_xmit -> ppp_start_xmit -> ppp_xmit_process ->
ppp_push. Now ppp_push tries to get the same spinlock that is already
held in ppp_channel_push.

Although the PPP deadlock is triggered by an inappropriate route policy
with L2TP, I think it is not acceptable for the PPP module to deadlock
the kernel because of a wrong route policy.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
---
 v1: Initial Patch

 drivers/net/ppp/ppp_generic.c | 49 +++++++++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index 70cfa06..ffd0233 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -162,6 +162,29 @@ struct ppp {
 			 |SC_MULTILINK|SC_MP_SHORTSEQ|SC_MP_XSHORTSEQ \
 			 |SC_COMP_TCP|SC_REJ_COMP_TCP|SC_MUST_COMP)
 
+struct channel_lock {
+	spinlock_t lock;
+	u32 owner;
+};
+
+#define PPP_CHANNEL_LOCK_INIT(cl) \
+	cl.owner = -1; \
+	spin_lock_init(&cl.lock)
+
+#define PPP_CHANNEL_LOCK_BH(cl) \
+	do { \
+		local_bh_disable(); \
+		if (cl.owner != smp_processor_id()) { \
+			spin_lock(&cl.lock); \
+			cl.owner = smp_processor_id(); \
+		} \
+	} while (0)
+
+#define PPP_CHANNEL_UNLOCK_BH(cl) \
+	cl.owner = -1; \
+	spin_unlock(&cl.lock); \
+	local_bh_enable()
+
 /*
  * Private data structure for each channel.
  * This includes the data structure used for multilink.
@@ -171,7 +194,7 @@ struct channel {
 	struct list_head list;		/* link in all/new_channels list */
 	struct ppp_channel *chan;	/* public channel data structure */
 	struct rw_semaphore chan_sem;	/* protects `chan' during chan ioctl */
-	spinlock_t	downl;		/* protects `chan', file.xq dequeue */
+	struct channel_lock downl;	/* protects `chan', file.xq dequeue */
 	struct ppp	*ppp;		/* ppp unit we're connected to */
 	struct net	*chan_net;	/* the net channel belongs to */
 	struct list_head clist;		/* link in list of channels per unit */
@@ -1597,7 +1620,7 @@ ppp_push(struct ppp *ppp)
 		list = list->next;
 		pch = list_entry(list, struct channel, clist);
 
-		spin_lock_bh(&pch->downl);
+		PPP_CHANNEL_LOCK_BH(pch->downl);
 		if (pch->chan) {
 			if (pch->chan->ops->start_xmit(pch->chan, skb))
 				ppp->xmit_pending = NULL;
@@ -1606,7 +1629,7 @@ ppp_push(struct ppp *ppp)
 			kfree_skb(skb);
 			ppp->xmit_pending = NULL;
 		}
-		spin_unlock_bh(&pch->downl);
+		PPP_CHANNEL_UNLOCK_BH(pch->downl);
 		return;
 	}
 
@@ -1736,7 +1759,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
 		}
 
 		/* check the channel's mtu and whether it is still attached. */
-		spin_lock_bh(&pch->downl);
+		PPP_CHANNEL_LOCK_BH(pch->downl);
 		if (pch->chan == NULL) {
 			/* can't use this channel, it's being deregistered */
 			if (pch->speed == 0)
@@ -1744,7 +1767,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
 			else
 				totspeed -= pch->speed;
 
-			spin_unlock_bh(&pch->downl);
+			PPP_CHANNEL_UNLOCK_BH(pch->downl);
 			pch->avail = 0;
 			totlen = len;
 			totfree--;
@@ -1795,7 +1818,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
 		 */
 		if (flen <= 0) {
 			pch->avail = 2;
-			spin_unlock_bh(&pch->downl);
+			PPP_CHANNEL_UNLOCK_BH(pch->downl);
 			continue;
 		}
 
@@ -1840,14 +1863,14 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
 		len -= flen;
 		++ppp->nxseq;
 		bits = 0;
-		spin_unlock_bh(&pch->downl);
+		PPP_CHANNEL_UNLOCK_BH(pch->downl);
 	}
 	ppp->nxchan = i;
 
 	return 1;
 
  noskb:
-	spin_unlock_bh(&pch->downl);
+	PPP_CHANNEL_UNLOCK_BH(pch->downl);
 	if (ppp->debug & 1)
 		netdev_err(ppp->dev, "PPP: no memory (fragment)\n");
 	++ppp->dev->stats.tx_errors;
@@ -1865,7 +1888,7 @@ ppp_channel_push(struct channel *pch)
 	struct sk_buff *skb;
 	struct ppp *ppp;
 
-	spin_lock_bh(&pch->downl);
+	PPP_CHANNEL_LOCK_BH(pch->downl);
 	if (pch->chan) {
 		while (!skb_queue_empty(&pch->file.xq)) {
 			skb = skb_dequeue(&pch->file.xq);
@@ -1879,7 +1902,7 @@ ppp_channel_push(struct channel *pch)
 		/* channel got deregistered */
 		skb_queue_purge(&pch->file.xq);
 	}
-	spin_unlock_bh(&pch->downl);
+	PPP_CHANNEL_UNLOCK_BH(pch->downl);
 	/* see if there is anything from the attached unit to be sent */
 	if (skb_queue_empty(&pch->file.xq)) {
 		read_lock_bh(&pch->upl);
@@ -2520,7 +2543,7 @@ int ppp_register_net_channel(struct net *net, struct ppp_channel *chan)
 	pch->lastseq = -1;
 #endif /* CONFIG_PPP_MULTILINK */
 	init_rwsem(&pch->chan_sem);
-	spin_lock_init(&pch->downl);
+	PPP_CHANNEL_LOCK_INIT(pch->downl);
 	rwlock_init(&pch->upl);
 
 	spin_lock_bh(&pn->all_channels_lock);
@@ -2599,9 +2622,9 @@ ppp_unregister_channel(struct ppp_channel *chan)
 	 * the channel's start_xmit or ioctl routine before we proceed.
 	 */
 	down_write(&pch->chan_sem);
-	spin_lock_bh(&pch->downl);
+	PPP_CHANNEL_LOCK_BH(pch->downl);
 	pch->chan = NULL;
-	spin_unlock_bh(&pch->downl);
+	PPP_CHANNEL_UNLOCK_BH(pch->downl);
 	up_write(&pch->chan_sem);
 	ppp_disconnect_channel(pch);
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] ppp: Fix one deadlock issue of PPP when send frame
  2016-08-16 11:05 [PATCH 1/1] ppp: Fix one deadlock issue of PPP when send frame fgao
@ 2016-08-16 11:36 ` Feng Gao
  2016-08-18 14:11   ` Philp Prindeville
  2016-08-19 15:16 ` [PATCH 1/1] ppp: Fix one deadlock issue of PPP when reentrant fgao
  1 sibling, 1 reply; 10+ messages in thread
From: Feng Gao @ 2016-08-16 11:36 UTC (permalink / raw)
  To: Gao Feng; +Cc: paulus, linux-ppp, Linux Kernel Network Developers

Hi Paul,

The v1 patch does not handle the recursive lock case; it could cause the
lock to be released multiple times.
So I am sending a v2 patch as an update.
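
Concretely, with the v1 macros the reentrant path looks like this
(illustrative fragment only, not code from the patch itself):

PPP_CHANNEL_LOCK_BH(pch->downl);     /* outer: takes lock, owner = cpu    */
  PPP_CHANNEL_LOCK_BH(pch->downl);   /* inner: owner == cpu, lock skipped */
  PPP_CHANNEL_UNLOCK_BH(pch->downl); /* inner: owner = -1, spin_unlock()  */
PPP_CHANNEL_UNLOCK_BH(pch->downl);   /* outer: unlocks an unheld lock     */

The inner unlock releases the spinlock while the outer path still
believes it holds it, so the outer unlock then runs on a lock that is no
longer held.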

Best Regards
Feng

On Tue, Aug 16, 2016 at 7:05 PM,  <fgao@ikuai8.com> wrote:
> [snip: full v1 patch quoted]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] ppp: Fix one deadlock issue of PPP when send frame
  2016-08-16 11:36 ` Feng Gao
@ 2016-08-18 14:11   ` Philp Prindeville
  2016-08-18 15:05     ` Feng Gao
  2016-08-19 22:47     ` Guillaume Nault
  0 siblings, 2 replies; 10+ messages in thread
From: Philp Prindeville @ 2016-08-18 14:11 UTC (permalink / raw)
  To: Feng Gao, Gao Feng; +Cc: paulus, linux-ppp, Linux Kernel Network Developers

Feng,

If the CPU can already be holding the lock, that implies re-entrancy.  
What's to stop the first flow of code which acquired the lock from 
releasing it again before the 2nd flow is done?  Is the 2nd flow running 
at a higher priority or with interrupts disabled?

It seems unlikely to me that this sort of locking problem hasn't existed 
elsewhere before and that an entirely new mechanism for handling it is 
needed...  How are similar re-entrancy issues handled elsewhere?

-Philip



On 08/16/2016 05:36 AM, Feng Gao wrote:
> Hi Paul,
>
> The v1 patch does not handle the recursive lock case. It could cause
> unlock multiple times.
> So I send the v2 patch as one update.
>
> Best Regards
> Feng
>
> On Tue, Aug 16, 2016 at 7:05 PM,  <fgao@ikuai8.com> wrote:
>> [snip: full v1 patch quoted]


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] ppp: Fix one deadlock issue of PPP when send frame
  2016-08-18 14:11   ` Philp Prindeville
@ 2016-08-18 15:05     ` Feng Gao
  2016-08-18 16:48       ` Philp Prindeville
  2016-08-19 22:47     ` Guillaume Nault
  1 sibling, 1 reply; 10+ messages in thread
From: Feng Gao @ 2016-08-18 15:05 UTC (permalink / raw)
  To: Philp Prindeville
  Cc: Gao Feng, paulus, linux-ppp, Linux Kernel Network Developers

Hi Philip,

I answer your question inline.

On Thu, Aug 18, 2016 at 10:11 PM, Philp Prindeville
<philipp@redfish-solutions.com> wrote:
> Feng,
>
> If the CPU can already be holding the lock, that implies re-entrancy.
> What's to stop the first flow of code which acquired the lock from releasing
> it again before the 2nd flow is done?  Is the 2nd flow running at a higher
> priority or with interrupts disabled?

There is no preemption involved. It is caused by a wrong route policy
with L2TP. For example, cpu0 takes the spinlock of channel1, then
channel1 is selected again after routing. As a result, cpu0 tries to
take the same spinlock again.

The call flow is like this.
ppp_write->ppp_channel_push->start_xmit->select inappropriate route
.... -> dev_hard_start_xmit->ppp_start_xmit->ppp_xmit_process->
ppp_push. Now ppp_push tries to get the same spinlock which is held
in ppp_channel_push.
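
In other words, the effective locking sequence on that CPU is the
following (illustrative fragment, not a real function in ppp_generic.c):

spin_lock_bh(&pch->downl);   /* taken in ppp_channel_push()        */
/* start_xmit() -> L2TP -> IP routes the encapsulated skb back
 * into the same ppp path -> ppp_xmit_process() -> ppp_push()
 */
spin_lock_bh(&pch->downl);   /* same CPU, same lock: spins forever */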

Regards
Feng

>
> It seems unlikely to me that this sort of locking problem hasn't existed
> elsewhere before and that an entirely new mechanism for handling it is
> needed...  How are similar re-entrancy issues handled elsewhere?
>
> -Philip
>
>
>
>
> On 08/16/2016 05:36 AM, Feng Gao wrote:
>>
>> Hi Paul,
>>
>> The v1 patch does not handle the recursive lock case. It could cause
>> unlock multiple times.
>> So I send the v2 patch as one update.
>>
>> Best Regards
>> Feng
>>
>> On Tue, Aug 16, 2016 at 7:05 PM,  <fgao@ikuai8.com> wrote:
>>> [snip: full v1 patch quoted]
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] ppp: Fix one deadlock issue of PPP when send frame
  2016-08-18 15:05     ` Feng Gao
@ 2016-08-18 16:48       ` Philp Prindeville
  2016-08-18 22:53         ` Feng Gao
  0 siblings, 1 reply; 10+ messages in thread
From: Philp Prindeville @ 2016-08-18 16:48 UTC (permalink / raw)
  To: Feng Gao; +Cc: Gao Feng, paulus, linux-ppp, Linux Kernel Network Developers



On 08/18/2016 09:05 AM, Feng Gao wrote:
> On Thu, Aug 18, 2016 at 10:11 PM, Philp Prindeville
> <philipp@redfish-solutions.com>  wrote:
>> >Feng,
>> >
>> >If the CPU can already be holding the lock, that implies re-entrancy.
>> >What's to stop the first flow of code which acquired the lock from releasing
>> >it again before the 2nd flow is done?  Is the 2nd flow running at a higher
>> >priority or with interrupts disabled?
> There is no preemption happened. It is caused by wrong route policy by l2tp.
> For example, the cpu0 get the spinlock of channel1, then the channel1
> is selected again after route. As a result, cpu0 tries to get the same
> spinlock again.
>
> The call flow is like this.
> ppp_write->ppp_channel_push->start_xmit->select inappropriate route
> .... -> dev_hard_start_xmit->ppp_start_xmit->ppp_xmit_process->
> ppp_push. Now ppp_push tries to get the same spinlock which is held
> in ppp_channel_push.
>
> Regards
> Feng
>

If we're detecting (through the fact that the lock has already been 
acquired) that the wrong route is being applied, why don't we just punt 
the packet instead?

-Philip


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] ppp: Fix one deadlock issue of PPP when send frame
  2016-08-18 16:48       ` Philp Prindeville
@ 2016-08-18 22:53         ` Feng Gao
  0 siblings, 0 replies; 10+ messages in thread
From: Feng Gao @ 2016-08-18 22:53 UTC (permalink / raw)
  To: Philp Prindeville
  Cc: Gao Feng, paulus, linux-ppp, Linux Kernel Network Developers

Hi Philp,

Yes, I agree with you.
Just dropping the packet is better than supporting a recursive lock.

I will send a new patch later.

Regards
Feng


On Fri, Aug 19, 2016 at 12:48 AM, Philp Prindeville
<philipp@redfish-solutions.com> wrote:
>
>
> On 08/18/2016 09:05 AM, Feng Gao wrote:
>>
>> On Thu, Aug 18, 2016 at 10:11 PM, Philp Prindeville
>> <philipp@redfish-solutions.com>  wrote:
>>>
>>> >Feng,
>>> >
>>> >If the CPU can already be holding the lock, that implies re-entrancy.
>>> >What's to stop the first flow of code which acquired the lock from
>>> > releasing
>>> >it again before the 2nd flow is done?  Is the 2nd flow running at a
>>> > higher
>>> >priority or with interrupts disabled?
>>
>> There is no preemption happened. It is caused by wrong route policy by
>> l2tp.
>> For example, the cpu0 get the spinlock of channel1, then the channel1
>> is selected again after route. As a result, cpu0 tries to get the same
>> spinlock again.
>>
>> The call flow is like this.
>> ppp_write->ppp_channel_push->start_xmit->select inappropriate route
>> .... -> dev_hard_start_xmit->ppp_start_xmit->ppp_xmit_process->
>> ppp_push. Now ppp_push tries to get the same spinlock which is held
>> in ppp_channel_push.
>>
>> Regards
>> Feng
>>
>
> If we're detecting (through the fact that the lock has already been
> acquired) that the wrong route is being applied, why don't we just punt the
> packet instead?
>
> -Philip
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 1/1] ppp: Fix one deadlock issue of PPP when reentrant
  2016-08-16 11:05 [PATCH 1/1] ppp: Fix one deadlock issue of PPP when send frame fgao
  2016-08-16 11:36 ` Feng Gao
@ 2016-08-19 15:16 ` fgao
  2016-08-19 21:48   ` Guillaume Nault
  1 sibling, 1 reply; 10+ messages in thread
From: fgao @ 2016-08-19 15:16 UTC (permalink / raw)
  To: paulus, g.nault, philipp, linux-ppp, netdev; +Cc: gfree.wind, Gao Feng

From: Gao Feng <fgao@ikuai8.com>

The PPP channel holds a spinlock before sending a frame. But the skb may
select the same PPP channel again because of a wrong route policy. As a
result, the skb re-enters the same channel path and tries to acquire the
spinlock that is already held, so a deadlock occurs.

Now add a lock owner to avoid it, like the xmit_lock_owner of
netdev_queue. Check the lock owner before trying to acquire the spinlock.
If the current CPU is already the owner, PPP has detected a reentrant
send and returns directly. If it is not the owner and takes the spinlock
successfully, it sets the owner to the current CPU ID.

The following is the panic stack from 3.3.8, but the same issue should
exist upstream too.

[<ffffffff81568131>] ? _raw_spin_lock_bh+0x11/0x40
[<ffffffffa006a2b7>] ppp_unregister_channel+0x1347/0x2170 [ppp_generic]
[<ffffffff810a2827>] ? kmem_cache_free+0xa7/0xc0
[<ffffffffa006ad27>] ppp_unregister_channel+0x1db7/0x2170 [ppp_generic]
[<ffffffffa006afd5>] ppp_unregister_channel+0x2065/0x2170 [ppp_generic]
[<ffffffff8148f1dd>] dev_hard_start_xmit+0x4cd/0x620
[<ffffffff814a6254>] sch_direct_xmit+0x74/0x1d0
[<ffffffff8148f88d>] dev_queue_xmit+0x1d/0x30
[<ffffffff81496a4c>] neigh_direct_output+0xc/0x10
[<ffffffff814d9dae>] ip_finish_output+0x25e/0x2b0
[<ffffffff814da688>] ip_output+0x88/0x90
[<ffffffff814d9e9f>] ? __ip_local_out+0x9f/0xb0
[<ffffffff814d9ed4>] ip_local_out+0x24/0x30
[<ffffffffa00b9745>] 0xffffffffa00b9744
[<ffffffffa006b068>] ppp_unregister_channel+0x20f8/0x2170 [ppp_generic]
[<ffffffffa006b202>] ppp_output_wakeup+0x122/0x11d0 [ppp_generic]
[<ffffffff810a7978>] vfs_write+0xb8/0x160
[<ffffffff810a7c55>] sys_write+0x45/0x90
[<ffffffff815689e2>] system_call_fastpath+0x16/0x1b

The call flow is like this:
ppp_write -> ppp_channel_push -> start_xmit -> select inappropriate route
.... -> dev_hard_start_xmit -> ppp_start_xmit -> ppp_xmit_process ->
ppp_push. Now ppp_push tries to get the same spinlock that is already
held in ppp_channel_push.

Although the PPP deadlock is triggered by an inappropriate route policy
with L2TP, I think it is not acceptable for the PPP module to deadlock
the kernel because of a wrong route policy.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
---
 v3: Change the fix solution; give up the send chance instead of taking a recursive lock
 v2: Fix the recursive unlock issue
 v1: Initial patch
 
 drivers/net/ppp/ppp_generic.c | 95 +++++++++++++++++++++++++++++++++----------
 1 file changed, 73 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index 70cfa06..b653f1f 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -162,6 +162,46 @@ struct ppp {
 			 |SC_MULTILINK|SC_MP_SHORTSEQ|SC_MP_XSHORTSEQ \
 			 |SC_COMP_TCP|SC_REJ_COMP_TCP|SC_MUST_COMP)
 
+struct channel_lock {
+	spinlock_t lock;
+	int owner;
+};
+
+static inline void ppp_channel_lock_init(struct channel_lock *cl)
+{
+	cl->owner = -1;
+	spin_lock_init(&cl->lock);
+}
+
+static inline bool ppp_channel_lock_bh(struct channel_lock *cl)
+{
+	int cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	if (cpu == cl->owner) {
+		/* The CPU already holds this channel lock and is sending,
+		 * but an inappropriate route selected the same channel
+		 * again, causing a reentrant send, which is forbidden.
+		 */
+		if (net_ratelimit())
+			pr_err("PPP detects one recursive channel send\n");
+		local_bh_enable();
+		return false;
+	}
+	spin_lock(&cl->lock);
+	cl->owner = cpu;
+
+	return true;
+}
+
+static inline void ppp_channel_unlock_bh(struct channel_lock *cl)
+{
+	cl->owner = -1;
+	spin_unlock(&cl->lock);
+	local_bh_enable();
+}
+
 /*
  * Private data structure for each channel.
  * This includes the data structure used for multilink.
@@ -171,7 +211,7 @@ struct channel {
 	struct list_head list;		/* link in all/new_channels list */
 	struct ppp_channel *chan;	/* public channel data structure */
 	struct rw_semaphore chan_sem;	/* protects `chan' during chan ioctl */
-	spinlock_t	downl;		/* protects `chan', file.xq dequeue */
+	struct channel_lock downl;	/* protects `chan', file.xq dequeue */
 	struct ppp	*ppp;		/* ppp unit we're connected to */
 	struct net	*chan_net;	/* the net channel belongs to */
 	struct list_head clist;		/* link in list of channels per unit */
@@ -1587,9 +1627,7 @@ ppp_push(struct ppp *ppp)
 	list = &ppp->channels;
 	if (list_empty(list)) {
 		/* nowhere to send the packet, just drop it */
-		ppp->xmit_pending = NULL;
-		kfree_skb(skb);
-		return;
+		goto drop;
 	}
 
 	if ((ppp->flags & SC_MULTILINK) == 0) {
@@ -1597,16 +1635,19 @@ ppp_push(struct ppp *ppp)
 		list = list->next;
 		pch = list_entry(list, struct channel, clist);
 
-		spin_lock_bh(&pch->downl);
+		if (unlikely(!ppp_channel_lock_bh(&pch->downl))) {
+			/* Failed to take the channel lock */
+			goto drop;
+		}
 		if (pch->chan) {
 			if (pch->chan->ops->start_xmit(pch->chan, skb))
 				ppp->xmit_pending = NULL;
 		} else {
 			/* channel got unregistered */
-			kfree_skb(skb);
-			ppp->xmit_pending = NULL;
+			ppp_channel_unlock_bh(&pch->downl);
+			goto drop;
 		}
-		spin_unlock_bh(&pch->downl);
+		ppp_channel_unlock_bh(&pch->downl);
 		return;
 	}
 
@@ -1617,6 +1658,7 @@ ppp_push(struct ppp *ppp)
 		return;
 #endif /* CONFIG_PPP_MULTILINK */
 
+drop:
 	ppp->xmit_pending = NULL;
 	kfree_skb(skb);
 }
@@ -1645,6 +1687,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
 	struct channel *pch;
 	struct sk_buff *frag;
 	struct ppp_channel *chan;
+	bool locked;
 
 	totspeed = 0; /*total bitrate of the bundle*/
 	nfree = 0; /* # channels which have no packet already queued */
@@ -1735,17 +1778,21 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
 			pch->avail = 1;
 		}
 
-		/* check the channel's mtu and whether it is still attached. */
-		spin_lock_bh(&pch->downl);
-		if (pch->chan == NULL) {
-			/* can't use this channel, it's being deregistered */
+		locked = ppp_channel_lock_bh(&pch->downl);
+		if (!locked || !pch->chan) {
+			/* can't use this channel: it's being deregistered
+			 * or we failed to take its lock
+			 */
 			if (pch->speed == 0)
 				nzero--;
 			else
 				totspeed -= pch->speed;
 
-			spin_unlock_bh(&pch->downl);
-			pch->avail = 0;
+			if (locked) {
+				/* channel is deregistered */
+				ppp_channel_unlock_bh(&pch->downl);
+				pch->avail = 0;
+			}
 			totlen = len;
 			totfree--;
 			nfree--;
@@ -1795,7 +1842,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
 		 */
 		if (flen <= 0) {
 			pch->avail = 2;
-			spin_unlock_bh(&pch->downl);
+			ppp_channel_unlock_bh(&pch->downl);
 			continue;
 		}
 
@@ -1840,14 +1887,14 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
 		len -= flen;
 		++ppp->nxseq;
 		bits = 0;
-		spin_unlock_bh(&pch->downl);
+		ppp_channel_unlock_bh(&pch->downl);
 	}
 	ppp->nxchan = i;
 
 	return 1;
 
  noskb:
-	spin_unlock_bh(&pch->downl);
+	ppp_channel_unlock_bh(&pch->downl);
 	if (ppp->debug & 1)
 		netdev_err(ppp->dev, "PPP: no memory (fragment)\n");
 	++ppp->dev->stats.tx_errors;
@@ -1865,7 +1912,8 @@ ppp_channel_push(struct channel *pch)
 	struct sk_buff *skb;
 	struct ppp *ppp;
 
-	spin_lock_bh(&pch->downl);
+	if (unlikely(!ppp_channel_lock_bh(&pch->downl)))
+		return;
 	if (pch->chan) {
 		while (!skb_queue_empty(&pch->file.xq)) {
 			skb = skb_dequeue(&pch->file.xq);
@@ -1879,7 +1927,7 @@ ppp_channel_push(struct channel *pch)
 		/* channel got deregistered */
 		skb_queue_purge(&pch->file.xq);
 	}
-	spin_unlock_bh(&pch->downl);
+	ppp_channel_unlock_bh(&pch->downl);
 	/* see if there is anything from the attached unit to be sent */
 	if (skb_queue_empty(&pch->file.xq)) {
 		read_lock_bh(&pch->upl);
@@ -2520,7 +2568,7 @@ int ppp_register_net_channel(struct net *net, struct ppp_channel *chan)
 	pch->lastseq = -1;
 #endif /* CONFIG_PPP_MULTILINK */
 	init_rwsem(&pch->chan_sem);
-	spin_lock_init(&pch->downl);
+	ppp_channel_lock_init(&pch->downl);
 	rwlock_init(&pch->upl);
 
 	spin_lock_bh(&pn->all_channels_lock);
@@ -2599,9 +2647,12 @@ ppp_unregister_channel(struct ppp_channel *chan)
 	 * the channel's start_xmit or ioctl routine before we proceed.
 	 */
 	down_write(&pch->chan_sem);
-	spin_lock_bh(&pch->downl);
+	if (unlikely(!ppp_channel_lock_bh(&pch->downl))) {
+		up_write(&pch->chan_sem);
+		return;
+	}
 	pch->chan = NULL;
-	spin_unlock_bh(&pch->downl);
+	ppp_channel_unlock_bh(&pch->downl);
 	up_write(&pch->chan_sem);
 	ppp_disconnect_channel(pch);
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] ppp: Fix one deadlock issue of PPP when reentrant
  2016-08-19 15:16 ` [PATCH 1/1] ppp: Fix one deadlock issue of PPP when reentrant fgao
@ 2016-08-19 21:48   ` Guillaume Nault
  2016-08-20  6:40     ` Feng Gao
  0 siblings, 1 reply; 10+ messages in thread
From: Guillaume Nault @ 2016-08-19 21:48 UTC (permalink / raw)
  To: fgao; +Cc: paulus, philipp, linux-ppp, netdev, gfree.wind

On Fri, Aug 19, 2016 at 11:16:41PM +0800, fgao@ikuai8.com wrote:
> From: Gao Feng <fgao@ikuai8.com>
> 
> [snip: commit message and panic stack]
> 
> Signed-off-by: Gao Feng <fgao@ikuai8.com>
> ---
>  v3: Change the fix solution. Giveup the send chance instead of recursive lock
>  v2: Fix recursive unlock issue
>  v1: Initial patch
>  
>  drivers/net/ppp/ppp_generic.c | 95 +++++++++++++++++++++++++++++++++----------
>  1 file changed, 73 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
> index 70cfa06..b653f1f 100644
> --- a/drivers/net/ppp/ppp_generic.c
> +++ b/drivers/net/ppp/ppp_generic.c
> @@ -162,6 +162,46 @@ struct ppp {
>  			 |SC_MULTILINK|SC_MP_SHORTSEQ|SC_MP_XSHORTSEQ \
>  			 |SC_COMP_TCP|SC_REJ_COMP_TCP|SC_MUST_COMP)
>  
> +struct channel_lock {
> +	spinlock_t lock;
> +	int owner;
> +};
> +
> +static inline void ppp_channel_lock_init(struct channel_lock *cl)
No need for inline in .c files.

> +{
> +	cl->owner = -1;
> +	spin_lock_init(&cl->lock);
> +}
> +
> +static inline bool ppp_channel_lock_bh(struct channel_lock *cl)
> +{
> +	int cpu;
> +
> +	local_bh_disable();
> +	cpu = smp_processor_id();
> +	if (cpu == cl->owner) {
> +		/* The CPU already holds this channel lock and is sending,
> +		 * but an inappropriate route selected the same channel
> +		 * again, causing a reentrant send, which is forbidden.
> +		 */
> +		if (net_ratelimit())
> +			pr_err("PPP detects one recursive channel send\n");
> +		local_bh_enable();
What about calling local_bh_enable() before logging the error?

> +		return false;
> +	}
> +	spin_lock(&cl->lock);
> +	cl->owner = cpu;
> +
> +	return true;
> +}
> +
> +static inline void ppp_channel_unlock_bh(struct channel_lock *cl)
> +{
> +	cl->owner = -1;
> +	spin_unlock(&cl->lock);
> +	local_bh_enable();
> +}
> +
>  /*
>   * Private data structure for each channel.
>   * This includes the data structure used for multilink.
> @@ -171,7 +211,7 @@ struct channel {
>  	struct list_head list;		/* link in all/new_channels list */
>  	struct ppp_channel *chan;	/* public channel data structure */
>  	struct rw_semaphore chan_sem;	/* protects `chan' during chan ioctl */
> -	spinlock_t	downl;		/* protects `chan', file.xq dequeue */
> +	struct channel_lock downl;	/* protects `chan', file.xq dequeue */
>  	struct ppp	*ppp;		/* ppp unit we're connected to */
>  	struct net	*chan_net;	/* the net channel belongs to */
>  	struct list_head clist;		/* link in list of channels per unit */

> @@ -1645,6 +1687,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
>  	struct channel *pch;
>  	struct sk_buff *frag;
>  	struct ppp_channel *chan;
> +	bool locked;
>  
>  	totspeed = 0; /*total bitrate of the bundle*/
>  	nfree = 0; /* # channels which have no packet already queued */
> @@ -1735,17 +1778,21 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
>  			pch->avail = 1;
>  		}
>  
> -		/* check the channel's mtu and whether it is still attached. */
> -		spin_lock_bh(&pch->downl);
> -		if (pch->chan == NULL) {
> -			/* can't use this channel, it's being deregistered */
> +		locked = ppp_channel_lock_bh(&pch->downl);
> +		if (!locked || !pch->chan) {
> +			/* can't use this channel: it's being deregistered
> +			 * or we failed to take its lock
> +			 */
>  			if (pch->speed == 0)
>  				nzero--;
>  			else
>  				totspeed -= pch->speed;
>  
> -			spin_unlock_bh(&pch->downl);
> -			pch->avail = 0;
> +			if (locked) {
> +				/* channel is deregistered */
> +				ppp_channel_unlock_bh(&pch->downl);
> +				pch->avail = 0;
> 
Why do you reset pch->avail only if lock was held, but perform the
other operations in both cases?

> @@ -2599,9 +2647,12 @@ ppp_unregister_channel(struct ppp_channel *chan)
>  	 * the channel's start_xmit or ioctl routine before we proceed.
>  	 */
>  	down_write(&pch->chan_sem);
> -	spin_lock_bh(&pch->downl);
> +	if (unlikely(!ppp_channel_lock_bh(&pch->downl))) {
> +		up_write(&pch->chan_sem);
> +		return;
> +	}
>  	pch->chan = NULL;
> -	spin_unlock_bh(&pch->downl);
> +	ppp_channel_unlock_bh(&pch->downl);
>  	up_write(&pch->chan_sem);
>  	ppp_disconnect_channel(pch);
>  
This one isn't in the xmit path. What about defining a
__ppp_channel_unlock_bh() variant that wouldn't check
cl->owner (and wouldn't fail)? We have no recursion here.
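
One possible reading of this suggestion (names assumed; the owner check
lives in the lock helper, so the sketch below shows a lock-side variant
that cannot fail, not code from any posted patch):

/* Sketch only, name assumed: unconditional variant for paths that
 * cannot recurse, so there is no owner check and no failure path.
 */
static void __ppp_channel_lock_bh(struct channel_lock *cl)
{
	local_bh_disable();
	spin_lock(&cl->lock);
	cl->owner = smp_processor_id();
}

ppp_unregister_channel() could then take the lock with such a helper and
keep using ppp_channel_unlock_bh() for the release.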

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] ppp: Fix one deadlock issue of PPP when send frame
  2016-08-18 14:11   ` Philp Prindeville
  2016-08-18 15:05     ` Feng Gao
@ 2016-08-19 22:47     ` Guillaume Nault
  1 sibling, 0 replies; 10+ messages in thread
From: Guillaume Nault @ 2016-08-19 22:47 UTC (permalink / raw)
  To: Philp Prindeville
  Cc: Feng Gao, Gao Feng, paulus, linux-ppp, Linux Kernel Network Developers

On Thu, Aug 18, 2016 at 08:11:15AM -0600, Philp Prindeville wrote:
> 
> It seems unlikely to me that this sort of locking problem hasn't existed
> elsewhere before and that an entirely new mechanism for handling it is
> needed...  How are similar re-entrancy issues handled elsewhere?
>
Virtual devices like vxlan and geneve drop packets if the output device
(after encapsulation) is the same as the encapsulating device.
It's possible to bypass this check by stacking devices. In this case,
recursion occurs until __dev_queue_xmit() triggers the
XMIT_RECURSION_LIMIT. If the qdisc isn't noqueue, so that recursion
isn't stopped by __dev_queue_xmit(), then it's possible to fill the
skb's headroom and the packet gets dropped after pskb_expand_head() fails.
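
For reference, the check those drivers perform after the route lookup
is roughly of the following shape (a simplified illustration only, not
the exact vxlan/geneve code; dev is the tunnel device, rt the route
looked up for the encapsulated packet, and tx_error stands for the
driver's usual drop path):

	if (rt->dst.dev == dev) {
		/* The encapsulated packet would be sent back through the
		 * tunnel device itself, so drop it instead of recursing.
		 */
		netdev_dbg(dev, "circular route detected\n");
		dev->stats.collisions++;
		ip_rt_put(rt);
		goto tx_error;
	}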

In any case only the faulty path is affected and the system remains
stable.

It'd be nice to have PPP working this way, but this is made really
difficult by the number of locks used in the xmit path.
Even checking if the output interface is also the encapsulating one
isn't trivial: the L2TP layer has no access to the upper interface,
which may not even be a PPP device in the case of L2TPv3.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] ppp: Fix one deadlock issue of PPP when reentrant
  2016-08-19 21:48   ` Guillaume Nault
@ 2016-08-20  6:40     ` Feng Gao
  0 siblings, 0 replies; 10+ messages in thread
From: Feng Gao @ 2016-08-20  6:40 UTC (permalink / raw)
  To: Guillaume Nault
  Cc: Gao Feng, paulus, Philp Prindeville, linux-ppp,
	Linux Kernel Network Developers

On Sat, Aug 20, 2016 at 5:48 AM, Guillaume Nault <g.nault@alphalink.fr> wrote:
> On Fri, Aug 19, 2016 at 11:16:41PM +0800, fgao@ikuai8.com wrote:
>> From: Gao Feng <fgao@ikuai8.com>
>>
>> PPP channel holds one spinlock before send frame. But the skb may
>> select the same PPP channel with wrong route policy. As a result,
>> the skb reaches the same channel path. It tries to get the same
>> spinlock which is held before. Bang, the deadlock comes out.
>>
>> Now add one lock owner to avoid it like xmit_lock_owner of
>> netdev_queue. Check the lock owner before try to get the spinlock.
>> If the current cpu is already the owner, ppp detects the reentrant
>> send and returns directly. If it is not the owner and takes the
>> spinlock successfully, it sets the owner to the current CPU ID.
>>
>> The following is the panic stack of 3.3.8. But the same issue
>> should be in the upstream too.
>>
>> [<ffffffff81568131>] ? _raw_spin_lock_bh+0x11/0x40
>> [<ffffffffa006a2b7>] ppp_unregister_channel+0x1347/0x2170 [ppp_generic]
>> [<ffffffff810a2827>] ? kmem_cache_free+0xa7/0xc0
>> [<ffffffffa006ad27>] ppp_unregister_channel+0x1db7/0x2170 [ppp_generic]
>> [<ffffffffa006afd5>] ppp_unregister_channel+0x2065/0x2170 [ppp_generic]
>> [<ffffffff8148f1dd>] dev_hard_start_xmit+0x4cd/0x620
>> [<ffffffff814a6254>] sch_direct_xmit+0x74/0x1d0
>> [<ffffffff8148f88d>] dev_queue_xmit+0x1d/0x30
>> [<ffffffff81496a4c>] neigh_direct_output+0xc/0x10
>> [<ffffffff814d9dae>] ip_finish_output+0x25e/0x2b0
>> [<ffffffff814da688>] ip_output+0x88/0x90
>> [<ffffffff814d9e9f>] ? __ip_local_out+0x9f/0xb0
>> [<ffffffff814d9ed4>] ip_local_out+0x24/0x30
>> [<ffffffffa00b9745>] 0xffffffffa00b9744
>> [<ffffffffa006b068>] ppp_unregister_channel+0x20f8/0x2170 [ppp_generic]
>> [<ffffffffa006b202>] ppp_output_wakeup+0x122/0x11d0 [ppp_generic]
>> [<ffffffff810a7978>] vfs_write+0xb8/0x160
>> [<ffffffff810a7c55>] sys_write+0x45/0x90
>> [<ffffffff815689e2>] system_call_fastpath+0x16/0x1b
>>
>> The call flow is like this.
>> ppp_write->ppp_channel_push->start_xmit->select inappropriate route
>> .... -> dev_hard_start_xmit->ppp_start_xmit->ppp_xmit_process->
>> ppp_push. Now ppp_push tries to get the same spinlock which is held
>> in ppp_channel_push.
>>
>> Although the PPP deadlock is caused by inappropriate route policy
>> with L2TP, I think it is not accepted the PPP module would cause kernel
>> deadlock with wrong route policy.
>>
>> Signed-off-by: Gao Feng <fgao@ikuai8.com>
>> ---
>>  v3: Change the fix solution. Give up the send chance instead of recursive lock
>>  v2: Fix recursive unlock issue
>>  v1: Initial patch
>>
>>  drivers/net/ppp/ppp_generic.c | 95 +++++++++++++++++++++++++++++++++----------
>>  1 file changed, 73 insertions(+), 22 deletions(-)
>>
>> diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
>> index 70cfa06..b653f1f 100644
>> --- a/drivers/net/ppp/ppp_generic.c
>> +++ b/drivers/net/ppp/ppp_generic.c
>> @@ -162,6 +162,46 @@ struct ppp {
>>                        |SC_MULTILINK|SC_MP_SHORTSEQ|SC_MP_XSHORTSEQ \
>>                        |SC_COMP_TCP|SC_REJ_COMP_TCP|SC_MUST_COMP)
>>
>> +struct channel_lock {
>> +     spinlock_t lock;
>> +     int owner;
>> +};
>> +
>> +static inline void ppp_channel_lock_init(struct channel_lock *cl)
> No need for inline in .c files.

OK. I will make them non-inline.

>
>> +{
>> +     cl->owner = -1;
>> +     spin_lock_init(&cl->lock);
>> +}
>> +
>> +static inline bool ppp_channel_lock_bh(struct channel_lock *cl)
>> +{
>> +     int cpu;
>> +
>> +     local_bh_disable();
>> +     cpu = smp_processor_id();
>> +     if (cpu == cl->owner) {
>> +             /* The CPU already holds this channel lock while sending. But the
>> +              * channel is selected again by an inappropriate route, which would
>> +              * reenter the same channel. That is forbidden by the PPP module.
>> +              */
>> +             if (net_ratelimit())
>> +                     pr_err("PPP detects one recursive channel send\n");
>> +             local_bh_enable();
> What about calling local_bh_enable() before logging the error?

Ok.
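
So the error branch would simply become (sketch):

		local_bh_enable();
		if (net_ratelimit())
			pr_err("PPP detects one recursive channel send\n");
		return false;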

>
>> +             return false;
>> +     }
>> +     spin_lock(&cl->lock);
>> +     cl->owner = cpu;
>> +
>> +     return true;
>> +}
>> +
>> +static inline void ppp_channel_unlock_bh(struct channel_lock *cl)
>> +{
>> +     cl->owner = -1;
>> +     spin_unlock(&cl->lock);
>> +     local_bh_enable();
>> +}
>> +
>>  /*
>>   * Private data structure for each channel.
>>   * This includes the data structure used for multilink.
>> @@ -171,7 +211,7 @@ struct channel {
>>       struct list_head list;          /* link in all/new_channels list */
>>       struct ppp_channel *chan;       /* public channel data structure */
>>       struct rw_semaphore chan_sem;   /* protects `chan' during chan ioctl */
>> -     spinlock_t      downl;          /* protects `chan', file.xq dequeue */
>> +     struct channel_lock downl;      /* protects `chan', file.xq dequeue */
>>       struct ppp      *ppp;           /* ppp unit we're connected to */
>>       struct net      *chan_net;      /* the net channel belongs to */
>>       struct list_head clist;         /* link in list of channels per unit */
>
>> @@ -1645,6 +1687,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
>>       struct channel *pch;
>>       struct sk_buff *frag;
>>       struct ppp_channel *chan;
>> +     bool locked;
>>
>>       totspeed = 0; /*total bitrate of the bundle*/
>>       nfree = 0; /* # channels which have no packet already queued */
>> @@ -1735,17 +1778,21 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
>>                       pch->avail = 1;
>>               }
>>
>> -             /* check the channel's mtu and whether it is still attached. */
>> -             spin_lock_bh(&pch->downl);
>> -             if (pch->chan == NULL) {
>> -                     /* can't use this channel, it's being deregistered */
>> +             locked = ppp_channel_lock_bh(&pch->downl);
>> +             if (!locked || !pch->chan) {
>> +                     /* can't use this channel, it's being deregistered
>> +                      * or we failed to lock it
>> +                      */
>>                       if (pch->speed == 0)
>>                               nzero--;
>>                       else
>>                               totspeed -= pch->speed;
>>
>> -                     spin_unlock_bh(&pch->downl);
>> -                     pch->avail = 0;
>> +                     if (locked) {
>> +                             /* channel is deregistered */
>> +                             ppp_channel_unlock_bh(&pch->downl);
>> +                             pch->avail = 0;
>>
> Why do you reset pch->avail only if lock was held, but perform the
> other operations in both cases?

Because list_for_each_entry(pch, &ppp->channels, clist) has already
counted the channel that failed to get the lock. So I think we need to
perform the same operations as in the "!pch->chan" case, except for
resetting pch->avail, because the channel may still be available.

>
>> @@ -2599,9 +2647,12 @@ ppp_unregister_channel(struct ppp_channel *chan)
>>        * the channel's start_xmit or ioctl routine before we proceed.
>>        */
>>       down_write(&pch->chan_sem);
>> -     spin_lock_bh(&pch->downl);
>> +     if (unlikely(!ppp_channel_lock_bh(&pch->downl))) {
>> +             up_write(&pch->chan_sem);
>> +             return;
>> +     }
>>       pch->chan = NULL;
>> -     spin_unlock_bh(&pch->downl);
>> +     ppp_channel_unlock_bh(&pch->downl);
>>       up_write(&pch->chan_sem);
>>       ppp_disconnect_channel(pch);
>>
> This one isn't in the xmit path. What about defining a
> __ppp_channel_unlock_bh() variant that wouldn't check
> cl->owner (and wouldn't fail)? We have no recursion here.

Thanks. Good idea.
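
Something along these lines, maybe (rough sketch only; the lock-side
helper is named here just for symmetry and is not part of the posted
patch):

	/* For paths outside the xmit path, e.g. ppp_unregister_channel():
	 * no recursion is possible there, so take and release the lock
	 * unconditionally, without touching cl->owner.
	 */
	static void __ppp_channel_lock_bh(struct channel_lock *cl)
	{
		local_bh_disable();
		spin_lock(&cl->lock);
	}

	static void __ppp_channel_unlock_bh(struct channel_lock *cl)
	{
		spin_unlock(&cl->lock);
		local_bh_enable();
	}

Then ppp_unregister_channel() could keep its original flow:

	down_write(&pch->chan_sem);
	__ppp_channel_lock_bh(&pch->downl);
	pch->chan = NULL;
	__ppp_channel_unlock_bh(&pch->downl);
	up_write(&pch->chan_sem);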

Regards
Feng

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2016-08-20  6:40 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-16 11:05 [PATCH 1/1] ppp: Fix one deadlock issue of PPP when send frame fgao
2016-08-16 11:36 ` Feng Gao
2016-08-18 14:11   ` Philp Prindeville
2016-08-18 15:05     ` Feng Gao
2016-08-18 16:48       ` Philp Prindeville
2016-08-18 22:53         ` Feng Gao
2016-08-19 22:47     ` Guillaume Nault
2016-08-19 15:16 ` [PATCH 1/1] ppp: Fix one deadlock issue of PPP when reentrant fgao
2016-08-19 21:48   ` Guillaume Nault
2016-08-20  6:40     ` Feng Gao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).