All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
@ 2024-03-26 23:03 Jamal Hadi Salim
  2024-03-27 13:23 ` Eric Dumazet
  0 siblings, 1 reply; 21+ messages in thread
From: Jamal Hadi Salim @ 2024-03-26 23:03 UTC (permalink / raw)
  To: davem, kuba, edumazet, pabeni
  Cc: jiri, xiyou.wangcong, netdev, renmingshuai, Jamal Hadi Salim,
	Victor Nogueira

When the mirred action is used on a classful egress qdisc and a packet is
mirrored or redirected to self we hit a qdisc lock deadlock.
See trace below.

[..... other info removed for brevity....]
[   82.890906]
[   82.890906] ============================================
[   82.890906] WARNING: possible recursive locking detected
[   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
[   82.890906] --------------------------------------------
[   82.890906] ping/418 is trying to acquire lock:
[   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
__dev_queue_xmit+0x1778/0x3550
[   82.890906]
[   82.890906] but task is already holding lock:
[   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
__dev_queue_xmit+0x1778/0x3550
[   82.890906]
[   82.890906] other info that might help us debug this:
[   82.890906]  Possible unsafe locking scenario:
[   82.890906]
[   82.890906]        CPU0
[   82.890906]        ----
[   82.890906]   lock(&sch->q.lock);
[   82.890906]   lock(&sch->q.lock);
[   82.890906]
[   82.890906]  *** DEADLOCK ***
[   82.890906]
[..... other info removed for brevity....]

Example setup (eth0->eth0) to recreate
tc qdisc add dev eth0 root handle 1: htb default 30
tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
     action mirred egress redirect dev eth0

Another example(eth0->eth1->eth0) to recreate
tc qdisc add dev eth0 root handle 1: htb default 30
tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
     action mirred egress redirect dev eth1

tc qdisc add dev eth1 root handle 1: htb default 30
tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
     action mirred egress redirect dev eth0

We fix this by adding a per-cpu, per-qdisc recursion counter which is
incremented the first time a root qdisc is entered and on a second attempt
enter the same root qdisc from the top, the packet is dropped to break the
loop.

Reported-by: renmingshuai@huawei.com
Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
Co-developed-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
 include/net/sch_generic.h |  2 ++
 net/core/dev.c            |  9 +++++++++
 net/sched/sch_api.c       | 12 ++++++++++++
 net/sched/sch_generic.c   |  2 ++
 4 files changed, 25 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index cefe0c4bdae3..f9f99df037ed 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -125,6 +125,8 @@ struct Qdisc {
 	spinlock_t		busylock ____cacheline_aligned_in_smp;
 	spinlock_t		seqlock;
 
+	u16 __percpu            *xmit_recursion;
+
 	struct rcu_head		rcu;
 	netdevice_tracker	dev_tracker;
 	/* private data */
diff --git a/net/core/dev.c b/net/core/dev.c
index 9a67003e49db..2b712388c06f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	if (unlikely(contended))
 		spin_lock(&q->busylock);
 
+	if (__this_cpu_read(*q->xmit_recursion) > 0) {
+		__qdisc_drop(skb, &to_free);
+		rc = NET_XMIT_DROP;
+		goto free_skb_list;
+	}
+
+	__this_cpu_inc(*q->xmit_recursion);
 	spin_lock(root_lock);
 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
 		__qdisc_drop(skb, &to_free);
@@ -3825,7 +3832,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		}
 	}
 	spin_unlock(root_lock);
+	__this_cpu_dec(*q->xmit_recursion);
 	if (unlikely(to_free))
+free_skb_list:
 		kfree_skb_list_reason(to_free,
 				      tcf_get_drop_reason(to_free));
 	if (unlikely(contended))
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 65e05b0c98e4..6c3bc1aff89a 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1260,6 +1260,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 	struct Qdisc *sch;
 	struct Qdisc_ops *ops;
 	struct qdisc_size_table *stab;
+	int cpu;
 
 	ops = qdisc_lookup_ops(kind);
 #ifdef CONFIG_MODULES
@@ -1376,11 +1377,22 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 		}
 	}
 
+	sch->xmit_recursion = alloc_percpu(u16);
+	if (!sch->xmit_recursion) {
+		err = -ENOMEM;
+		goto err_out5;
+	}
+	for_each_possible_cpu(cpu)
+		(*per_cpu_ptr(sch->xmit_recursion, cpu)) = 0;
+
 	qdisc_hash_add(sch, false);
 	trace_qdisc_create(ops, dev, parent);
 
 	return sch;
 
+err_out5:
+	if (tca[TCA_RATE])
+		gen_kill_estimator(&sch->rate_est);
 err_out4:
 	/* Even if ops->init() failed, we call ops->destroy()
 	 * like qdisc_create_dflt().
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ff5336493777..afbbd2e885a4 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1070,6 +1070,8 @@ static void __qdisc_destroy(struct Qdisc *qdisc)
 	module_put(ops->owner);
 	netdev_put(dev, &qdisc->dev_tracker);
 
+	free_percpu(qdisc->xmit_recursion);
+
 	trace_qdisc_destroy(qdisc);
 
 	call_rcu(&qdisc->rcu, qdisc_free_cb);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-03-26 23:03 [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion Jamal Hadi Salim
@ 2024-03-27 13:23 ` Eric Dumazet
  2024-03-27 22:57   ` Jamal Hadi Salim
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2024-03-27 13:23 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

On Wed, Mar 27, 2024 at 12:03 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> When the mirred action is used on a classful egress qdisc and a packet is
> mirrored or redirected to self we hit a qdisc lock deadlock.
> See trace below.
>
> [..... other info removed for brevity....]
> [   82.890906]
> [   82.890906] ============================================
> [   82.890906] WARNING: possible recursive locking detected
> [   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
> [   82.890906] --------------------------------------------
> [   82.890906] ping/418 is trying to acquire lock:
> [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> __dev_queue_xmit+0x1778/0x3550
> [   82.890906]
> [   82.890906] but task is already holding lock:
> [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> __dev_queue_xmit+0x1778/0x3550
> [   82.890906]
> [   82.890906] other info that might help us debug this:
> [   82.890906]  Possible unsafe locking scenario:
> [   82.890906]
> [   82.890906]        CPU0
> [   82.890906]        ----
> [   82.890906]   lock(&sch->q.lock);
> [   82.890906]   lock(&sch->q.lock);
> [   82.890906]
> [   82.890906]  *** DEADLOCK ***
> [   82.890906]
> [..... other info removed for brevity....]
>
> Example setup (eth0->eth0) to recreate
> tc qdisc add dev eth0 root handle 1: htb default 30
> tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
>      action mirred egress redirect dev eth0
>
> Another example(eth0->eth1->eth0) to recreate
> tc qdisc add dev eth0 root handle 1: htb default 30
> tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
>      action mirred egress redirect dev eth1
>
> tc qdisc add dev eth1 root handle 1: htb default 30
> tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
>      action mirred egress redirect dev eth0
>
> We fix this by adding a per-cpu, per-qdisc recursion counter which is
> incremented the first time a root qdisc is entered and on a second attempt
> enter the same root qdisc from the top, the packet is dropped to break the
> loop.
>
> Reported-by: renmingshuai@huawei.com
> Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
> Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
> Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
> Co-developed-by: Victor Nogueira <victor@mojatatu.com>
> Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> ---
>  include/net/sch_generic.h |  2 ++
>  net/core/dev.c            |  9 +++++++++
>  net/sched/sch_api.c       | 12 ++++++++++++
>  net/sched/sch_generic.c   |  2 ++
>  4 files changed, 25 insertions(+)
>
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index cefe0c4bdae3..f9f99df037ed 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -125,6 +125,8 @@ struct Qdisc {
>         spinlock_t              busylock ____cacheline_aligned_in_smp;
>         spinlock_t              seqlock;
>
> +       u16 __percpu            *xmit_recursion;
> +
>         struct rcu_head         rcu;
>         netdevice_tracker       dev_tracker;
>         /* private data */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 9a67003e49db..2b712388c06f 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>         if (unlikely(contended))
>                 spin_lock(&q->busylock);

This could hang here (busylock)


>
> +       if (__this_cpu_read(*q->xmit_recursion) > 0) {
> +               __qdisc_drop(skb, &to_free);
> +               rc = NET_XMIT_DROP;
> +               goto free_skb_list;
> +       }


I do not think we want to add yet another cache line miss and
complexity in tx fast path.

I think that mirred should  use a separate queue to kick a transmit
from the top level.

(Like netif_rx() does)

Using a softnet.xmit_qdisc_recursion (not a qdisc-per-cpu thing),
would allow mirred to bypass this additional queue
in most cases.

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb37817d6382c29117afd8ce54db6dba94f8c930..62ba5ef554860496ee928f7ed6b7c3ea46b8ee1d
100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3217,7 +3217,8 @@ struct softnet_data {
 #endif
        /* written and read only by owning cpu: */
        struct {
-               u16 recursion;
+               u8 recursion;
+               u8 qdisc_recursion;
                u8  more;
 #ifdef CONFIG_NET_EGRESS
                u8  skip_txqueue;
diff --git a/net/core/dev.c b/net/core/dev.c
index 9a67003e49db87f3f92b6c6296b3e7a5ca9d9171..7ac59835edef657e9558d4d4fc0a76b171aace93
100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4298,7 +4298,9 @@ int __dev_queue_xmit(struct sk_buff *skb, struct
net_device *sb_dev)

        trace_net_dev_queue(skb);
        if (q->enqueue) {
+               __this_cpu_inc(softnet_data.xmit.qdisc_recursion);
                rc = __dev_xmit_skb(skb, q, dev, txq);
+               __this_cpu_dec(softnet_data.xmit.qdisc_recursion);
                goto out;
        }

diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 5b38143659249e66718348e0ec4ed3c7bc21c13d..0f5f02e6744397d33ae2a72670ba7131aaa6942e
100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -237,8 +237,13 @@ tcf_mirred_forward(bool at_ingress, bool
want_ingress, struct sk_buff *skb)
 {
        int err;

-       if (!want_ingress)
-               err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
+       if (!want_ingress) {
+               if (__this_cpu_read(softnet_data.xmit.qdisc_recursion)) {
+                       // Queue to top level, or drop
+               } else {
+                       err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
+               }
+       }
        else if (!at_ingress)
                err = netif_rx(skb);
        else

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-03-27 13:23 ` Eric Dumazet
@ 2024-03-27 22:57   ` Jamal Hadi Salim
  2024-03-27 23:12     ` Jamal Hadi Salim
  2024-04-02 16:47     ` Eric Dumazet
  0 siblings, 2 replies; 21+ messages in thread
From: Jamal Hadi Salim @ 2024-03-27 22:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

On Wed, Mar 27, 2024 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Mar 27, 2024 at 12:03 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> >
> > When the mirred action is used on a classful egress qdisc and a packet is
> > mirrored or redirected to self we hit a qdisc lock deadlock.
> > See trace below.
> >
> > [..... other info removed for brevity....]
> > [   82.890906]
> > [   82.890906] ============================================
> > [   82.890906] WARNING: possible recursive locking detected
> > [   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
> > [   82.890906] --------------------------------------------
> > [   82.890906] ping/418 is trying to acquire lock:
> > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > __dev_queue_xmit+0x1778/0x3550
> > [   82.890906]
> > [   82.890906] but task is already holding lock:
> > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > __dev_queue_xmit+0x1778/0x3550
> > [   82.890906]
> > [   82.890906] other info that might help us debug this:
> > [   82.890906]  Possible unsafe locking scenario:
> > [   82.890906]
> > [   82.890906]        CPU0
> > [   82.890906]        ----
> > [   82.890906]   lock(&sch->q.lock);
> > [   82.890906]   lock(&sch->q.lock);
> > [   82.890906]
> > [   82.890906]  *** DEADLOCK ***
> > [   82.890906]
> > [..... other info removed for brevity....]
> >
> > Example setup (eth0->eth0) to recreate
> > tc qdisc add dev eth0 root handle 1: htb default 30
> > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> >      action mirred egress redirect dev eth0
> >
> > Another example(eth0->eth1->eth0) to recreate
> > tc qdisc add dev eth0 root handle 1: htb default 30
> > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> >      action mirred egress redirect dev eth1
> >
> > tc qdisc add dev eth1 root handle 1: htb default 30
> > tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
> >      action mirred egress redirect dev eth0
> >
> > We fix this by adding a per-cpu, per-qdisc recursion counter which is
> > incremented the first time a root qdisc is entered and on a second attempt
> > enter the same root qdisc from the top, the packet is dropped to break the
> > loop.
> >
> > Reported-by: renmingshuai@huawei.com
> > Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
> > Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
> > Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
> > Co-developed-by: Victor Nogueira <victor@mojatatu.com>
> > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> > ---
> >  include/net/sch_generic.h |  2 ++
> >  net/core/dev.c            |  9 +++++++++
> >  net/sched/sch_api.c       | 12 ++++++++++++
> >  net/sched/sch_generic.c   |  2 ++
> >  4 files changed, 25 insertions(+)
> >
> > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > index cefe0c4bdae3..f9f99df037ed 100644
> > --- a/include/net/sch_generic.h
> > +++ b/include/net/sch_generic.h
> > @@ -125,6 +125,8 @@ struct Qdisc {
> >         spinlock_t              busylock ____cacheline_aligned_in_smp;
> >         spinlock_t              seqlock;
> >
> > +       u16 __percpu            *xmit_recursion;
> > +
> >         struct rcu_head         rcu;
> >         netdevice_tracker       dev_tracker;
> >         /* private data */
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 9a67003e49db..2b712388c06f 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> >         if (unlikely(contended))
> >                 spin_lock(&q->busylock);
>
> This could hang here (busylock)

Notice the goto free_skb_list has an spin_unlock(&q->busylock);  in
its code vicinity. Am I missing something?

>
> >
> > +       if (__this_cpu_read(*q->xmit_recursion) > 0) {
> > +               __qdisc_drop(skb, &to_free);
> > +               rc = NET_XMIT_DROP;
> > +               goto free_skb_list;
> > +       }
>
>
> I do not think we want to add yet another cache line miss and
> complexity in tx fast path.
>

I empathize. The cache miss is due to a per-cpu variable? Otherwise
that seems to be in the vicinity of the other fields being accessed in
__dev_xmit_skb()

> I think that mirred should  use a separate queue to kick a transmit
> from the top level.
>
> (Like netif_rx() does)
>

Eric, here's my concern: this would entail restructuring mirred
totally just to cater for one use case which is in itself _a bad
config_ for egress qdisc case only. Mirred is very heavily used in
many use cases and changing its behavior could likely introduce other
corner cases for some use cases which we would be chasing for a while.
Not to forget now we have to go via an extra transient queue.
If i understood what you are suggesting is to add an equivalent of
backlog queu for the tx side? I am assuming in a very similar nature
to backlog, meaning per cpu fired by softirq? or is it something
closer to qdisc->gso_skb
For either of those cases, the amount of infrastructure code there is
not a few lines of code. And then there's the desire to break the loop
etc.

Some questions regarding your proposal - something I am not following
And i may have misunderstood what you are suggesting, but i am missing
what scenario mirred can directly call tcf_dev_queue_xmit() (see my
comment below)..

> Using a softnet.xmit_qdisc_recursion (not a qdisc-per-cpu thing),
> would allow mirred to bypass this additional queue
> in most cases.
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index cb37817d6382c29117afd8ce54db6dba94f8c930..62ba5ef554860496ee928f7ed6b7c3ea46b8ee1d
> 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -3217,7 +3217,8 @@ struct softnet_data {
>  #endif
>         /* written and read only by owning cpu: */
>         struct {
> -               u16 recursion;
> +               u8 recursion;
> +               u8 qdisc_recursion;
>                 u8  more;
>  #ifdef CONFIG_NET_EGRESS
>                 u8  skip_txqueue;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 9a67003e49db87f3f92b6c6296b3e7a5ca9d9171..7ac59835edef657e9558d4d4fc0a76b171aace93
> 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4298,7 +4298,9 @@ int __dev_queue_xmit(struct sk_buff *skb, struct
> net_device *sb_dev)
>
>         trace_net_dev_queue(skb);
>         if (q->enqueue) {
> +               __this_cpu_inc(softnet_data.xmit.qdisc_recursion);

This increments the count by 1..

>                 rc = __dev_xmit_skb(skb, q, dev, txq);
> +               __this_cpu_dec(softnet_data.xmit.qdisc_recursion);
>                 goto out;
>         }
>
> diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
> index 5b38143659249e66718348e0ec4ed3c7bc21c13d..0f5f02e6744397d33ae2a72670ba7131aaa6942e
> 100644
> --- a/net/sched/act_mirred.c
> +++ b/net/sched/act_mirred.c
> @@ -237,8 +237,13 @@ tcf_mirred_forward(bool at_ingress, bool
> want_ingress, struct sk_buff *skb)
>  {
>         int err;
>
> -       if (!want_ingress)
> -               err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
> +       if (!want_ingress) {
> +               if (__this_cpu_read(softnet_data.xmit.qdisc_recursion)) {

Where does the defered
So this will always be 1 assuming the defer queue will have to be
something like a workqueue

> +                       // Queue to top level, or drop
> +               } else {

and we'll never enter this..

> +                       err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
> +               }
> +       }
>         else if (!at_ingress)
>                 err = netif_rx(skb);
>         else

cheers,
jamal

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-03-27 22:57   ` Jamal Hadi Salim
@ 2024-03-27 23:12     ` Jamal Hadi Salim
  2024-04-02  2:00       ` renmingshuai
  2024-04-02 16:47     ` Eric Dumazet
  1 sibling, 1 reply; 21+ messages in thread
From: Jamal Hadi Salim @ 2024-03-27 23:12 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

On Wed, Mar 27, 2024 at 6:57 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On Wed, Mar 27, 2024 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Wed, Mar 27, 2024 at 12:03 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > >
> > > When the mirred action is used on a classful egress qdisc and a packet is
> > > mirrored or redirected to self we hit a qdisc lock deadlock.
> > > See trace below.
> > >
> > > [..... other info removed for brevity....]
> > > [   82.890906]
> > > [   82.890906] ============================================
> > > [   82.890906] WARNING: possible recursive locking detected
> > > [   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
> > > [   82.890906] --------------------------------------------
> > > [   82.890906] ping/418 is trying to acquire lock:
> > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > __dev_queue_xmit+0x1778/0x3550
> > > [   82.890906]
> > > [   82.890906] but task is already holding lock:
> > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > __dev_queue_xmit+0x1778/0x3550
> > > [   82.890906]
> > > [   82.890906] other info that might help us debug this:
> > > [   82.890906]  Possible unsafe locking scenario:
> > > [   82.890906]
> > > [   82.890906]        CPU0
> > > [   82.890906]        ----
> > > [   82.890906]   lock(&sch->q.lock);
> > > [   82.890906]   lock(&sch->q.lock);
> > > [   82.890906]
> > > [   82.890906]  *** DEADLOCK ***
> > > [   82.890906]
> > > [..... other info removed for brevity....]
> > >
> > > Example setup (eth0->eth0) to recreate
> > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > >      action mirred egress redirect dev eth0
> > >
> > > Another example(eth0->eth1->eth0) to recreate
> > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > >      action mirred egress redirect dev eth1
> > >
> > > tc qdisc add dev eth1 root handle 1: htb default 30
> > > tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
> > >      action mirred egress redirect dev eth0
> > >
> > > We fix this by adding a per-cpu, per-qdisc recursion counter which is
> > > incremented the first time a root qdisc is entered and on a second attempt
> > > enter the same root qdisc from the top, the packet is dropped to break the
> > > loop.
> > >
> > > Reported-by: renmingshuai@huawei.com
> > > Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
> > > Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
> > > Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
> > > Co-developed-by: Victor Nogueira <victor@mojatatu.com>
> > > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> > > ---
> > >  include/net/sch_generic.h |  2 ++
> > >  net/core/dev.c            |  9 +++++++++
> > >  net/sched/sch_api.c       | 12 ++++++++++++
> > >  net/sched/sch_generic.c   |  2 ++
> > >  4 files changed, 25 insertions(+)
> > >
> > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > > index cefe0c4bdae3..f9f99df037ed 100644
> > > --- a/include/net/sch_generic.h
> > > +++ b/include/net/sch_generic.h
> > > @@ -125,6 +125,8 @@ struct Qdisc {
> > >         spinlock_t              busylock ____cacheline_aligned_in_smp;
> > >         spinlock_t              seqlock;
> > >
> > > +       u16 __percpu            *xmit_recursion;
> > > +
> > >         struct rcu_head         rcu;
> > >         netdevice_tracker       dev_tracker;
> > >         /* private data */
> > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > index 9a67003e49db..2b712388c06f 100644
> > > --- a/net/core/dev.c
> > > +++ b/net/core/dev.c
> > > @@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> > >         if (unlikely(contended))
> > >                 spin_lock(&q->busylock);
> >
> > This could hang here (busylock)
>
> Notice the goto free_skb_list has an spin_unlock(&q->busylock);  in
> its code vicinity. Am I missing something?
>
> >
> > >
> > > +       if (__this_cpu_read(*q->xmit_recursion) > 0) {
> > > +               __qdisc_drop(skb, &to_free);
> > > +               rc = NET_XMIT_DROP;
> > > +               goto free_skb_list;
> > > +       }
> >
> >
> > I do not think we want to add yet another cache line miss and
> > complexity in tx fast path.
> >
>
> I empathize. The cache miss is due to a per-cpu variable? Otherwise
> that seems to be in the vicinity of the other fields being accessed in
> __dev_xmit_skb()
>
> > I think that mirred should  use a separate queue to kick a transmit
> > from the top level.
> >
> > (Like netif_rx() does)
> >
>
> Eric, here's my concern: this would entail restructuring mirred
> totally just to cater for one use case which is in itself _a bad
> config_ for egress qdisc case only. Mirred is very heavily used in
> many use cases and changing its behavior could likely introduce other
> corner cases for some use cases which we would be chasing for a while.
> Not to forget now we have to go via an extra transient queue.
> If i understood what you are suggesting is to add an equivalent of
> backlog queu for the tx side? I am assuming in a very similar nature
> to backlog, meaning per cpu fired by softirq? or is it something
> closer to qdisc->gso_skb
> For either of those cases, the amount of infrastructure code there is
> not a few lines of code. And then there's the desire to break the loop
> etc.
>
> Some questions regarding your proposal - something I am not following
> And i may have misunderstood what you are suggesting, but i am missing
> what scenario mirred can directly call tcf_dev_queue_xmit() (see my
> comment below)..
>
> > Using a softnet.xmit_qdisc_recursion (not a qdisc-per-cpu thing),
> > would allow mirred to bypass this additional queue
> > in most cases.
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index cb37817d6382c29117afd8ce54db6dba94f8c930..62ba5ef554860496ee928f7ed6b7c3ea46b8ee1d
> > 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -3217,7 +3217,8 @@ struct softnet_data {
> >  #endif
> >         /* written and read only by owning cpu: */
> >         struct {
> > -               u16 recursion;
> > +               u8 recursion;
> > +               u8 qdisc_recursion;
> >                 u8  more;
> >  #ifdef CONFIG_NET_EGRESS
> >                 u8  skip_txqueue;
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 9a67003e49db87f3f92b6c6296b3e7a5ca9d9171..7ac59835edef657e9558d4d4fc0a76b171aace93
> > 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -4298,7 +4298,9 @@ int __dev_queue_xmit(struct sk_buff *skb, struct
> > net_device *sb_dev)
> >
> >         trace_net_dev_queue(skb);
> >         if (q->enqueue) {
> > +               __this_cpu_inc(softnet_data.xmit.qdisc_recursion);
>
> This increments the count by 1..
>
> >                 rc = __dev_xmit_skb(skb, q, dev, txq);
> > +               __this_cpu_dec(softnet_data.xmit.qdisc_recursion);
> >                 goto out;
> >         }
> >
> > diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
> > index 5b38143659249e66718348e0ec4ed3c7bc21c13d..0f5f02e6744397d33ae2a72670ba7131aaa6942e
> > 100644
> > --- a/net/sched/act_mirred.c
> > +++ b/net/sched/act_mirred.c
> > @@ -237,8 +237,13 @@ tcf_mirred_forward(bool at_ingress, bool
> > want_ingress, struct sk_buff *skb)
> >  {
> >         int err;
> >
> > -       if (!want_ingress)
> > -               err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
> > +       if (!want_ingress) {
> > +               if (__this_cpu_read(softnet_data.xmit.qdisc_recursion)) {
>
> Where does the defered
> So this will always be 1 assuming the defer queue will have to be
> something like a workqueue

Sorry, sent too fast - meant we would always enter here..

> > +                       // Queue to top level, or drop
> > +               } else {
>
> and we'll never enter this..
>
> > +                       err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
> > +               }
> > +       }
> >         else if (!at_ingress)
> >                 err = netif_rx(skb);
> >         else
>
> cheers,
> jamal

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-03-27 23:12     ` Jamal Hadi Salim
@ 2024-04-02  2:00       ` renmingshuai
  2024-04-02 16:38         ` Jamal Hadi Salim
  0 siblings, 1 reply; 21+ messages in thread
From: renmingshuai @ 2024-04-02  2:00 UTC (permalink / raw)
  To: jhs
  Cc: davem, edumazet, jiri, kuba, netdev, pabeni, renmingshuai,
	victor, xiyou.wangcong

> On Wed, Mar 27, 2024 at 6:57 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> >
> > On Wed, Mar 27, 2024 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > On Wed, Mar 27, 2024 at 12:03 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > >
> > > > When the mirred action is used on a classful egress qdisc and a packet is
> > > > mirrored or redirected to self we hit a qdisc lock deadlock.
> > > > See trace below.
> > > >
> > > > [..... other info removed for brevity....]
> > > > [   82.890906]
> > > > [   82.890906] ============================================
> > > > [   82.890906] WARNING: possible recursive locking detected
> > > > [   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
> > > > [   82.890906] --------------------------------------------
> > > > [   82.890906] ping/418 is trying to acquire lock:
> > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > __dev_queue_xmit+0x1778/0x3550
> > > > [   82.890906]
> > > > [   82.890906] but task is already holding lock:
> > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > __dev_queue_xmit+0x1778/0x3550
> > > > [   82.890906]
> > > > [   82.890906] other info that might help us debug this:
> > > > [   82.890906]  Possible unsafe locking scenario:
> > > > [   82.890906]
> > > > [   82.890906]        CPU0
> > > > [   82.890906]        ----
> > > > [   82.890906]   lock(&sch->q.lock);
> > > > [   82.890906]   lock(&sch->q.lock);
> > > > [   82.890906]
> > > > [   82.890906]  *** DEADLOCK ***
> > > > [   82.890906]
> > > > [..... other info removed for brevity....]
> > > >
> > > > Example setup (eth0->eth0) to recreate
> > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > >      action mirred egress redirect dev eth0
> > > >
> > > > Another example(eth0->eth1->eth0) to recreate
> > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > >      action mirred egress redirect dev eth1
> > > >
> > > > tc qdisc add dev eth1 root handle 1: htb default 30
> > > > tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
> > > >      action mirred egress redirect dev eth0
> > > >
> > > > We fix this by adding a per-cpu, per-qdisc recursion counter which is
> > > > incremented the first time a root qdisc is entered and on a second attempt
> > > > enter the same root qdisc from the top, the packet is dropped to break the
> > > > loop.
> > > >
> > > > Reported-by: renmingshuai@huawei.com
> > > > Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
> > > > Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
> > > > Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
> > > > Co-developed-by: Victor Nogueira <victor@mojatatu.com>
> > > > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > > > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> > > > ---
> > > >  include/net/sch_generic.h |  2 ++
> > > >  net/core/dev.c            |  9 +++++++++
> > > >  net/sched/sch_api.c       | 12 ++++++++++++
> > > >  net/sched/sch_generic.c   |  2 ++
> > > >  4 files changed, 25 insertions(+)
> > > >
> > > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > > > index cefe0c4bdae3..f9f99df037ed 100644
> > > > --- a/include/net/sch_generic.h
> > > > +++ b/include/net/sch_generic.h
> > > > @@ -125,6 +125,8 @@ struct Qdisc {
> > > >         spinlock_t              busylock ____cacheline_aligned_in_smp;
> > > >         spinlock_t              seqlock;
> > > >
> > > > +       u16 __percpu            *xmit_recursion;
> > > > +
> > > >         struct rcu_head         rcu;
> > > >         netdevice_tracker       dev_tracker;
> > > >         /* private data */
> > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > index 9a67003e49db..2b712388c06f 100644
> > > > --- a/net/core/dev.c
> > > > +++ b/net/core/dev.c
> > > > @@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> > > >         if (unlikely(contended))
> > > >                 spin_lock(&q->busylock);
> > >
> > > This could hang here (busylock)
> >
> > Notice the goto free_skb_list has an spin_unlock(&q->busylock);  in
> > its code vicinity. Am I missing something?
> >
> > >
> > > >
> > > > +       if (__this_cpu_read(*q->xmit_recursion) > 0) {
> > > > +               __qdisc_drop(skb, &to_free);
> > > > +               rc = NET_XMIT_DROP;
> > > > +               goto free_skb_list;
> > > > +       }
> > >
> > >
> > > I do not think we want to add yet another cache line miss and
> > > complexity in tx fast path.
> > >
> >
> > I empathize. The cache miss is due to a per-cpu variable? Otherwise
> > that seems to be in the vicinity of the other fields being accessed in
> > __dev_xmit_skb()
> >
> > > I think that mirred should  use a separate queue to kick a transmit
> > > from the top level.
> > >
> > > (Like netif_rx() does)
> > >
> >
> > Eric, here's my concern: this would entail restructuring mirred
> > totally just to cater for one use case which is in itself _a bad
> > config_ for egress qdisc case only. Mirred is very heavily used in
> > many use cases and changing its behavior could likely introduce other
> > corner cases for some use cases which we would be chasing for a while.
> > Not to forget now we have to go via an extra transient queue.
> > If i understood what you are suggesting is to add an equivalent of
> > backlog queu for the tx side? I am assuming in a very similar nature
> > to backlog, meaning per cpu fired by softirq? or is it something
> > closer to qdisc->gso_skb
> > For either of those cases, the amount of infrastructure code there is
> > not a few lines of code. And then there's the desire to break the loop
> > etc.
> >
> > Some questions regarding your proposal - something I am not following
> > And i may have misunderstood what you are suggesting, but i am missing
> > what scenario mirred can directly call tcf_dev_queue_xmit() (see my
> > comment below)..
> >
> > > Using a softnet.xmit_qdisc_recursion (not a qdisc-per-cpu thing),
> > > would allow mirred to bypass this additional queue
> > > in most cases.
> > >
> > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > > index cb37817d6382c29117afd8ce54db6dba94f8c930..62ba5ef554860496ee928f7ed6b7c3ea46b8ee1d
> > > 100644
> > > --- a/include/linux/netdevice.h
> > > +++ b/include/linux/netdevice.h
> > > @@ -3217,7 +3217,8 @@ struct softnet_data {
> > >  #endif
> > >         /* written and read only by owning cpu: */
> > >         struct {
> > > -               u16 recursion;
> > > +               u8 recursion;
> > > +               u8 qdisc_recursion;
> > >                 u8  more;
> > >  #ifdef CONFIG_NET_EGRESS
> > >                 u8  skip_txqueue;
> > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > index 9a67003e49db87f3f92b6c6296b3e7a5ca9d9171..7ac59835edef657e9558d4d4fc0a76b171aace93
> > > 100644
> > > --- a/net/core/dev.c
> > > +++ b/net/core/dev.c
> > > @@ -4298,7 +4298,9 @@ int __dev_queue_xmit(struct sk_buff *skb, struct
> > > net_device *sb_dev)
> > >
> > >         trace_net_dev_queue(skb);
> > >         if (q->enqueue) {
> > > +               __this_cpu_inc(softnet_data.xmit.qdisc_recursion);
> >
> > This increments the count by 1..
> >
> > >                 rc = __dev_xmit_skb(skb, q, dev, txq);
> > > +               __this_cpu_dec(softnet_data.xmit.qdisc_recursion);
> > >                 goto out;
> > >         }
> > >
> > > diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
> > > index 5b38143659249e66718348e0ec4ed3c7bc21c13d..0f5f02e6744397d33ae2a72670ba7131aaa6942e
> > > 100644
> > > --- a/net/sched/act_mirred.c
> > > +++ b/net/sched/act_mirred.c
> > > @@ -237,8 +237,13 @@ tcf_mirred_forward(bool at_ingress, bool
> > > want_ingress, struct sk_buff *skb)
> > >  {
> > >         int err;
> > >
> > > -       if (!want_ingress)
> > > -               err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
> > > +       if (!want_ingress) {
> > > +               if (__this_cpu_read(softnet_data.xmit.qdisc_recursion)) {
> >
> > Where does the defered
> > So this will always be 1 assuming the defer queue will have to be
> > something like a workqueue
> 
> Sorry, sent too fast - meant we would always enter here..
Is there a final fix?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-02  2:00       ` renmingshuai
@ 2024-04-02 16:38         ` Jamal Hadi Salim
  0 siblings, 0 replies; 21+ messages in thread
From: Jamal Hadi Salim @ 2024-04-02 16:38 UTC (permalink / raw)
  To: renmingshuai
  Cc: davem, edumazet, jiri, kuba, netdev, pabeni, victor, xiyou.wangcong

On Mon, Apr 1, 2024 at 10:15 PM renmingshuai <renmingshuai@huawei.com> wrote:
>
> > On Wed, Mar 27, 2024 at 6:57 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > >
> > > On Wed, Mar 27, 2024 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
> > > >
> > > > On Wed, Mar 27, 2024 at 12:03 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > > >
> > > > > When the mirred action is used on a classful egress qdisc and a packet is
> > > > > mirrored or redirected to self we hit a qdisc lock deadlock.
> > > > > See trace below.
> > > > >
> > > > > [..... other info removed for brevity....]
> > > > > [   82.890906]
> > > > > [   82.890906] ============================================
> > > > > [   82.890906] WARNING: possible recursive locking detected
> > > > > [   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
> > > > > [   82.890906] --------------------------------------------
> > > > > [   82.890906] ping/418 is trying to acquire lock:
> > > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > > __dev_queue_xmit+0x1778/0x3550
> > > > > [   82.890906]
> > > > > [   82.890906] but task is already holding lock:
> > > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > > __dev_queue_xmit+0x1778/0x3550
> > > > > [   82.890906]
> > > > > [   82.890906] other info that might help us debug this:
> > > > > [   82.890906]  Possible unsafe locking scenario:
> > > > > [   82.890906]
> > > > > [   82.890906]        CPU0
> > > > > [   82.890906]        ----
> > > > > [   82.890906]   lock(&sch->q.lock);
> > > > > [   82.890906]   lock(&sch->q.lock);
> > > > > [   82.890906]
> > > > > [   82.890906]  *** DEADLOCK ***
> > > > > [   82.890906]
> > > > > [..... other info removed for brevity....]
> > > > >
> > > > > Example setup (eth0->eth0) to recreate
> > > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > > >      action mirred egress redirect dev eth0
> > > > >
> > > > > Another example(eth0->eth1->eth0) to recreate
> > > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > > >      action mirred egress redirect dev eth1
> > > > >
> > > > > tc qdisc add dev eth1 root handle 1: htb default 30
> > > > > tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
> > > > >      action mirred egress redirect dev eth0
> > > > >
> > > > > We fix this by adding a per-cpu, per-qdisc recursion counter which is
> > > > > incremented the first time a root qdisc is entered and on a second attempt
> > > > > enter the same root qdisc from the top, the packet is dropped to break the
> > > > > loop.
> > > > >
> > > > > Reported-by: renmingshuai@huawei.com
> > > > > Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
> > > > > Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
> > > > > Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
> > > > > Co-developed-by: Victor Nogueira <victor@mojatatu.com>
> > > > > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > > > > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> > > > > ---
> > > > >  include/net/sch_generic.h |  2 ++
> > > > >  net/core/dev.c            |  9 +++++++++
> > > > >  net/sched/sch_api.c       | 12 ++++++++++++
> > > > >  net/sched/sch_generic.c   |  2 ++
> > > > >  4 files changed, 25 insertions(+)
> > > > >
> > > > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > > > > index cefe0c4bdae3..f9f99df037ed 100644
> > > > > --- a/include/net/sch_generic.h
> > > > > +++ b/include/net/sch_generic.h
> > > > > @@ -125,6 +125,8 @@ struct Qdisc {
> > > > >         spinlock_t              busylock ____cacheline_aligned_in_smp;
> > > > >         spinlock_t              seqlock;
> > > > >
> > > > > +       u16 __percpu            *xmit_recursion;
> > > > > +
> > > > >         struct rcu_head         rcu;
> > > > >         netdevice_tracker       dev_tracker;
> > > > >         /* private data */
> > > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > > index 9a67003e49db..2b712388c06f 100644
> > > > > --- a/net/core/dev.c
> > > > > +++ b/net/core/dev.c
> > > > > @@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> > > > >         if (unlikely(contended))
> > > > >                 spin_lock(&q->busylock);
> > > >
> > > > This could hang here (busylock)
> > >
> > > Notice the goto free_skb_list has an spin_unlock(&q->busylock);  in
> > > its code vicinity. Am I missing something?
> > >
> > > >
> > > > >
> > > > > +       if (__this_cpu_read(*q->xmit_recursion) > 0) {
> > > > > +               __qdisc_drop(skb, &to_free);
> > > > > +               rc = NET_XMIT_DROP;
> > > > > +               goto free_skb_list;
> > > > > +       }
> > > >
> > > >
> > > > I do not think we want to add yet another cache line miss and
> > > > complexity in tx fast path.
> > > >
> > >
> > > I empathize. The cache miss is due to a per-cpu variable? Otherwise
> > > that seems to be in the vicinity of the other fields being accessed in
> > > __dev_xmit_skb()
> > >
> > > > I think that mirred should  use a separate queue to kick a transmit
> > > > from the top level.
> > > >
> > > > (Like netif_rx() does)
> > > >
> > >
> > > Eric, here's my concern: this would entail restructuring mirred
> > > totally just to cater for one use case which is in itself _a bad
> > > config_ for egress qdisc case only. Mirred is very heavily used in
> > > many use cases and changing its behavior could likely introduce other
> > > corner cases for some use cases which we would be chasing for a while.
> > > Not to forget now we have to go via an extra transient queue.
> > > If i understood what you are suggesting is to add an equivalent of
> > > backlog queu for the tx side? I am assuming in a very similar nature
> > > to backlog, meaning per cpu fired by softirq? or is it something
> > > closer to qdisc->gso_skb
> > > For either of those cases, the amount of infrastructure code there is
> > > not a few lines of code. And then there's the desire to break the loop
> > > etc.
> > >
> > > Some questions regarding your proposal - something I am not following
> > > And i may have misunderstood what you are suggesting, but i am missing
> > > what scenario mirred can directly call tcf_dev_queue_xmit() (see my
> > > comment below)..
> > >
> > > > Using a softnet.xmit_qdisc_recursion (not a qdisc-per-cpu thing),
> > > > would allow mirred to bypass this additional queue
> > > > in most cases.
> > > >
> > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > > > index cb37817d6382c29117afd8ce54db6dba94f8c930..62ba5ef554860496ee928f7ed6b7c3ea46b8ee1d
> > > > 100644
> > > > --- a/include/linux/netdevice.h
> > > > +++ b/include/linux/netdevice.h
> > > > @@ -3217,7 +3217,8 @@ struct softnet_data {
> > > >  #endif
> > > >         /* written and read only by owning cpu: */
> > > >         struct {
> > > > -               u16 recursion;
> > > > +               u8 recursion;
> > > > +               u8 qdisc_recursion;
> > > >                 u8  more;
> > > >  #ifdef CONFIG_NET_EGRESS
> > > >                 u8  skip_txqueue;
> > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > index 9a67003e49db87f3f92b6c6296b3e7a5ca9d9171..7ac59835edef657e9558d4d4fc0a76b171aace93
> > > > 100644
> > > > --- a/net/core/dev.c
> > > > +++ b/net/core/dev.c
> > > > @@ -4298,7 +4298,9 @@ int __dev_queue_xmit(struct sk_buff *skb, struct
> > > > net_device *sb_dev)
> > > >
> > > >         trace_net_dev_queue(skb);
> > > >         if (q->enqueue) {
> > > > +               __this_cpu_inc(softnet_data.xmit.qdisc_recursion);
> > >
> > > This increments the count by 1..
> > >
> > > >                 rc = __dev_xmit_skb(skb, q, dev, txq);
> > > > +               __this_cpu_dec(softnet_data.xmit.qdisc_recursion);
> > > >                 goto out;
> > > >         }
> > > >
> > > > diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
> > > > index 5b38143659249e66718348e0ec4ed3c7bc21c13d..0f5f02e6744397d33ae2a72670ba7131aaa6942e
> > > > 100644
> > > > --- a/net/sched/act_mirred.c
> > > > +++ b/net/sched/act_mirred.c
> > > > @@ -237,8 +237,13 @@ tcf_mirred_forward(bool at_ingress, bool
> > > > want_ingress, struct sk_buff *skb)
> > > >  {
> > > >         int err;
> > > >
> > > > -       if (!want_ingress)
> > > > -               err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
> > > > +       if (!want_ingress) {
> > > > +               if (__this_cpu_read(softnet_data.xmit.qdisc_recursion)) {
> > >
> > > Where does the defered
> > > So this will always be 1 assuming the defer queue will have to be
> > > something like a workqueue
> >
> > Sorry, sent too fast - meant we would always enter here..
> Is there a final fix?

That fix should work - but you can view this as "discussion in progress" ;->

cheers,
jamal

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-03-27 22:57   ` Jamal Hadi Salim
  2024-03-27 23:12     ` Jamal Hadi Salim
@ 2024-04-02 16:47     ` Eric Dumazet
  2024-04-02 17:35       ` Jamal Hadi Salim
  1 sibling, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2024-04-02 16:47 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

On Wed, Mar 27, 2024 at 11:58 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On Wed, Mar 27, 2024 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Wed, Mar 27, 2024 at 12:03 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > >
> > > When the mirred action is used on a classful egress qdisc and a packet is
> > > mirrored or redirected to self we hit a qdisc lock deadlock.
> > > See trace below.
> > >
> > > [..... other info removed for brevity....]
> > > [   82.890906]
> > > [   82.890906] ============================================
> > > [   82.890906] WARNING: possible recursive locking detected
> > > [   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
> > > [   82.890906] --------------------------------------------
> > > [   82.890906] ping/418 is trying to acquire lock:
> > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > __dev_queue_xmit+0x1778/0x3550
> > > [   82.890906]
> > > [   82.890906] but task is already holding lock:
> > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > __dev_queue_xmit+0x1778/0x3550
> > > [   82.890906]
> > > [   82.890906] other info that might help us debug this:
> > > [   82.890906]  Possible unsafe locking scenario:
> > > [   82.890906]
> > > [   82.890906]        CPU0
> > > [   82.890906]        ----
> > > [   82.890906]   lock(&sch->q.lock);
> > > [   82.890906]   lock(&sch->q.lock);
> > > [   82.890906]
> > > [   82.890906]  *** DEADLOCK ***
> > > [   82.890906]
> > > [..... other info removed for brevity....]
> > >
> > > Example setup (eth0->eth0) to recreate
> > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > >      action mirred egress redirect dev eth0
> > >
> > > Another example(eth0->eth1->eth0) to recreate
> > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > >      action mirred egress redirect dev eth1
> > >
> > > tc qdisc add dev eth1 root handle 1: htb default 30
> > > tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
> > >      action mirred egress redirect dev eth0
> > >
> > > We fix this by adding a per-cpu, per-qdisc recursion counter which is
> > > incremented the first time a root qdisc is entered and on a second attempt
> > > enter the same root qdisc from the top, the packet is dropped to break the
> > > loop.
> > >
> > > Reported-by: renmingshuai@huawei.com
> > > Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
> > > Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
> > > Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
> > > Co-developed-by: Victor Nogueira <victor@mojatatu.com>
> > > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> > > ---
> > >  include/net/sch_generic.h |  2 ++
> > >  net/core/dev.c            |  9 +++++++++
> > >  net/sched/sch_api.c       | 12 ++++++++++++
> > >  net/sched/sch_generic.c   |  2 ++
> > >  4 files changed, 25 insertions(+)
> > >
> > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > > index cefe0c4bdae3..f9f99df037ed 100644
> > > --- a/include/net/sch_generic.h
> > > +++ b/include/net/sch_generic.h
> > > @@ -125,6 +125,8 @@ struct Qdisc {
> > >         spinlock_t              busylock ____cacheline_aligned_in_smp;
> > >         spinlock_t              seqlock;
> > >
> > > +       u16 __percpu            *xmit_recursion;
> > > +
> > >         struct rcu_head         rcu;
> > >         netdevice_tracker       dev_tracker;
> > >         /* private data */
> > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > index 9a67003e49db..2b712388c06f 100644
> > > --- a/net/core/dev.c
> > > +++ b/net/core/dev.c
> > > @@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> > >         if (unlikely(contended))
> > >                 spin_lock(&q->busylock);
> >
> > This could hang here (busylock)
>
> Notice the goto free_skb_list has an spin_unlock(&q->busylock);  in
> its code vicinity. Am I missing something?

The hang would happen in above spin_lock(&q->busylock), before you can
get a chance...

If you want to test your patch, add this debugging feature, pretending
the spinlock is contended.

diff --git a/net/core/dev.c b/net/core/dev.c
index 818699dea9d7040ee74532ccdebf01c4fd6887cc..b2fe3aa2716f0fe128ef10f9d06c2431b3246933
100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3816,7 +3816,7 @@ static inline int __dev_xmit_skb(struct sk_buff
*skb, struct Qdisc *q,
         * sent after the qdisc owner is scheduled again. To prevent this
         * scenario the task always serialize on the lock.
         */
-       contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
+       contended = true; // DEBUG for Jamal
        if (unlikely(contended))
                spin_lock(&q->busylock);



>
> >
> > >
> > > +       if (__this_cpu_read(*q->xmit_recursion) > 0) {
> > > +               __qdisc_drop(skb, &to_free);
> > > +               rc = NET_XMIT_DROP;
> > > +               goto free_skb_list;
> > > +       }
> >
> >
> > I do not think we want to add yet another cache line miss and
> > complexity in tx fast path.
> >
>
> I empathize. The cache miss is due to a per-cpu variable? Otherwise
> that seems to be in the vicinity of the other fields being accessed in
> __dev_xmit_skb()
>
> > I think that mirred should  use a separate queue to kick a transmit
> > from the top level.
> >
> > (Like netif_rx() does)
> >
>
> Eric, here's my concern: this would entail restructuring mirred
> totally just to cater for one use case which is in itself _a bad
> config_ for egress qdisc case only.

Why can't the bad config be detected in the ctl path ?

 Mirred is very heavily used in
> many use cases and changing its behavior could likely introduce other
> corner cases for some use cases which we would be chasing for a while.
> Not to forget now we have to go via an extra transient queue.
> If i understood what you are suggesting is to add an equivalent of
> backlog queu for the tx side? I am assuming in a very similar nature
> to backlog, meaning per cpu fired by softirq? or is it something
> closer to qdisc->gso_skb
> For either of those cases, the amount of infrastructure code there is
> not a few lines of code. And then there's the desire to break the loop
> etc.
>
> Some questions regarding your proposal - something I am not following
> And i may have misunderstood what you are suggesting, but i am missing
> what scenario mirred can directly call tcf_dev_queue_xmit() (see my
> comment below)..

I wish the act path would run without qdisc spinlock held, and use RCU instead.

Instead, we are adding cost in fast path only to detect bad configs.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-02 16:47     ` Eric Dumazet
@ 2024-04-02 17:35       ` Jamal Hadi Salim
  2024-04-10 20:30         ` Jamal Hadi Salim
  0 siblings, 1 reply; 21+ messages in thread
From: Jamal Hadi Salim @ 2024-04-02 17:35 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

On Tue, Apr 2, 2024 at 12:47 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Mar 27, 2024 at 11:58 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> >
> > On Wed, Mar 27, 2024 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > On Wed, Mar 27, 2024 at 12:03 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > >
> > > > When the mirred action is used on a classful egress qdisc and a packet is
> > > > mirrored or redirected to self we hit a qdisc lock deadlock.
> > > > See trace below.
> > > >
> > > > [..... other info removed for brevity....]
> > > > [   82.890906]
> > > > [   82.890906] ============================================
> > > > [   82.890906] WARNING: possible recursive locking detected
> > > > [   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
> > > > [   82.890906] --------------------------------------------
> > > > [   82.890906] ping/418 is trying to acquire lock:
> > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > __dev_queue_xmit+0x1778/0x3550
> > > > [   82.890906]
> > > > [   82.890906] but task is already holding lock:
> > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > __dev_queue_xmit+0x1778/0x3550
> > > > [   82.890906]
> > > > [   82.890906] other info that might help us debug this:
> > > > [   82.890906]  Possible unsafe locking scenario:
> > > > [   82.890906]
> > > > [   82.890906]        CPU0
> > > > [   82.890906]        ----
> > > > [   82.890906]   lock(&sch->q.lock);
> > > > [   82.890906]   lock(&sch->q.lock);
> > > > [   82.890906]
> > > > [   82.890906]  *** DEADLOCK ***
> > > > [   82.890906]
> > > > [..... other info removed for brevity....]
> > > >
> > > > Example setup (eth0->eth0) to recreate
> > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > >      action mirred egress redirect dev eth0
> > > >
> > > > Another example(eth0->eth1->eth0) to recreate
> > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > >      action mirred egress redirect dev eth1
> > > >
> > > > tc qdisc add dev eth1 root handle 1: htb default 30
> > > > tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
> > > >      action mirred egress redirect dev eth0
> > > >
> > > > We fix this by adding a per-cpu, per-qdisc recursion counter which is
> > > > incremented the first time a root qdisc is entered and on a second attempt
> > > > enter the same root qdisc from the top, the packet is dropped to break the
> > > > loop.
> > > >
> > > > Reported-by: renmingshuai@huawei.com
> > > > Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
> > > > Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
> > > > Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
> > > > Co-developed-by: Victor Nogueira <victor@mojatatu.com>
> > > > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > > > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> > > > ---
> > > >  include/net/sch_generic.h |  2 ++
> > > >  net/core/dev.c            |  9 +++++++++
> > > >  net/sched/sch_api.c       | 12 ++++++++++++
> > > >  net/sched/sch_generic.c   |  2 ++
> > > >  4 files changed, 25 insertions(+)
> > > >
> > > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > > > index cefe0c4bdae3..f9f99df037ed 100644
> > > > --- a/include/net/sch_generic.h
> > > > +++ b/include/net/sch_generic.h
> > > > @@ -125,6 +125,8 @@ struct Qdisc {
> > > >         spinlock_t              busylock ____cacheline_aligned_in_smp;
> > > >         spinlock_t              seqlock;
> > > >
> > > > +       u16 __percpu            *xmit_recursion;
> > > > +
> > > >         struct rcu_head         rcu;
> > > >         netdevice_tracker       dev_tracker;
> > > >         /* private data */
> > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > index 9a67003e49db..2b712388c06f 100644
> > > > --- a/net/core/dev.c
> > > > +++ b/net/core/dev.c
> > > > @@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> > > >         if (unlikely(contended))
> > > >                 spin_lock(&q->busylock);
> > >
> > > This could hang here (busylock)
> >
> > Notice the goto free_skb_list has an spin_unlock(&q->busylock);  in
> > its code vicinity. Am I missing something?
>
> The hang would happen in above spin_lock(&q->busylock), before you can
> get a chance...
>
> If you want to test your patch, add this debugging feature, pretending
> the spinlock is contended.
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 818699dea9d7040ee74532ccdebf01c4fd6887cc..b2fe3aa2716f0fe128ef10f9d06c2431b3246933
> 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3816,7 +3816,7 @@ static inline int __dev_xmit_skb(struct sk_buff
> *skb, struct Qdisc *q,
>          * sent after the qdisc owner is scheduled again. To prevent this
>          * scenario the task always serialize on the lock.
>          */
> -       contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
> +       contended = true; // DEBUG for Jamal
>         if (unlikely(contended))
>                 spin_lock(&q->busylock);

Will do.

> >
> > >
> > > >
> > > > +       if (__this_cpu_read(*q->xmit_recursion) > 0) {
> > > > +               __qdisc_drop(skb, &to_free);
> > > > +               rc = NET_XMIT_DROP;
> > > > +               goto free_skb_list;
> > > > +       }
> > >
> > >
> > > I do not think we want to add yet another cache line miss and
> > > complexity in tx fast path.
> > >
> >
> > I empathize. The cache miss is due to a per-cpu variable? Otherwise
> > that seems to be in the vicinity of the other fields being accessed in
> > __dev_xmit_skb()
> >
> > > I think that mirred should  use a separate queue to kick a transmit
> > > from the top level.
> > >
> > > (Like netif_rx() does)
> > >
> >
> > Eric, here's my concern: this would entail restructuring mirred
> > totally just to cater for one use case which is in itself _a bad
> > config_ for egress qdisc case only.
>
> Why can't the bad config be detected in the ctl path ?
>

We actually discussed this. It looks simple until you dig in. To catch
this issue we will need to store some state across ports. This can be
done with a graph construct in the control plane. Assume the following
cases matching header "foo" (all using htb or other classful qdiscs):

case 1:
flower match foo on eth0 redirect to egress of eth0

This is easy to do. Your graph can check if src is eth0 and dst is eth0.

Case 2:
flower match foo on eth0 redirect to eth1
flower match foo on eth1 redirect to eth0

Now your graph has to keep state across netdevs. You can catch this
issue as well (but your "checking" code is now growing).

case 3:
flower match foo on eth0 redirect to eth1
flower match bar on eth1 redirect to eth0

There is clearly no loop here because we have different headers, but
you will have to add code to check for such a case.

case 4:
flower match foo on eth0 redirect to eth1
u32 match foo on eth1 redirect to eth0

Now you have to add code that checks all other classifiers(worse would
be ebpf) for matching headers. Worse is you have to consider wild
card. I am also likely missing some other use cases.
Also, I am sure we'll very quickly hear from people who want very fast
insertion rates of filters which now are going to be massively slowed
down when using mirred.
So i am sure it can be done, just not worth the effort given how many
lines of code would need to be added to catch a corner case of bad
config.

>  Mirred is very heavily used in
> > many use cases and changing its behavior could likely introduce other
> > corner cases for some use cases which we would be chasing for a while.
> > Not to forget now we have to go via an extra transient queue.
> > If i understood what you are suggesting is to add an equivalent of
> > backlog queu for the tx side? I am assuming in a very similar nature
> > to backlog, meaning per cpu fired by softirq? or is it something
> > closer to qdisc->gso_skb
> > For either of those cases, the amount of infrastructure code there is
> > not a few lines of code. And then there's the desire to break the loop
> > etc.
> >
> > Some questions regarding your proposal - something I am not following
> > And i may have misunderstood what you are suggesting, but i am missing
> > what scenario mirred can directly call tcf_dev_queue_xmit() (see my
> > comment below)..
>
> I wish the act path would run without qdisc spinlock held, and use RCU instead.
>

The main (only?) reason we need spinlock for qdiscs is for packet queues.

> Instead, we are adding cost in fast path only to detect bad configs.

Agreed but I dont know how we can totally avoid it. Would you be ok
with us using something that looks like qdisc->gso_skb and we check
such a list after releasing the qdisc lock in __dev_xmit_skb() to
invoke the redirect? The loop then can be caught inside mirred.

cheers,
jamal

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-02 17:35       ` Jamal Hadi Salim
@ 2024-04-10 20:30         ` Jamal Hadi Salim
  2024-04-15  9:20           ` Eric Dumazet
  0 siblings, 1 reply; 21+ messages in thread
From: Jamal Hadi Salim @ 2024-04-10 20:30 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

[-- Attachment #1: Type: text/plain, Size: 10224 bytes --]

On Tue, Apr 2, 2024 at 1:35 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On Tue, Apr 2, 2024 at 12:47 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Wed, Mar 27, 2024 at 11:58 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > >
> > > On Wed, Mar 27, 2024 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
> > > >
> > > > On Wed, Mar 27, 2024 at 12:03 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > > >
> > > > > When the mirred action is used on a classful egress qdisc and a packet is
> > > > > mirrored or redirected to self we hit a qdisc lock deadlock.
> > > > > See trace below.
> > > > >
> > > > > [..... other info removed for brevity....]
> > > > > [   82.890906]
> > > > > [   82.890906] ============================================
> > > > > [   82.890906] WARNING: possible recursive locking detected
> > > > > [   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
> > > > > [   82.890906] --------------------------------------------
> > > > > [   82.890906] ping/418 is trying to acquire lock:
> > > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > > __dev_queue_xmit+0x1778/0x3550
> > > > > [   82.890906]
> > > > > [   82.890906] but task is already holding lock:
> > > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > > __dev_queue_xmit+0x1778/0x3550
> > > > > [   82.890906]
> > > > > [   82.890906] other info that might help us debug this:
> > > > > [   82.890906]  Possible unsafe locking scenario:
> > > > > [   82.890906]
> > > > > [   82.890906]        CPU0
> > > > > [   82.890906]        ----
> > > > > [   82.890906]   lock(&sch->q.lock);
> > > > > [   82.890906]   lock(&sch->q.lock);
> > > > > [   82.890906]
> > > > > [   82.890906]  *** DEADLOCK ***
> > > > > [   82.890906]
> > > > > [..... other info removed for brevity....]
> > > > >
> > > > > Example setup (eth0->eth0) to recreate
> > > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > > >      action mirred egress redirect dev eth0
> > > > >
> > > > > Another example(eth0->eth1->eth0) to recreate
> > > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > > >      action mirred egress redirect dev eth1
> > > > >
> > > > > tc qdisc add dev eth1 root handle 1: htb default 30
> > > > > tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
> > > > >      action mirred egress redirect dev eth0
> > > > >
> > > > > We fix this by adding a per-cpu, per-qdisc recursion counter which is
> > > > > incremented the first time a root qdisc is entered and on a second attempt
> > > > > enter the same root qdisc from the top, the packet is dropped to break the
> > > > > loop.
> > > > >
> > > > > Reported-by: renmingshuai@huawei.com
> > > > > Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
> > > > > Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
> > > > > Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
> > > > > Co-developed-by: Victor Nogueira <victor@mojatatu.com>
> > > > > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > > > > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> > > > > ---
> > > > >  include/net/sch_generic.h |  2 ++
> > > > >  net/core/dev.c            |  9 +++++++++
> > > > >  net/sched/sch_api.c       | 12 ++++++++++++
> > > > >  net/sched/sch_generic.c   |  2 ++
> > > > >  4 files changed, 25 insertions(+)
> > > > >
> > > > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > > > > index cefe0c4bdae3..f9f99df037ed 100644
> > > > > --- a/include/net/sch_generic.h
> > > > > +++ b/include/net/sch_generic.h
> > > > > @@ -125,6 +125,8 @@ struct Qdisc {
> > > > >         spinlock_t              busylock ____cacheline_aligned_in_smp;
> > > > >         spinlock_t              seqlock;
> > > > >
> > > > > +       u16 __percpu            *xmit_recursion;
> > > > > +
> > > > >         struct rcu_head         rcu;
> > > > >         netdevice_tracker       dev_tracker;
> > > > >         /* private data */
> > > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > > index 9a67003e49db..2b712388c06f 100644
> > > > > --- a/net/core/dev.c
> > > > > +++ b/net/core/dev.c
> > > > > @@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> > > > >         if (unlikely(contended))
> > > > >                 spin_lock(&q->busylock);
> > > >
> > > > This could hang here (busylock)
> > >
> > > Notice the goto free_skb_list has an spin_unlock(&q->busylock);  in
> > > its code vicinity. Am I missing something?
> >
> > The hang would happen in above spin_lock(&q->busylock), before you can
> > get a chance...
> >
> > If you want to test your patch, add this debugging feature, pretending
> > the spinlock is contended.
> >
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 818699dea9d7040ee74532ccdebf01c4fd6887cc..b2fe3aa2716f0fe128ef10f9d06c2431b3246933
> > 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -3816,7 +3816,7 @@ static inline int __dev_xmit_skb(struct sk_buff
> > *skb, struct Qdisc *q,
> >          * sent after the qdisc owner is scheduled again. To prevent this
> >          * scenario the task always serialize on the lock.
> >          */
> > -       contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
> > +       contended = true; // DEBUG for Jamal
> >         if (unlikely(contended))
> >                 spin_lock(&q->busylock);
>
> Will do.

Finally got time to look again. Probably being too clever, but moving
the check before the contended check resolves it as well. The only
strange thing is now with the latest net-next seems to be spitting
some false positive lockdep splat for the test of A->B->A (i am sure
it's fixable).

See attached. Didnt try the other idea, see if you like this one.

cheers,
jamal
> > >
> > > >
> > > > >
> > > > > +       if (__this_cpu_read(*q->xmit_recursion) > 0) {
> > > > > +               __qdisc_drop(skb, &to_free);
> > > > > +               rc = NET_XMIT_DROP;
> > > > > +               goto free_skb_list;
> > > > > +       }
> > > >
> > > >
> > > > I do not think we want to add yet another cache line miss and
> > > > complexity in tx fast path.
> > > >
> > >
> > > I empathize. The cache miss is due to a per-cpu variable? Otherwise
> > > that seems to be in the vicinity of the other fields being accessed in
> > > __dev_xmit_skb()
> > >
> > > > I think that mirred should  use a separate queue to kick a transmit
> > > > from the top level.
> > > >
> > > > (Like netif_rx() does)
> > > >
> > >
> > > Eric, here's my concern: this would entail restructuring mirred
> > > totally just to cater for one use case which is in itself _a bad
> > > config_ for egress qdisc case only.
> >
> > Why can't the bad config be detected in the ctl path ?
> >
>
> We actually discussed this. It looks simple until you dig in. To catch
> this issue we will need to store some state across ports. This can be
> done with a graph construct in the control plane. Assume the following
> cases matching header "foo" (all using htb or other classful qdiscs):
>
> case 1:
> flower match foo on eth0 redirect to egress of eth0
>
> This is easy to do. Your graph can check if src is eth0 and dst is eth0.
>
> Case 2:
> flower match foo on eth0 redirect to eth1
> flower match foo on eth1 redirect to eth0
>
> Now your graph has to keep state across netdevs. You can catch this
> issue as well (but your "checking" code is now growing).
>
> case 3:
> flower match foo on eth0 redirect to eth1
> flower match bar on eth1 redirect to eth0
>
> There is clearly no loop here because we have different headers, but
> you will have to add code to check for such a case.
>
> case 4:
> flower match foo on eth0 redirect to eth1
> u32 match foo on eth1 redirect to eth0
>
> Now you have to add code that checks all other classifiers(worse would
> be ebpf) for matching headers. Worse is you have to consider wild
> card. I am also likely missing some other use cases.
> Also, I am sure we'll very quickly hear from people who want very fast
> insertion rates of filters which now are going to be massively slowed
> down when using mirred.
> So i am sure it can be done, just not worth the effort given how many
> lines of code would need to be added to catch a corner case of bad
> config.
>
> >  Mirred is very heavily used in
> > > many use cases and changing its behavior could likely introduce other
> > > corner cases for some use cases which we would be chasing for a while.
> > > Not to forget now we have to go via an extra transient queue.
> > > If i understood what you are suggesting is to add an equivalent of
> > > backlog queu for the tx side? I am assuming in a very similar nature
> > > to backlog, meaning per cpu fired by softirq? or is it something
> > > closer to qdisc->gso_skb
> > > For either of those cases, the amount of infrastructure code there is
> > > not a few lines of code. And then there's the desire to break the loop
> > > etc.
> > >
> > > Some questions regarding your proposal - something I am not following
> > > And i may have misunderstood what you are suggesting, but i am missing
> > > what scenario mirred can directly call tcf_dev_queue_xmit() (see my
> > > comment below)..
> >
> > I wish the act path would run without qdisc spinlock held, and use RCU instead.
> >
>
> The main (only?) reason we need spinlock for qdiscs is for packet queues.
>
> > Instead, we are adding cost in fast path only to detect bad configs.
>
> Agreed but I dont know how we can totally avoid it. Would you be ok
> with us using something that looks like qdisc->gso_skb and we check
> such a list after releasing the qdisc lock in __dev_xmit_skb() to
> invoke the redirect? The loop then can be caught inside mirred.
>
> cheers,
> jamal

[-- Attachment #2: mirred-recursion-fix.patch --]
[-- Type: text/x-patch, Size: 2576 bytes --]

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 76db6be16083..f860545f17d5 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -125,6 +125,8 @@ struct Qdisc {
 	spinlock_t		busylock ____cacheline_aligned_in_smp;
 	spinlock_t		seqlock;
 
+	u16 __percpu            *xmit_recursion;
+
 	struct rcu_head		rcu;
 	netdevice_tracker	dev_tracker;
 	/* private data */
diff --git a/net/core/dev.c b/net/core/dev.c
index 854a3a28a8d8..d66f68018ac7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3808,6 +3808,14 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		return rc;
 	}
 
+	if (__this_cpu_read(*q->xmit_recursion) > 0) {
+		__qdisc_drop(skb, &to_free);
+		kfree_skb_list_reason(to_free,
+				      tcf_get_drop_reason(to_free));
+		return NET_XMIT_DROP;
+	}
+	__this_cpu_inc(*q->xmit_recursion);
+
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
 	 * separate lock before trying to get qdisc main lock.
@@ -3863,6 +3871,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				      tcf_get_drop_reason(to_free));
 	if (unlikely(contended))
 		spin_unlock(&q->busylock);
+	__this_cpu_dec(*q->xmit_recursion);
 	return rc;
 }
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 60239378d43f..7621dc7d1d45 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1260,6 +1260,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 	struct Qdisc *sch;
 	struct Qdisc_ops *ops;
 	struct qdisc_size_table *stab;
+	int cpu;
 
 	ops = qdisc_lookup_ops(kind);
 #ifdef CONFIG_MODULES
@@ -1376,11 +1377,22 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 		}
 	}
 
+	sch->xmit_recursion = alloc_percpu(u16);
+	if (!sch->xmit_recursion) {
+		err = -ENOMEM;
+		goto err_out5;
+	}
+	for_each_possible_cpu(cpu)
+		(*per_cpu_ptr(sch->xmit_recursion, cpu)) = 0;
+
 	qdisc_hash_add(sch, false);
 	trace_qdisc_create(ops, dev, parent);
 
 	return sch;
 
+err_out5:
+	if (tca[TCA_RATE])
+		gen_kill_estimator(&sch->rate_est);
 err_out4:
 	/* Even if ops->init() failed, we call ops->destroy()
 	 * like qdisc_create_dflt().
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ff5336493777..afbbd2e885a4 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1070,6 +1070,8 @@ static void __qdisc_destroy(struct Qdisc *qdisc)
 	module_put(ops->owner);
 	netdev_put(dev, &qdisc->dev_tracker);
 
+	free_percpu(qdisc->xmit_recursion);
+
 	trace_qdisc_destroy(qdisc);
 
 	call_rcu(&qdisc->rcu, qdisc_free_cb);

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-10 20:30         ` Jamal Hadi Salim
@ 2024-04-15  9:20           ` Eric Dumazet
  2024-04-15  9:29             ` Eric Dumazet
  2024-04-15 13:59             ` Jamal Hadi Salim
  0 siblings, 2 replies; 21+ messages in thread
From: Eric Dumazet @ 2024-04-15  9:20 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

On Wed, Apr 10, 2024 at 10:30 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On Tue, Apr 2, 2024 at 1:35 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> >
> > On Tue, Apr 2, 2024 at 12:47 PM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > On Wed, Mar 27, 2024 at 11:58 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > >
> > > > On Wed, Mar 27, 2024 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
> > > > >
> > > > > On Wed, Mar 27, 2024 at 12:03 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > > > >
> > > > > > When the mirred action is used on a classful egress qdisc and a packet is
> > > > > > mirrored or redirected to self we hit a qdisc lock deadlock.
> > > > > > See trace below.
> > > > > >
> > > > > > [..... other info removed for brevity....]
> > > > > > [   82.890906]
> > > > > > [   82.890906] ============================================
> > > > > > [   82.890906] WARNING: possible recursive locking detected
> > > > > > [   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
> > > > > > [   82.890906] --------------------------------------------
> > > > > > [   82.890906] ping/418 is trying to acquire lock:
> > > > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > > > __dev_queue_xmit+0x1778/0x3550
> > > > > > [   82.890906]
> > > > > > [   82.890906] but task is already holding lock:
> > > > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > > > __dev_queue_xmit+0x1778/0x3550
> > > > > > [   82.890906]
> > > > > > [   82.890906] other info that might help us debug this:
> > > > > > [   82.890906]  Possible unsafe locking scenario:
> > > > > > [   82.890906]
> > > > > > [   82.890906]        CPU0
> > > > > > [   82.890906]        ----
> > > > > > [   82.890906]   lock(&sch->q.lock);
> > > > > > [   82.890906]   lock(&sch->q.lock);
> > > > > > [   82.890906]
> > > > > > [   82.890906]  *** DEADLOCK ***
> > > > > > [   82.890906]
> > > > > > [..... other info removed for brevity....]
> > > > > >
> > > > > > Example setup (eth0->eth0) to recreate
> > > > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > > > >      action mirred egress redirect dev eth0
> > > > > >
> > > > > > Another example(eth0->eth1->eth0) to recreate
> > > > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > > > >      action mirred egress redirect dev eth1
> > > > > >
> > > > > > tc qdisc add dev eth1 root handle 1: htb default 30
> > > > > > tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
> > > > > >      action mirred egress redirect dev eth0
> > > > > >
> > > > > > We fix this by adding a per-cpu, per-qdisc recursion counter which is
> > > > > > incremented the first time a root qdisc is entered and on a second attempt
> > > > > > enter the same root qdisc from the top, the packet is dropped to break the
> > > > > > loop.
> > > > > >
> > > > > > Reported-by: renmingshuai@huawei.com
> > > > > > Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
> > > > > > Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
> > > > > > Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
> > > > > > Co-developed-by: Victor Nogueira <victor@mojatatu.com>
> > > > > > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > > > > > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> > > > > > ---
> > > > > >  include/net/sch_generic.h |  2 ++
> > > > > >  net/core/dev.c            |  9 +++++++++
> > > > > >  net/sched/sch_api.c       | 12 ++++++++++++
> > > > > >  net/sched/sch_generic.c   |  2 ++
> > > > > >  4 files changed, 25 insertions(+)
> > > > > >
> > > > > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > > > > > index cefe0c4bdae3..f9f99df037ed 100644
> > > > > > --- a/include/net/sch_generic.h
> > > > > > +++ b/include/net/sch_generic.h
> > > > > > @@ -125,6 +125,8 @@ struct Qdisc {
> > > > > >         spinlock_t              busylock ____cacheline_aligned_in_smp;
> > > > > >         spinlock_t              seqlock;
> > > > > >
> > > > > > +       u16 __percpu            *xmit_recursion;
> > > > > > +
> > > > > >         struct rcu_head         rcu;
> > > > > >         netdevice_tracker       dev_tracker;
> > > > > >         /* private data */
> > > > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > > > index 9a67003e49db..2b712388c06f 100644
> > > > > > --- a/net/core/dev.c
> > > > > > +++ b/net/core/dev.c
> > > > > > @@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> > > > > >         if (unlikely(contended))
> > > > > >                 spin_lock(&q->busylock);
> > > > >
> > > > > This could hang here (busylock)
> > > >
> > > > Notice the goto free_skb_list has an spin_unlock(&q->busylock);  in
> > > > its code vicinity. Am I missing something?
> > >
> > > The hang would happen in above spin_lock(&q->busylock), before you can
> > > get a chance...
> > >
> > > If you want to test your patch, add this debugging feature, pretending
> > > the spinlock is contended.
> > >
> > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > index 818699dea9d7040ee74532ccdebf01c4fd6887cc..b2fe3aa2716f0fe128ef10f9d06c2431b3246933
> > > 100644
> > > --- a/net/core/dev.c
> > > +++ b/net/core/dev.c
> > > @@ -3816,7 +3816,7 @@ static inline int __dev_xmit_skb(struct sk_buff
> > > *skb, struct Qdisc *q,
> > >          * sent after the qdisc owner is scheduled again. To prevent this
> > >          * scenario the task always serialize on the lock.
> > >          */
> > > -       contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
> > > +       contended = true; // DEBUG for Jamal
> > >         if (unlikely(contended))
> > >                 spin_lock(&q->busylock);
> >
> > Will do.
>
> Finally got time to look again. Probably being too clever, but moving
> the check before the contended check resolves it as well. The only
> strange thing is now with the latest net-next seems to be spitting
> some false positive lockdep splat for the test of A->B->A (i am sure
> it's fixable).
>
> See attached. Didnt try the other idea, see if you like this one.

A spinlock can only be held by one cpu at a time, so recording the cpu
number of the lock owner should be
enough to avoid a deadlock.

So I really do not understand your push for a per-cpu variable with
extra cache line misses.

I think the following would work just fine ? What do you think ?

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 76db6be1608315102495dd6372fc30e6c9d41a99..dcd92ed7f69fae00deaca2c88fed248a559108ea
100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -117,6 +117,7 @@ struct Qdisc {
        struct qdisc_skb_head   q;
        struct gnet_stats_basic_sync bstats;
        struct gnet_stats_queue qstats;
+       int                     owner;
        unsigned long           state;
        unsigned long           state2; /* must be written under qdisc
spinlock */
        struct Qdisc            *next_sched;
diff --git a/net/core/dev.c b/net/core/dev.c
index 854a3a28a8d85b335a9158378ae0cca6dfbf8b36..d77cac53df4b4af478548dd17e7a3a7cfe4bd792
100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3808,6 +3808,11 @@ static inline int __dev_xmit_skb(struct sk_buff
*skb, struct Qdisc *q,
                return rc;
        }

+       if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
+               /* add a specific drop_reason later in net-next */
+               kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
+               return NET_XMIT_DROP;
+       }
        /*
         * Heuristic to force contended enqueues to serialize on a
         * separate lock before trying to get qdisc main lock.
@@ -3847,7 +3852,9 @@ static inline int __dev_xmit_skb(struct sk_buff
*skb, struct Qdisc *q,
                qdisc_run_end(q);
                rc = NET_XMIT_SUCCESS;
        } else {
+               WRITE_ONCE(q->owner, smp_processor_id());
                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+               WRITE_ONCE(q->owner, -1);
                if (qdisc_run_begin(q)) {
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-15  9:20           ` Eric Dumazet
@ 2024-04-15  9:29             ` Eric Dumazet
  2024-04-15 13:59             ` Jamal Hadi Salim
  1 sibling, 0 replies; 21+ messages in thread
From: Eric Dumazet @ 2024-04-15  9:29 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

On Mon, Apr 15, 2024 at 11:20 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Apr 10, 2024 at 10:30 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> >
> > On Tue, Apr 2, 2024 at 1:35 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > >
> > > On Tue, Apr 2, 2024 at 12:47 PM Eric Dumazet <edumazet@google.com> wrote:
> > > >
> > > > On Wed, Mar 27, 2024 at 11:58 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > > >
> > > > > On Wed, Mar 27, 2024 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
> > > > > >
> > > > > > On Wed, Mar 27, 2024 at 12:03 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > > > > >
> > > > > > > When the mirred action is used on a classful egress qdisc and a packet is
> > > > > > > mirrored or redirected to self we hit a qdisc lock deadlock.
> > > > > > > See trace below.
> > > > > > >
> > > > > > > [..... other info removed for brevity....]
> > > > > > > [   82.890906]
> > > > > > > [   82.890906] ============================================
> > > > > > > [   82.890906] WARNING: possible recursive locking detected
> > > > > > > [   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
> > > > > > > [   82.890906] --------------------------------------------
> > > > > > > [   82.890906] ping/418 is trying to acquire lock:
> > > > > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > > > > __dev_queue_xmit+0x1778/0x3550
> > > > > > > [   82.890906]
> > > > > > > [   82.890906] but task is already holding lock:
> > > > > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > > > > __dev_queue_xmit+0x1778/0x3550
> > > > > > > [   82.890906]
> > > > > > > [   82.890906] other info that might help us debug this:
> > > > > > > [   82.890906]  Possible unsafe locking scenario:
> > > > > > > [   82.890906]
> > > > > > > [   82.890906]        CPU0
> > > > > > > [   82.890906]        ----
> > > > > > > [   82.890906]   lock(&sch->q.lock);
> > > > > > > [   82.890906]   lock(&sch->q.lock);
> > > > > > > [   82.890906]
> > > > > > > [   82.890906]  *** DEADLOCK ***
> > > > > > > [   82.890906]
> > > > > > > [..... other info removed for brevity....]
> > > > > > >
> > > > > > > Example setup (eth0->eth0) to recreate
> > > > > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > > > > >      action mirred egress redirect dev eth0
> > > > > > >
> > > > > > > Another example(eth0->eth1->eth0) to recreate
> > > > > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > > > > >      action mirred egress redirect dev eth1
> > > > > > >
> > > > > > > tc qdisc add dev eth1 root handle 1: htb default 30
> > > > > > > tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
> > > > > > >      action mirred egress redirect dev eth0
> > > > > > >
> > > > > > > We fix this by adding a per-cpu, per-qdisc recursion counter which is
> > > > > > > incremented the first time a root qdisc is entered and on a second attempt
> > > > > > > enter the same root qdisc from the top, the packet is dropped to break the
> > > > > > > loop.
> > > > > > >
> > > > > > > Reported-by: renmingshuai@huawei.com
> > > > > > > Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
> > > > > > > Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
> > > > > > > Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
> > > > > > > Co-developed-by: Victor Nogueira <victor@mojatatu.com>
> > > > > > > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > > > > > > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> > > > > > > ---
> > > > > > >  include/net/sch_generic.h |  2 ++
> > > > > > >  net/core/dev.c            |  9 +++++++++
> > > > > > >  net/sched/sch_api.c       | 12 ++++++++++++
> > > > > > >  net/sched/sch_generic.c   |  2 ++
> > > > > > >  4 files changed, 25 insertions(+)
> > > > > > >
> > > > > > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > > > > > > index cefe0c4bdae3..f9f99df037ed 100644
> > > > > > > --- a/include/net/sch_generic.h
> > > > > > > +++ b/include/net/sch_generic.h
> > > > > > > @@ -125,6 +125,8 @@ struct Qdisc {
> > > > > > >         spinlock_t              busylock ____cacheline_aligned_in_smp;
> > > > > > >         spinlock_t              seqlock;
> > > > > > >
> > > > > > > +       u16 __percpu            *xmit_recursion;
> > > > > > > +
> > > > > > >         struct rcu_head         rcu;
> > > > > > >         netdevice_tracker       dev_tracker;
> > > > > > >         /* private data */
> > > > > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > > > > index 9a67003e49db..2b712388c06f 100644
> > > > > > > --- a/net/core/dev.c
> > > > > > > +++ b/net/core/dev.c
> > > > > > > @@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> > > > > > >         if (unlikely(contended))
> > > > > > >                 spin_lock(&q->busylock);
> > > > > >
> > > > > > This could hang here (busylock)
> > > > >
> > > > > Notice the goto free_skb_list has an spin_unlock(&q->busylock);  in
> > > > > its code vicinity. Am I missing something?
> > > >
> > > > The hang would happen in above spin_lock(&q->busylock), before you can
> > > > get a chance...
> > > >
> > > > If you want to test your patch, add this debugging feature, pretending
> > > > the spinlock is contended.
> > > >
> > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > index 818699dea9d7040ee74532ccdebf01c4fd6887cc..b2fe3aa2716f0fe128ef10f9d06c2431b3246933
> > > > 100644
> > > > --- a/net/core/dev.c
> > > > +++ b/net/core/dev.c
> > > > @@ -3816,7 +3816,7 @@ static inline int __dev_xmit_skb(struct sk_buff
> > > > *skb, struct Qdisc *q,
> > > >          * sent after the qdisc owner is scheduled again. To prevent this
> > > >          * scenario the task always serialize on the lock.
> > > >          */
> > > > -       contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
> > > > +       contended = true; // DEBUG for Jamal
> > > >         if (unlikely(contended))
> > > >                 spin_lock(&q->busylock);
> > >
> > > Will do.
> >
> > Finally got time to look again. Probably being too clever, but moving
> > the check before the contended check resolves it as well. The only
> > strange thing is now with the latest net-next seems to be spitting
> > some false positive lockdep splat for the test of A->B->A (i am sure
> > it's fixable).
> >
> > See attached. Didnt try the other idea, see if you like this one.
>
> A spinlock can only be held by one cpu at a time, so recording the cpu
> number of the lock owner should be
> enough to avoid a deadlock.
>
> So I really do not understand your push for a per-cpu variable with
> extra cache line misses.
>
> I think the following would work just fine ? What do you think ?
>
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 76db6be1608315102495dd6372fc30e6c9d41a99..dcd92ed7f69fae00deaca2c88fed248a559108ea
> 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -117,6 +117,7 @@ struct Qdisc {
>         struct qdisc_skb_head   q;
>         struct gnet_stats_basic_sync bstats;
>         struct gnet_stats_queue qstats;
> +       int                     owner;
>         unsigned long           state;
>         unsigned long           state2; /* must be written under qdisc
> spinlock */
>         struct Qdisc            *next_sched;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 854a3a28a8d85b335a9158378ae0cca6dfbf8b36..d77cac53df4b4af478548dd17e7a3a7cfe4bd792
> 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3808,6 +3808,11 @@ static inline int __dev_xmit_skb(struct sk_buff
> *skb, struct Qdisc *q,
>                 return rc;
>         }
>
> +       if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
> +               /* add a specific drop_reason later in net-next */
> +               kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
> +               return NET_XMIT_DROP;
> +       }

Or even better this expensive check could be moved into tcf_mirred_forward() ?

This would be done later in net-next, because this looks a bit more
complex than this simple solution.


>         /*
>          * Heuristic to force contended enqueues to serialize on a
>          * separate lock before trying to get qdisc main lock.
> @@ -3847,7 +3852,9 @@ static inline int __dev_xmit_skb(struct sk_buff
> *skb, struct Qdisc *q,
>                 qdisc_run_end(q);
>                 rc = NET_XMIT_SUCCESS;
>         } else {
> +               WRITE_ONCE(q->owner, smp_processor_id());
>                 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
> +               WRITE_ONCE(q->owner, -1);
>                 if (qdisc_run_begin(q)) {
>                         if (unlikely(contended)) {
>                                 spin_unlock(&q->busylock);

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-15  9:20           ` Eric Dumazet
  2024-04-15  9:29             ` Eric Dumazet
@ 2024-04-15 13:59             ` Jamal Hadi Salim
  2024-04-15 14:01               ` Jamal Hadi Salim
  1 sibling, 1 reply; 21+ messages in thread
From: Jamal Hadi Salim @ 2024-04-15 13:59 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

[-- Attachment #1: Type: text/plain, Size: 9500 bytes --]

On Mon, Apr 15, 2024 at 5:20 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Apr 10, 2024 at 10:30 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> >
> > On Tue, Apr 2, 2024 at 1:35 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > >
> > > On Tue, Apr 2, 2024 at 12:47 PM Eric Dumazet <edumazet@google.com> wrote:
> > > >
> > > > On Wed, Mar 27, 2024 at 11:58 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > > >
> > > > > On Wed, Mar 27, 2024 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
> > > > > >
> > > > > > On Wed, Mar 27, 2024 at 12:03 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > > > > >
> > > > > > > When the mirred action is used on a classful egress qdisc and a packet is
> > > > > > > mirrored or redirected to self we hit a qdisc lock deadlock.
> > > > > > > See trace below.
> > > > > > >
> > > > > > > [..... other info removed for brevity....]
> > > > > > > [   82.890906]
> > > > > > > [   82.890906] ============================================
> > > > > > > [   82.890906] WARNING: possible recursive locking detected
> > > > > > > [   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
> > > > > > > [   82.890906] --------------------------------------------
> > > > > > > [   82.890906] ping/418 is trying to acquire lock:
> > > > > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > > > > __dev_queue_xmit+0x1778/0x3550
> > > > > > > [   82.890906]
> > > > > > > [   82.890906] but task is already holding lock:
> > > > > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > > > > __dev_queue_xmit+0x1778/0x3550
> > > > > > > [   82.890906]
> > > > > > > [   82.890906] other info that might help us debug this:
> > > > > > > [   82.890906]  Possible unsafe locking scenario:
> > > > > > > [   82.890906]
> > > > > > > [   82.890906]        CPU0
> > > > > > > [   82.890906]        ----
> > > > > > > [   82.890906]   lock(&sch->q.lock);
> > > > > > > [   82.890906]   lock(&sch->q.lock);
> > > > > > > [   82.890906]
> > > > > > > [   82.890906]  *** DEADLOCK ***
> > > > > > > [   82.890906]
> > > > > > > [..... other info removed for brevity....]
> > > > > > >
> > > > > > > Example setup (eth0->eth0) to recreate
> > > > > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > > > > >      action mirred egress redirect dev eth0
> > > > > > >
> > > > > > > Another example(eth0->eth1->eth0) to recreate
> > > > > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > > > > >      action mirred egress redirect dev eth1
> > > > > > >
> > > > > > > tc qdisc add dev eth1 root handle 1: htb default 30
> > > > > > > tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
> > > > > > >      action mirred egress redirect dev eth0
> > > > > > >
> > > > > > > We fix this by adding a per-cpu, per-qdisc recursion counter which is
> > > > > > > incremented the first time a root qdisc is entered and on a second attempt
> > > > > > > enter the same root qdisc from the top, the packet is dropped to break the
> > > > > > > loop.
> > > > > > >
> > > > > > > Reported-by: renmingshuai@huawei.com
> > > > > > > Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
> > > > > > > Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
> > > > > > > Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
> > > > > > > Co-developed-by: Victor Nogueira <victor@mojatatu.com>
> > > > > > > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > > > > > > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> > > > > > > ---
> > > > > > >  include/net/sch_generic.h |  2 ++
> > > > > > >  net/core/dev.c            |  9 +++++++++
> > > > > > >  net/sched/sch_api.c       | 12 ++++++++++++
> > > > > > >  net/sched/sch_generic.c   |  2 ++
> > > > > > >  4 files changed, 25 insertions(+)
> > > > > > >
> > > > > > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > > > > > > index cefe0c4bdae3..f9f99df037ed 100644
> > > > > > > --- a/include/net/sch_generic.h
> > > > > > > +++ b/include/net/sch_generic.h
> > > > > > > @@ -125,6 +125,8 @@ struct Qdisc {
> > > > > > >         spinlock_t              busylock ____cacheline_aligned_in_smp;
> > > > > > >         spinlock_t              seqlock;
> > > > > > >
> > > > > > > +       u16 __percpu            *xmit_recursion;
> > > > > > > +
> > > > > > >         struct rcu_head         rcu;
> > > > > > >         netdevice_tracker       dev_tracker;
> > > > > > >         /* private data */
> > > > > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > > > > index 9a67003e49db..2b712388c06f 100644
> > > > > > > --- a/net/core/dev.c
> > > > > > > +++ b/net/core/dev.c
> > > > > > > @@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> > > > > > >         if (unlikely(contended))
> > > > > > >                 spin_lock(&q->busylock);
> > > > > >
> > > > > > This could hang here (busylock)
> > > > >
> > > > > Notice the goto free_skb_list has an spin_unlock(&q->busylock);  in
> > > > > its code vicinity. Am I missing something?
> > > >
> > > > The hang would happen in above spin_lock(&q->busylock), before you can
> > > > get a chance...
> > > >
> > > > If you want to test your patch, add this debugging feature, pretending
> > > > the spinlock is contended.
> > > >
> > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > index 818699dea9d7040ee74532ccdebf01c4fd6887cc..b2fe3aa2716f0fe128ef10f9d06c2431b3246933
> > > > 100644
> > > > --- a/net/core/dev.c
> > > > +++ b/net/core/dev.c
> > > > @@ -3816,7 +3816,7 @@ static inline int __dev_xmit_skb(struct sk_buff
> > > > *skb, struct Qdisc *q,
> > > >          * sent after the qdisc owner is scheduled again. To prevent this
> > > >          * scenario the task always serialize on the lock.
> > > >          */
> > > > -       contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
> > > > +       contended = true; // DEBUG for Jamal
> > > >         if (unlikely(contended))
> > > >                 spin_lock(&q->busylock);
> > >
> > > Will do.
> >
> > Finally got time to look again. Probably being too clever, but moving
> > the check before the contended check resolves it as well. The only
> > strange thing is now with the latest net-next seems to be spitting
> > some false positive lockdep splat for the test of A->B->A (i am sure
> > it's fixable).
> >
> > See attached. Didnt try the other idea, see if you like this one.
>
> A spinlock can only be held by one cpu at a time, so recording the cpu
> number of the lock owner should be
> enough to avoid a deadlock.
>
> So I really do not understand your push for a per-cpu variable with
> extra cache line misses.
>

I was asking earlier - do per-cpu (qdisc struct) variables incur extra
cache misses on top of the cache miss that is incurred when accessing
the qdisc struct? I think you have been saying yes, but not being
explicit ;->

> I think the following would work just fine ? What do you think ?
>

Yep, good compromise ;-> Small missing thing - the initialization. See attached.
Do you want to send it or should we? We are going to add the reason
and test cases later.

cheers,
jamal

> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 76db6be1608315102495dd6372fc30e6c9d41a99..dcd92ed7f69fae00deaca2c88fed248a559108ea
> 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -117,6 +117,7 @@ struct Qdisc {
>         struct qdisc_skb_head   q;
>         struct gnet_stats_basic_sync bstats;
>         struct gnet_stats_queue qstats;
> +       int                     owner;
>         unsigned long           state;
>         unsigned long           state2; /* must be written under qdisc
> spinlock */
>         struct Qdisc            *next_sched;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 854a3a28a8d85b335a9158378ae0cca6dfbf8b36..d77cac53df4b4af478548dd17e7a3a7cfe4bd792
> 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3808,6 +3808,11 @@ static inline int __dev_xmit_skb(struct sk_buff
> *skb, struct Qdisc *q,
>                 return rc;
>         }
>
> +       if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
> +               /* add a specific drop_reason later in net-next */
> +               kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
> +               return NET_XMIT_DROP;
> +       }
>         /*
>          * Heuristic to force contended enqueues to serialize on a
>          * separate lock before trying to get qdisc main lock.
> @@ -3847,7 +3852,9 @@ static inline int __dev_xmit_skb(struct sk_buff
> *skb, struct Qdisc *q,
>                 qdisc_run_end(q);
>                 rc = NET_XMIT_SUCCESS;
>         } else {
> +               WRITE_ONCE(q->owner, smp_processor_id());
>                 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
> +               WRITE_ONCE(q->owner, -1);
>                 if (qdisc_run_begin(q)) {
>                         if (unlikely(contended)) {
>                                 spin_unlock(&q->busylock);

[-- Attachment #2: mirred-loop-fix.patch --]
[-- Type: text/x-patch, Size: 1844 bytes --]

commit 5e819049c270f4771bb990db13d75f8ec88f6227
Author: Victor Nogueira <victor@mojatatu.com>
Date:   Mon Apr 15 12:10:40 2024 +0000

    Mirred nested loop fix

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 76db6be16083..f561dfb79743 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -117,6 +117,7 @@ struct Qdisc {
 	struct qdisc_skb_head	q;
 	struct gnet_stats_basic_sync bstats;
 	struct gnet_stats_queue	qstats;
+	int                     owner;
 	unsigned long		state;
 	unsigned long		state2; /* must be written under qdisc spinlock */
 	struct Qdisc            *next_sched;
diff --git a/net/core/dev.c b/net/core/dev.c
index 854a3a28a8d8..f6c6e494f0a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3808,6 +3808,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		return rc;
 	}
 
+	if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
+		kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
+		return NET_XMIT_DROP;
+	}
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
 	 * separate lock before trying to get qdisc main lock.
@@ -3847,7 +3851,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
+		WRITE_ONCE(q->owner, smp_processor_id());
 		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+		WRITE_ONCE(q->owner, -1);
 		if (qdisc_run_begin(q)) {
 			if (unlikely(contended)) {
 				spin_unlock(&q->busylock);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 60239378d43f..15c715c16247 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1376,6 +1376,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
 		}
 	}
 
+	sch->owner = -1;
 	qdisc_hash_add(sch, false);
 	trace_qdisc_create(ops, dev, parent);
 

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-15 13:59             ` Jamal Hadi Salim
@ 2024-04-15 14:01               ` Jamal Hadi Salim
  2024-04-15 14:11                 ` Eric Dumazet
  0 siblings, 1 reply; 21+ messages in thread
From: Jamal Hadi Salim @ 2024-04-15 14:01 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

On Mon, Apr 15, 2024 at 9:59 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On Mon, Apr 15, 2024 at 5:20 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Wed, Apr 10, 2024 at 10:30 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > >
> > > On Tue, Apr 2, 2024 at 1:35 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > >
> > > > On Tue, Apr 2, 2024 at 12:47 PM Eric Dumazet <edumazet@google.com> wrote:
> > > > >
> > > > > On Wed, Mar 27, 2024 at 11:58 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > > > >
> > > > > > On Wed, Mar 27, 2024 at 9:23 AM Eric Dumazet <edumazet@google.com> wrote:
> > > > > > >
> > > > > > > On Wed, Mar 27, 2024 at 12:03 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> > > > > > > >
> > > > > > > > When the mirred action is used on a classful egress qdisc and a packet is
> > > > > > > > mirrored or redirected to self we hit a qdisc lock deadlock.
> > > > > > > > See trace below.
> > > > > > > >
> > > > > > > > [..... other info removed for brevity....]
> > > > > > > > [   82.890906]
> > > > > > > > [   82.890906] ============================================
> > > > > > > > [   82.890906] WARNING: possible recursive locking detected
> > > > > > > > [   82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G        W
> > > > > > > > [   82.890906] --------------------------------------------
> > > > > > > > [   82.890906] ping/418 is trying to acquire lock:
> > > > > > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > > > > > __dev_queue_xmit+0x1778/0x3550
> > > > > > > > [   82.890906]
> > > > > > > > [   82.890906] but task is already holding lock:
> > > > > > > > [   82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at:
> > > > > > > > __dev_queue_xmit+0x1778/0x3550
> > > > > > > > [   82.890906]
> > > > > > > > [   82.890906] other info that might help us debug this:
> > > > > > > > [   82.890906]  Possible unsafe locking scenario:
> > > > > > > > [   82.890906]
> > > > > > > > [   82.890906]        CPU0
> > > > > > > > [   82.890906]        ----
> > > > > > > > [   82.890906]   lock(&sch->q.lock);
> > > > > > > > [   82.890906]   lock(&sch->q.lock);
> > > > > > > > [   82.890906]
> > > > > > > > [   82.890906]  *** DEADLOCK ***
> > > > > > > > [   82.890906]
> > > > > > > > [..... other info removed for brevity....]
> > > > > > > >
> > > > > > > > Example setup (eth0->eth0) to recreate
> > > > > > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > > > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > > > > > >      action mirred egress redirect dev eth0
> > > > > > > >
> > > > > > > > Another example(eth0->eth1->eth0) to recreate
> > > > > > > > tc qdisc add dev eth0 root handle 1: htb default 30
> > > > > > > > tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
> > > > > > > >      action mirred egress redirect dev eth1
> > > > > > > >
> > > > > > > > tc qdisc add dev eth1 root handle 1: htb default 30
> > > > > > > > tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
> > > > > > > >      action mirred egress redirect dev eth0
> > > > > > > >
> > > > > > > > We fix this by adding a per-cpu, per-qdisc recursion counter which is
> > > > > > > > incremented the first time a root qdisc is entered and on a second attempt
> > > > > > > > enter the same root qdisc from the top, the packet is dropped to break the
> > > > > > > > loop.
> > > > > > > >
> > > > > > > > Reported-by: renmingshuai@huawei.com
> > > > > > > > Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/
> > > > > > > > Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()")
> > > > > > > > Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops")
> > > > > > > > Co-developed-by: Victor Nogueira <victor@mojatatu.com>
> > > > > > > > Signed-off-by: Victor Nogueira <victor@mojatatu.com>
> > > > > > > > Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> > > > > > > > ---
> > > > > > > >  include/net/sch_generic.h |  2 ++
> > > > > > > >  net/core/dev.c            |  9 +++++++++
> > > > > > > >  net/sched/sch_api.c       | 12 ++++++++++++
> > > > > > > >  net/sched/sch_generic.c   |  2 ++
> > > > > > > >  4 files changed, 25 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > > > > > > > index cefe0c4bdae3..f9f99df037ed 100644
> > > > > > > > --- a/include/net/sch_generic.h
> > > > > > > > +++ b/include/net/sch_generic.h
> > > > > > > > @@ -125,6 +125,8 @@ struct Qdisc {
> > > > > > > >         spinlock_t              busylock ____cacheline_aligned_in_smp;
> > > > > > > >         spinlock_t              seqlock;
> > > > > > > >
> > > > > > > > +       u16 __percpu            *xmit_recursion;
> > > > > > > > +
> > > > > > > >         struct rcu_head         rcu;
> > > > > > > >         netdevice_tracker       dev_tracker;
> > > > > > > >         /* private data */
> > > > > > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > > > > > index 9a67003e49db..2b712388c06f 100644
> > > > > > > > --- a/net/core/dev.c
> > > > > > > > +++ b/net/core/dev.c
> > > > > > > > @@ -3789,6 +3789,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
> > > > > > > >         if (unlikely(contended))
> > > > > > > >                 spin_lock(&q->busylock);
> > > > > > >
> > > > > > > This could hang here (busylock)
> > > > > >
> > > > > > Notice the goto free_skb_list has an spin_unlock(&q->busylock);  in
> > > > > > its code vicinity. Am I missing something?
> > > > >
> > > > > The hang would happen in above spin_lock(&q->busylock), before you can
> > > > > get a chance...
> > > > >
> > > > > If you want to test your patch, add this debugging feature, pretending
> > > > > the spinlock is contended.
> > > > >
> > > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > > index 818699dea9d7040ee74532ccdebf01c4fd6887cc..b2fe3aa2716f0fe128ef10f9d06c2431b3246933
> > > > > 100644
> > > > > --- a/net/core/dev.c
> > > > > +++ b/net/core/dev.c
> > > > > @@ -3816,7 +3816,7 @@ static inline int __dev_xmit_skb(struct sk_buff
> > > > > *skb, struct Qdisc *q,
> > > > >          * sent after the qdisc owner is scheduled again. To prevent this
> > > > >          * scenario the task always serialize on the lock.
> > > > >          */
> > > > > -       contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
> > > > > +       contended = true; // DEBUG for Jamal
> > > > >         if (unlikely(contended))
> > > > >                 spin_lock(&q->busylock);
> > > >
> > > > Will do.
> > >
> > > Finally got time to look again. Probably being too clever, but moving
> > > the check before the contended check resolves it as well. The only
> > > strange thing is now with the latest net-next seems to be spitting
> > > some false positive lockdep splat for the test of A->B->A (i am sure
> > > it's fixable).
> > >
> > > See attached. Didnt try the other idea, see if you like this one.
> >
> > A spinlock can only be held by one cpu at a time, so recording the cpu
> > number of the lock owner should be
> > enough to avoid a deadlock.
> >
> > So I really do not understand your push for a per-cpu variable with
> > extra cache line misses.
> >
>
> I was asking earlier - do per-cpu (qdisc struct) variables incur extra
> cache misses on top of the cache miss that is incurred when accessing
> the qdisc struct? I think you have been saying yes, but not being
> explicit ;->
>
> > I think the following would work just fine ? What do you think ?
> >
>
> Yep, good compromise ;-> Small missing thing - the initialization. See attached.
> Do you want to send it or should we?

Sorry - shows Victor's name but this is your patch, so feel free if
you send to add your name as author.

cheers,
jamal

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-15 14:01               ` Jamal Hadi Salim
@ 2024-04-15 14:11                 ` Eric Dumazet
  2024-04-15 14:17                   ` Jamal Hadi Salim
  2024-04-15 21:14                   ` Jamal Hadi Salim
  0 siblings, 2 replies; 21+ messages in thread
From: Eric Dumazet @ 2024-04-15 14:11 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

On Mon, Apr 15, 2024 at 4:01 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>

> Sorry - shows Victor's name but this is your patch, so feel free if
> you send to add your name as author.

Sure go ahead, but I would rather put the sch->owner init in
qdisc_alloc() so that qdisc_create_dflt() is covered.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-15 14:11                 ` Eric Dumazet
@ 2024-04-15 14:17                   ` Jamal Hadi Salim
  2024-04-15 21:14                   ` Jamal Hadi Salim
  1 sibling, 0 replies; 21+ messages in thread
From: Jamal Hadi Salim @ 2024-04-15 14:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

On Mon, Apr 15, 2024 at 10:11 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Apr 15, 2024 at 4:01 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> >
>
> > Sorry - shows Victor's name but this is your patch, so feel free if
> > you send to add your name as author.
>
> Sure go ahead, but I would rather put the sch->owner init in
> qdisc_alloc() so that qdisc_create_dflt() is covered.

ok, will do.

cheers,
jamal

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-15 14:11                 ` Eric Dumazet
  2024-04-15 14:17                   ` Jamal Hadi Salim
@ 2024-04-15 21:14                   ` Jamal Hadi Salim
  2024-04-16  8:05                     ` Davide Caratti
  1 sibling, 1 reply; 21+ messages in thread
From: Jamal Hadi Salim @ 2024-04-15 21:14 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, kuba, pabeni, jiri, xiyou.wangcong, netdev, renmingshuai,
	Victor Nogueira

[-- Attachment #1: Type: text/plain, Size: 2523 bytes --]

On Mon, Apr 15, 2024 at 10:11 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Apr 15, 2024 at 4:01 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> >
>
> > Sorry - shows Victor's name but this is your patch, so feel free if
> > you send to add your name as author.
>
> Sure go ahead, but I would rather put the sch->owner init in
> qdisc_alloc() so that qdisc_create_dflt() is covered.

Victor sent the patch. As i mentioned earlier, we found a lockdep
false positive for the case of redirect from eth0->eth1->eth0
(potential fix attached)

[   75.691724] ============================================
[   75.691964] WARNING: possible recursive locking detected
[   75.691964] 6.9.0-rc3-00861-g0a7d3ab066ff #60 Not tainted
[   75.691964] --------------------------------------------
[   75.691964] ping/421 is trying to acquire lock:
[   75.691964] ffff88800568e110 (&sch->q.lock){+.-.}-{3:3}, at:
__dev_queue_xmit+0x1828/0x3580
[   75.691964]
[   75.691964] but task is already holding lock:
[   75.691964] ffff88800bd2c110 (&sch->q.lock){+.-.}-{3:3}, at:
__dev_queue_xmit+0x1828/0x3580
[   75.691964]
[   75.691964] other info that might help us debug this:
[   75.691964]  Possible unsafe locking scenario:
[   75.691964]
[   75.691964]        CPU0
[   75.691964]        ----
[   75.691964]   lock(&sch->q.lock);
[   75.691964]   lock(&sch->q.lock);
[   75.691964]
[   75.691964]  *** DEADLOCK ***
[   75.691964]
[   75.691964]  May be due to missing lock nesting notation
[   75.691964]
[   75.691964] 9 locks held by ping/421:
[   75.691964]  #0: ffff888002564ff8 (sk_lock-AF_INET){+.+.}-{0:0},
at: raw_sendmsg+0xa32/0x2d80
[   75.691964]  #1: ffffffffa7233540 (rcu_read_lock){....}-{1:3}, at:
ip_finish_output2+0x284/0x1f80
[   75.691964]  #2: ffffffffa7233540 (rcu_read_lock){....}-{1:3}, at:
process_backlog+0x210/0x660
[   75.691964]  #3: ffffffffa7233540 (rcu_read_lock){....}-{1:3}, at:
ip_local_deliver_finish+0x21e/0x4d0
[   75.691964]  #4: ffff8880025648a8 (k-slock-AF_INET){+...}-{3:3},
at: icmp_reply+0x2e6/0xa20
[   75.691964]  #5: ffffffffa7233540 (rcu_read_lock){....}-{1:3}, at:
ip_finish_output2+0x284/0x1f80
[   75.691964]  #6: ffffffffa72334e0 (rcu_read_lock_bh){....}-{1:3},
at: __dev_queue_xmit+0x224/0x3580
[   75.691964]  #7: ffff88800bd2c110 (&sch->q.lock){+.-.}-{3:3}, at:
__dev_queue_xmit+0x1828/0x3580
[   75.691964]  #8: ffffffffa72334e0 (rcu_read_lock_bh){....}-{1:3},
at: __dev_queue_xmit+0x224/0x3580

cheers,
jamal

[-- Attachment #2: lockdep-fix.patch --]
[-- Type: text/x-patch, Size: 1210 bytes --]

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f561dfb79743..4ddc46f106b9 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -126,6 +126,7 @@ struct Qdisc {
 	spinlock_t		busylock ____cacheline_aligned_in_smp;
 	spinlock_t		seqlock;
 
+	struct lock_class_key   qdisc_tx_rootlock;
 	struct rcu_head		rcu;
 	netdevice_tracker	dev_tracker;
 	/* private data */
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 4a2c763e2d11..809e974204f8 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -945,7 +945,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	__skb_queue_head_init(&sch->gso_skb);
 	__skb_queue_head_init(&sch->skb_bad_txq);
 	gnet_stats_basic_sync_init(&sch->bstats);
+	lockdep_register_key(&sch->qdisc_tx_rootlock);
 	spin_lock_init(&sch->q.lock);
+	lockdep_set_class(&sch->q.lock, &sch->qdisc_tx_rootlock);
 
 	if (ops->static_flags & TCQ_F_CPUSTATS) {
 		sch->cpu_bstats =
@@ -1070,6 +1072,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc)
 
 	module_put(ops->owner);
 	netdev_put(dev, &qdisc->dev_tracker);
+	lockdep_unregister_key(&qdisc->qdisc_tx_rootlock);
 
 	trace_qdisc_destroy(qdisc);
 

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-15 21:14                   ` Jamal Hadi Salim
@ 2024-04-16  8:05                     ` Davide Caratti
  2024-04-16  9:14                       ` Eric Dumazet
  2024-04-16 10:30                       ` Jamal Hadi Salim
  0 siblings, 2 replies; 21+ messages in thread
From: Davide Caratti @ 2024-04-16  8:05 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Eric Dumazet, davem, kuba, pabeni, jiri, xiyou.wangcong, netdev,
	renmingshuai, Victor Nogueira

hello,

On Mon, Apr 15, 2024 at 11:15 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
[...]

> Victor sent the patch. As i mentioned earlier, we found a lockdep
> false positive for the case of redirect from eth0->eth1->eth0
> (potential fix attached)

I tried a similar approach some months ago [1],  but  _ like Eric
noticed  _ it might slowdown qdisc_destroy() too much because of the
call to synchronize_rcu(). Maybe the key unregistering can be done
later (e.g. when the qdisc is freed) ?

[1] https://lore.kernel.org/netdev/73065927a49619fcd60e5b765c929f899a66cd1a.1701853200.git.dcaratti@redhat.com/

-- 
davide


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-16  8:05                     ` Davide Caratti
@ 2024-04-16  9:14                       ` Eric Dumazet
  2024-04-16  9:28                         ` Davide Caratti
  2024-04-16 10:30                       ` Jamal Hadi Salim
  1 sibling, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2024-04-16  9:14 UTC (permalink / raw)
  To: Davide Caratti
  Cc: Jamal Hadi Salim, davem, kuba, pabeni, jiri, xiyou.wangcong,
	netdev, renmingshuai, Victor Nogueira

On Tue, Apr 16, 2024 at 10:05 AM Davide Caratti <dcaratti@redhat.com> wrote:
>
> hello,
>
> On Mon, Apr 15, 2024 at 11:15 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> >
> [...]
>
> > Victor sent the patch. As i mentioned earlier, we found a lockdep
> > false positive for the case of redirect from eth0->eth1->eth0
> > (potential fix attached)
>
> I tried a similar approach some months ago [1],  but  _ like Eric
> noticed  _ it might slowdown qdisc_destroy() too much because of the
> call to synchronize_rcu(). Maybe the key unregistering can be done
> later (e.g. when the qdisc is freed) ?
>
> [1] https://lore.kernel.org/netdev/73065927a49619fcd60e5b765c929f899a66cd1a.1701853200.git.dcaratti@redhat.com/
>

Hmmm, I think I missed that lockdep_unregister_key() was a NOP unless
CONFIG_LOCKDEP=y

Sorry for this, can you repost your patch ?

Thanks.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-16  9:14                       ` Eric Dumazet
@ 2024-04-16  9:28                         ` Davide Caratti
  2024-04-16  9:37                           ` Eric Dumazet
  0 siblings, 1 reply; 21+ messages in thread
From: Davide Caratti @ 2024-04-16  9:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jamal Hadi Salim, davem, kuba, pabeni, jiri, xiyou.wangcong,
	netdev, renmingshuai, Victor Nogueira

hi Eric, thanks for looking at this!

On Tue, Apr 16, 2024 at 11:14 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Tue, Apr 16, 2024 at 10:05 AM Davide Caratti <dcaratti@redhat.com> wrote:

[...]

(FTR, the discussion started off-list :) more context at
https://github.com/multipath-tcp/mptcp_net-next/issues/451 )

> > I tried a similar approach some months ago [1],  but  _ like Eric
> > noticed  _ it might slowdown qdisc_destroy() too much because of the
> > call to synchronize_rcu(). Maybe the key unregistering can be done
> > later (e.g. when the qdisc is freed) ?
> >
> > [1] https://lore.kernel.org/netdev/73065927a49619fcd60e5b765c929f899a66cd1a.1701853200.git.dcaratti@redhat.com/
> >
>
> Hmmm, I think I missed that lockdep_unregister_key() was a NOP unless
> CONFIG_LOCKDEP=y

yes, the slowdown is there only on debug kernels.

> Sorry for this, can you repost your patch ?

Sure, but please leave me some time to understand what to do with HTB
(you were right at [1] and I didn't notice
htb_set_lockdep_class_child()  [2] at that time).

thanks!

[1] https://lore.kernel.org/netdev/CANn89iLXjstLx-=hFR0Rhav462+9pH3JTyE45t+nyiszKKCPTQ@mail.gmail.com/
[2] https://elixir.bootlin.com/linux/latest/source/net/sched/sch_htb.c#L1042
-- 
davide


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-16  9:28                         ` Davide Caratti
@ 2024-04-16  9:37                           ` Eric Dumazet
  0 siblings, 0 replies; 21+ messages in thread
From: Eric Dumazet @ 2024-04-16  9:37 UTC (permalink / raw)
  To: Davide Caratti
  Cc: Jamal Hadi Salim, davem, kuba, pabeni, jiri, xiyou.wangcong,
	netdev, renmingshuai, Victor Nogueira

On Tue, Apr 16, 2024 at 11:28 AM Davide Caratti <dcaratti@redhat.com> wrote:
>
> hi Eric, thanks for looking at this!
>
> On Tue, Apr 16, 2024 at 11:14 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Tue, Apr 16, 2024 at 10:05 AM Davide Caratti <dcaratti@redhat.com> wrote:
>
> [...]
>
> (FTR, the discussion started off-list :) more context at
> https://github.com/multipath-tcp/mptcp_net-next/issues/451 )
>
> > > I tried a similar approach some months ago [1],  but  _ like Eric
> > > noticed  _ it might slowdown qdisc_destroy() too much because of the
> > > call to synchronize_rcu(). Maybe the key unregistering can be done
> > > later (e.g. when the qdisc is freed) ?
> > >
> > > [1] https://lore.kernel.org/netdev/73065927a49619fcd60e5b765c929f899a66cd1a.1701853200.git.dcaratti@redhat.com/
> > >
> >
> > Hmmm, I think I missed that lockdep_unregister_key() was a NOP unless
> > CONFIG_LOCKDEP=y
>
> yes, the slowdown is there only on debug kernels.
>
> > Sorry for this, can you repost your patch ?
>
> Sure, but please leave me some time to understand what to do with HTB
> (you were right at [1] and I didn't notice
> htb_set_lockdep_class_child()  [2] at that time).
>
> thanks!
>
> [1] https://lore.kernel.org/netdev/CANn89iLXjstLx-=hFR0Rhav462+9pH3JTyE45t+nyiszKKCPTQ@mail.gmail.com/
> [2] https://elixir.bootlin.com/linux/latest/source/net/sched/sch_htb.c#L1042

Great, thanks for your help !

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion
  2024-04-16  8:05                     ` Davide Caratti
  2024-04-16  9:14                       ` Eric Dumazet
@ 2024-04-16 10:30                       ` Jamal Hadi Salim
  1 sibling, 0 replies; 21+ messages in thread
From: Jamal Hadi Salim @ 2024-04-16 10:30 UTC (permalink / raw)
  To: Davide Caratti
  Cc: Eric Dumazet, davem, kuba, pabeni, jiri, xiyou.wangcong, netdev,
	renmingshuai, Victor Nogueira

On Tue, Apr 16, 2024 at 4:05 AM Davide Caratti <dcaratti@redhat.com> wrote:
>
> hello,
>
> On Mon, Apr 15, 2024 at 11:15 PM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> >
> [...]
>
> > Victor sent the patch. As i mentioned earlier, we found a lockdep
> > false positive for the case of redirect from eth0->eth1->eth0
> > (potential fix attached)
>
> I tried a similar approach some months ago [1],  but  _ like Eric
> noticed  _ it might slowdown qdisc_destroy() too much because of the
> call to synchronize_rcu(). Maybe the key unregistering can be done
> later (e.g. when the qdisc is freed) ?
>
> [1] https://lore.kernel.org/netdev/73065927a49619fcd60e5b765c929f899a66cd1a.1701853200.git.dcaratti@redhat.com/

I wish i'd remembered this ;->  It is exactly the same scenario.
Triggered  example(eth0-->eth1-->eth0) as such:

tc qdisc add dev eth0 root handle 1: htb default 30
tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \
     action mirred egress redirect dev eth1

tc qdisc add dev eth1 root handle 1: htb default 30
tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \
     action mirred egress redirect dev eth0

So leaving it to you Davide..

cheers,
jamal

> --
> davide
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2024-04-16 10:30 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-26 23:03 [PATCH RFC net 1/1] net/sched: Fix mirred to self recursion Jamal Hadi Salim
2024-03-27 13:23 ` Eric Dumazet
2024-03-27 22:57   ` Jamal Hadi Salim
2024-03-27 23:12     ` Jamal Hadi Salim
2024-04-02  2:00       ` renmingshuai
2024-04-02 16:38         ` Jamal Hadi Salim
2024-04-02 16:47     ` Eric Dumazet
2024-04-02 17:35       ` Jamal Hadi Salim
2024-04-10 20:30         ` Jamal Hadi Salim
2024-04-15  9:20           ` Eric Dumazet
2024-04-15  9:29             ` Eric Dumazet
2024-04-15 13:59             ` Jamal Hadi Salim
2024-04-15 14:01               ` Jamal Hadi Salim
2024-04-15 14:11                 ` Eric Dumazet
2024-04-15 14:17                   ` Jamal Hadi Salim
2024-04-15 21:14                   ` Jamal Hadi Salim
2024-04-16  8:05                     ` Davide Caratti
2024-04-16  9:14                       ` Eric Dumazet
2024-04-16  9:28                         ` Davide Caratti
2024-04-16  9:37                           ` Eric Dumazet
2024-04-16 10:30                       ` Jamal Hadi Salim

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.