* [Patch net] net_sched: add a temporary refcnt for struct tcindex_data
@ 2020-03-28 19:12 Cong Wang
2020-03-30 21:35 ` Paul E. McKenney
2020-04-01 18:07 ` David Miller
0 siblings, 2 replies; 7+ messages in thread
From: Cong Wang @ 2020-03-28 19:12 UTC (permalink / raw)
To: netdev
Cc: Cong Wang, syzbot+46f513c3033d592409d2, Thomas Gleixner,
Paul E . McKenney, Jamal Hadi Salim, Jiri Pirko
Although we intentionally use an ordered workqueue for all tc
filter works, the ordering is not guaranteed by RCU work,
given that tcf_queue_work() is esstenially a call_rcu().
This problem is demostrated by Thomas:
CPU 0:
tcf_queue_work()
tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
-> Migration to CPU 1
CPU 1:
tcf_queue_work(&p->rwork, tcindex_destroy_work);
so the 2nd work could be queued before the 1st one, which leads
to a free-after-free.
Enforcing this order in RCU work is hard as it requires to change
RCU code too. Fortunately we can workaround this problem in tcindex
filter by taking a temporary refcnt, we only refcnt it right before
we begin to destroy it. This simplifies the code a lot as a full
refcnt requires much more changes in tcindex_set_parms().
Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com
Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()")
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
net/sched/cls_tcindex.c | 44 +++++++++++++++++++++++++++++++++++------
1 file changed, 38 insertions(+), 6 deletions(-)
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 9904299424a1..065345832a69 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -11,6 +11,7 @@
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/refcount.h>
#include <net/act_api.h>
#include <net/netlink.h>
#include <net/pkt_cls.h>
@@ -26,9 +27,12 @@
#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
+struct tcindex_data;
+
struct tcindex_filter_result {
struct tcf_exts exts;
struct tcf_result res;
+ struct tcindex_data *p;
struct rcu_work rwork;
};
@@ -49,6 +53,7 @@ struct tcindex_data {
u32 hash; /* hash table size; 0 if undefined */
u32 alloc_hash; /* allocated size */
u32 fall_through; /* 0: only classify if explicit match */
+ refcount_t refcnt; /* a temporary refcnt for perfect hash */
struct rcu_work rwork;
};
@@ -57,6 +62,20 @@ static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
return tcf_exts_has_actions(&r->exts) || r->res.classid;
}
+static void tcindex_data_get(struct tcindex_data *p)
+{
+ refcount_inc(&p->refcnt);
+}
+
+static void tcindex_data_put(struct tcindex_data *p)
+{
+ if (refcount_dec_and_test(&p->refcnt)) {
+ kfree(p->perfect);
+ kfree(p->h);
+ kfree(p);
+ }
+}
+
static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
u16 key)
{
@@ -141,6 +160,7 @@ static void __tcindex_destroy_rexts(struct tcindex_filter_result *r)
{
tcf_exts_destroy(&r->exts);
tcf_exts_put_net(&r->exts);
+ tcindex_data_put(r->p);
}
static void tcindex_destroy_rexts_work(struct work_struct *work)
@@ -212,6 +232,8 @@ static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last,
else
__tcindex_destroy_fexts(f);
} else {
+ tcindex_data_get(p);
+
if (tcf_exts_get_net(&r->exts))
tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
else
@@ -228,9 +250,7 @@ static void tcindex_destroy_work(struct work_struct *work)
struct tcindex_data,
rwork);
- kfree(p->perfect);
- kfree(p->h);
- kfree(p);
+ tcindex_data_put(p);
}
static inline int
@@ -248,9 +268,11 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
};
static int tcindex_filter_result_init(struct tcindex_filter_result *r,
+ struct tcindex_data *p,
struct net *net)
{
memset(r, 0, sizeof(*r));
+ r->p = p;
return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT,
TCA_TCINDEX_POLICE);
}
@@ -290,6 +312,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp)
TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
if (err < 0)
goto errout;
+ cp->perfect[i].p = cp;
}
return 0;
@@ -334,6 +357,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
cp->alloc_hash = p->alloc_hash;
cp->fall_through = p->fall_through;
cp->tp = tp;
+ refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */
if (tb[TCA_TCINDEX_HASH])
cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
@@ -366,7 +390,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
}
cp->h = p->h;
- err = tcindex_filter_result_init(&new_filter_result, net);
+ err = tcindex_filter_result_init(&new_filter_result, cp, net);
if (err < 0)
goto errout_alloc;
if (old_r)
@@ -434,7 +458,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
goto errout_alloc;
f->key = handle;
f->next = NULL;
- err = tcindex_filter_result_init(&f->result, net);
+ err = tcindex_filter_result_init(&f->result, cp, net);
if (err < 0) {
kfree(f);
goto errout_alloc;
@@ -447,7 +471,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
}
if (old_r && old_r != r) {
- err = tcindex_filter_result_init(old_r, net);
+ err = tcindex_filter_result_init(old_r, cp, net);
if (err < 0) {
kfree(f);
goto errout_alloc;
@@ -571,6 +595,14 @@ static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held,
for (i = 0; i < p->hash; i++) {
struct tcindex_filter_result *r = p->perfect + i;
+ /* tcf_queue_work() does not guarantee the ordering we
+ * want, so we have to take this refcnt temporarily to
+ * ensure 'p' is freed after all tcindex_filter_result
+ * here. Imperfect hash does not need this, because it
+ * uses linked lists rather than an array.
+ */
+ tcindex_data_get(p);
+
tcf_unbind_filter(tp, &r->res);
if (tcf_exts_get_net(&r->exts))
tcf_queue_work(&r->rwork,
--
2.21.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [Patch net] net_sched: add a temporary refcnt for struct tcindex_data
2020-03-28 19:12 [Patch net] net_sched: add a temporary refcnt for struct tcindex_data Cong Wang
@ 2020-03-30 21:35 ` Paul E. McKenney
2020-03-30 23:24 ` Cong Wang
2020-04-01 18:07 ` David Miller
1 sibling, 1 reply; 7+ messages in thread
From: Paul E. McKenney @ 2020-03-30 21:35 UTC (permalink / raw)
To: Cong Wang
Cc: netdev, syzbot+46f513c3033d592409d2, Thomas Gleixner,
Jamal Hadi Salim, Jiri Pirko
On Sat, Mar 28, 2020 at 12:12:59PM -0700, Cong Wang wrote:
> Although we intentionally use an ordered workqueue for all tc
> filter works, the ordering is not guaranteed by RCU work,
> given that tcf_queue_work() is esstenially a call_rcu().
>
> This problem is demostrated by Thomas:
>
> CPU 0:
> tcf_queue_work()
> tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
>
> -> Migration to CPU 1
>
> CPU 1:
> tcf_queue_work(&p->rwork, tcindex_destroy_work);
>
> so the 2nd work could be queued before the 1st one, which leads
> to a free-after-free.
>
> Enforcing this order in RCU work is hard as it requires to change
> RCU code too. Fortunately we can workaround this problem in tcindex
> filter by taking a temporary refcnt, we only refcnt it right before
> we begin to destroy it. This simplifies the code a lot as a full
> refcnt requires much more changes in tcindex_set_parms().
>
> Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com
> Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()")
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Paul E. McKenney <paulmck@kernel.org>
> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> Cc: Jiri Pirko <jiri@resnulli.us>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Looks plausible, but what did you do to verify that the structures
were in fact being freed? See below for more detail.
Thanx, Paul
> ---
> net/sched/cls_tcindex.c | 44 +++++++++++++++++++++++++++++++++++------
> 1 file changed, 38 insertions(+), 6 deletions(-)
>
> diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
> index 9904299424a1..065345832a69 100644
> --- a/net/sched/cls_tcindex.c
> +++ b/net/sched/cls_tcindex.c
> @@ -11,6 +11,7 @@
> #include <linux/skbuff.h>
> #include <linux/errno.h>
> #include <linux/slab.h>
> +#include <linux/refcount.h>
> #include <net/act_api.h>
> #include <net/netlink.h>
> #include <net/pkt_cls.h>
> @@ -26,9 +27,12 @@
> #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
>
>
> +struct tcindex_data;
> +
> struct tcindex_filter_result {
> struct tcf_exts exts;
> struct tcf_result res;
> + struct tcindex_data *p;
> struct rcu_work rwork;
> };
>
> @@ -49,6 +53,7 @@ struct tcindex_data {
> u32 hash; /* hash table size; 0 if undefined */
> u32 alloc_hash; /* allocated size */
> u32 fall_through; /* 0: only classify if explicit match */
> + refcount_t refcnt; /* a temporary refcnt for perfect hash */
> struct rcu_work rwork;
> };
>
> @@ -57,6 +62,20 @@ static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
> return tcf_exts_has_actions(&r->exts) || r->res.classid;
> }
>
> +static void tcindex_data_get(struct tcindex_data *p)
> +{
> + refcount_inc(&p->refcnt);
> +}
> +
> +static void tcindex_data_put(struct tcindex_data *p)
> +{
> + if (refcount_dec_and_test(&p->refcnt)) {
> + kfree(p->perfect);
> + kfree(p->h);
> + kfree(p);
> + }
> +}
> +
> static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
> u16 key)
> {
> @@ -141,6 +160,7 @@ static void __tcindex_destroy_rexts(struct tcindex_filter_result *r)
> {
> tcf_exts_destroy(&r->exts);
> tcf_exts_put_net(&r->exts);
> + tcindex_data_put(r->p);
> }
>
> static void tcindex_destroy_rexts_work(struct work_struct *work)
> @@ -212,6 +232,8 @@ static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last,
> else
> __tcindex_destroy_fexts(f);
> } else {
> + tcindex_data_get(p);
Good! You need this one to prevent the counter prematurely reaching zero.
> +
> if (tcf_exts_get_net(&r->exts))
> tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
> else
> @@ -228,9 +250,7 @@ static void tcindex_destroy_work(struct work_struct *work)
> struct tcindex_data,
> rwork);
>
> - kfree(p->perfect);
> - kfree(p->h);
> - kfree(p);
> + tcindex_data_put(p);
But what did you do to verify that the counter actually reaches zero?
> }
>
> static inline int
> @@ -248,9 +268,11 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
> };
>
> static int tcindex_filter_result_init(struct tcindex_filter_result *r,
> + struct tcindex_data *p,
> struct net *net)
> {
> memset(r, 0, sizeof(*r));
> + r->p = p;
> return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT,
> TCA_TCINDEX_POLICE);
> }
> @@ -290,6 +312,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp)
> TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
> if (err < 0)
> goto errout;
> + cp->perfect[i].p = cp;
> }
>
> return 0;
> @@ -334,6 +357,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
> cp->alloc_hash = p->alloc_hash;
> cp->fall_through = p->fall_through;
> cp->tp = tp;
> + refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */
The bit about checking that the counter reaches zero is underscored by the
apparent initialization to the value 1.
> if (tb[TCA_TCINDEX_HASH])
> cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
> @@ -366,7 +390,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
> }
> cp->h = p->h;
>
> - err = tcindex_filter_result_init(&new_filter_result, net);
> + err = tcindex_filter_result_init(&new_filter_result, cp, net);
> if (err < 0)
> goto errout_alloc;
> if (old_r)
> @@ -434,7 +458,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
> goto errout_alloc;
> f->key = handle;
> f->next = NULL;
> - err = tcindex_filter_result_init(&f->result, net);
> + err = tcindex_filter_result_init(&f->result, cp, net);
> if (err < 0) {
> kfree(f);
> goto errout_alloc;
> @@ -447,7 +471,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
> }
>
> if (old_r && old_r != r) {
> - err = tcindex_filter_result_init(old_r, net);
> + err = tcindex_filter_result_init(old_r, cp, net);
> if (err < 0) {
> kfree(f);
> goto errout_alloc;
> @@ -571,6 +595,14 @@ static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held,
> for (i = 0; i < p->hash; i++) {
> struct tcindex_filter_result *r = p->perfect + i;
>
> + /* tcf_queue_work() does not guarantee the ordering we
> + * want, so we have to take this refcnt temporarily to
> + * ensure 'p' is freed after all tcindex_filter_result
> + * here. Imperfect hash does not need this, because it
> + * uses linked lists rather than an array.
> + */
> + tcindex_data_get(p);
> +
> tcf_unbind_filter(tp, &r->res);
> if (tcf_exts_get_net(&r->exts))
> tcf_queue_work(&r->rwork,
> --
> 2.21.1
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [Patch net] net_sched: add a temporary refcnt for struct tcindex_data
2020-03-30 21:35 ` Paul E. McKenney
@ 2020-03-30 23:24 ` Cong Wang
2020-03-31 2:30 ` Paul E. McKenney
0 siblings, 1 reply; 7+ messages in thread
From: Cong Wang @ 2020-03-30 23:24 UTC (permalink / raw)
To: Paul E . McKenney
Cc: Linux Kernel Network Developers, syzbot, Thomas Gleixner,
Jamal Hadi Salim, Jiri Pirko
On Mon, Mar 30, 2020 at 2:35 PM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Sat, Mar 28, 2020 at 12:12:59PM -0700, Cong Wang wrote:
> > Although we intentionally use an ordered workqueue for all tc
> > filter works, the ordering is not guaranteed by RCU work,
> > given that tcf_queue_work() is esstenially a call_rcu().
> >
> > This problem is demostrated by Thomas:
> >
> > CPU 0:
> > tcf_queue_work()
> > tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
> >
> > -> Migration to CPU 1
> >
> > CPU 1:
> > tcf_queue_work(&p->rwork, tcindex_destroy_work);
> >
> > so the 2nd work could be queued before the 1st one, which leads
> > to a free-after-free.
> >
> > Enforcing this order in RCU work is hard as it requires to change
> > RCU code too. Fortunately we can workaround this problem in tcindex
> > filter by taking a temporary refcnt, we only refcnt it right before
> > we begin to destroy it. This simplifies the code a lot as a full
> > refcnt requires much more changes in tcindex_set_parms().
> >
> > Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com
> > Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()")
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Paul E. McKenney <paulmck@kernel.org>
> > Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> > Cc: Jiri Pirko <jiri@resnulli.us>
> > Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
>
> Looks plausible, but what did you do to verify that the structures
> were in fact being freed? See below for more detail.
I ran the syzbot reproducer for about 20 minutes, there was no
memory leak reported after scanning.
Thanks.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [Patch net] net_sched: add a temporary refcnt for struct tcindex_data
2020-03-30 23:24 ` Cong Wang
@ 2020-03-31 2:30 ` Paul E. McKenney
2020-03-31 2:54 ` Cong Wang
0 siblings, 1 reply; 7+ messages in thread
From: Paul E. McKenney @ 2020-03-31 2:30 UTC (permalink / raw)
To: Cong Wang
Cc: Linux Kernel Network Developers, syzbot, Thomas Gleixner,
Jamal Hadi Salim, Jiri Pirko
On Mon, Mar 30, 2020 at 04:24:42PM -0700, Cong Wang wrote:
> On Mon, Mar 30, 2020 at 2:35 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > On Sat, Mar 28, 2020 at 12:12:59PM -0700, Cong Wang wrote:
> > > Although we intentionally use an ordered workqueue for all tc
> > > filter works, the ordering is not guaranteed by RCU work,
> > > given that tcf_queue_work() is esstenially a call_rcu().
> > >
> > > This problem is demostrated by Thomas:
> > >
> > > CPU 0:
> > > tcf_queue_work()
> > > tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
> > >
> > > -> Migration to CPU 1
> > >
> > > CPU 1:
> > > tcf_queue_work(&p->rwork, tcindex_destroy_work);
> > >
> > > so the 2nd work could be queued before the 1st one, which leads
> > > to a free-after-free.
> > >
> > > Enforcing this order in RCU work is hard as it requires to change
> > > RCU code too. Fortunately we can workaround this problem in tcindex
> > > filter by taking a temporary refcnt, we only refcnt it right before
> > > we begin to destroy it. This simplifies the code a lot as a full
> > > refcnt requires much more changes in tcindex_set_parms().
> > >
> > > Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com
> > > Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()")
> > > Cc: Thomas Gleixner <tglx@linutronix.de>
> > > Cc: Paul E. McKenney <paulmck@kernel.org>
> > > Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> > > Cc: Jiri Pirko <jiri@resnulli.us>
> > > Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> >
> > Looks plausible, but what did you do to verify that the structures
> > were in fact being freed? See below for more detail.
>
> I ran the syzbot reproducer for about 20 minutes, there was no
> memory leak reported after scanning.
And if you (say) set the initial reference count to two instead of one,
there is a memory leak reported, correct?
Thanx, Paul
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [Patch net] net_sched: add a temporary refcnt for struct tcindex_data
2020-03-31 2:30 ` Paul E. McKenney
@ 2020-03-31 2:54 ` Cong Wang
2020-03-31 13:30 ` Paul E. McKenney
0 siblings, 1 reply; 7+ messages in thread
From: Cong Wang @ 2020-03-31 2:54 UTC (permalink / raw)
To: Paul E . McKenney
Cc: Linux Kernel Network Developers, syzbot, Thomas Gleixner,
Jamal Hadi Salim, Jiri Pirko
On Mon, Mar 30, 2020 at 7:30 PM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Mon, Mar 30, 2020 at 04:24:42PM -0700, Cong Wang wrote:
> > On Mon, Mar 30, 2020 at 2:35 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> > >
> > > On Sat, Mar 28, 2020 at 12:12:59PM -0700, Cong Wang wrote:
> > > > Although we intentionally use an ordered workqueue for all tc
> > > > filter works, the ordering is not guaranteed by RCU work,
> > > > given that tcf_queue_work() is esstenially a call_rcu().
> > > >
> > > > This problem is demostrated by Thomas:
> > > >
> > > > CPU 0:
> > > > tcf_queue_work()
> > > > tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
> > > >
> > > > -> Migration to CPU 1
> > > >
> > > > CPU 1:
> > > > tcf_queue_work(&p->rwork, tcindex_destroy_work);
> > > >
> > > > so the 2nd work could be queued before the 1st one, which leads
> > > > to a free-after-free.
> > > >
> > > > Enforcing this order in RCU work is hard as it requires to change
> > > > RCU code too. Fortunately we can workaround this problem in tcindex
> > > > filter by taking a temporary refcnt, we only refcnt it right before
> > > > we begin to destroy it. This simplifies the code a lot as a full
> > > > refcnt requires much more changes in tcindex_set_parms().
> > > >
> > > > Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com
> > > > Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()")
> > > > Cc: Thomas Gleixner <tglx@linutronix.de>
> > > > Cc: Paul E. McKenney <paulmck@kernel.org>
> > > > Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> > > > Cc: Jiri Pirko <jiri@resnulli.us>
> > > > Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> > >
> > > Looks plausible, but what did you do to verify that the structures
> > > were in fact being freed? See below for more detail.
> >
> > I ran the syzbot reproducer for about 20 minutes, there was no
> > memory leak reported after scanning.
>
> And if you (say) set the initial reference count to two instead of one,
> there is a memory leak reported, correct?
No, I didn't do an A/B test. I just added a printk right before the kfree(),
if it helps to convince you, here is one portion of the kernel log:
[ 39.159298] a.out (703) used greatest stack depth: 11624 bytes left
[ 39.166365] a.out (701) used greatest stack depth: 11352 bytes left
[ 39.453257] freeing struct tcindex_data.
[ 39.573554] freeing struct tcindex_data.
[ 39.681540] freeing struct tcindex_data.
[ 39.781158] freeing struct tcindex_data.
[ 39.877726] freeing struct tcindex_data.
[ 39.985515] freeing struct tcindex_data.
[ 40.097687] freeing struct tcindex_data.
[ 40.213691] freeing struct tcindex_data.
[ 40.271465] device bridge_slave_1 left promiscuous mode
[ 40.274078] bridge0: port 2(bridge_slave_1) entered disabled state
[ 40.297258] device bridge_slave_0 left promiscuous mode
[ 40.299377] bridge0: port 1(bridge_slave_0) entered disabled state
[ 40.733355] device hsr_slave_0 left promiscuous mode
[ 40.749322] device hsr_slave_1 left promiscuous mode
[ 40.784220] team0 (unregistering): Port device team_slave_1 removed
[ 40.792641] team0 (unregistering): Port device team_slave_0 removed
[ 40.806302] bond0 (unregistering): (slave bond_slave_1): Releasing
backup interface
[ 40.836972] bond0 (unregistering): (slave bond_slave_0): Releasing
backup interface
[ 40.931688] bond0 (unregistering): Released all slaves
[ 44.149970] freeing struct tcindex_data.
[ 44.159568] freeing struct tcindex_data.
[ 44.172786] freeing struct tcindex_data.
[ 44.813214] freeing struct tcindex_data.
[ 44.821857] freeing struct tcindex_data.
[ 44.825064] freeing struct tcindex_data.
[ 44.826889] freeing struct tcindex_data.
[ 45.294254] freeing struct tcindex_data.
[ 45.297980] freeing struct tcindex_data.
[ 45.300623] freeing struct tcindex_data.
And no memory leak of course:
[root@localhost tmp]# echo scan > /sys/kernel/debug/kmemleak
[root@localhost tmp]# echo scan > /sys/kernel/debug/kmemleak
[root@localhost tmp]# cat /sys/kernel/debug/kmemleak
Thanks.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [Patch net] net_sched: add a temporary refcnt for struct tcindex_data
2020-03-31 2:54 ` Cong Wang
@ 2020-03-31 13:30 ` Paul E. McKenney
0 siblings, 0 replies; 7+ messages in thread
From: Paul E. McKenney @ 2020-03-31 13:30 UTC (permalink / raw)
To: Cong Wang
Cc: Linux Kernel Network Developers, syzbot, Thomas Gleixner,
Jamal Hadi Salim, Jiri Pirko
On Mon, Mar 30, 2020 at 07:54:09PM -0700, Cong Wang wrote:
> On Mon, Mar 30, 2020 at 7:30 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > On Mon, Mar 30, 2020 at 04:24:42PM -0700, Cong Wang wrote:
> > > On Mon, Mar 30, 2020 at 2:35 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> > > >
> > > > On Sat, Mar 28, 2020 at 12:12:59PM -0700, Cong Wang wrote:
> > > > > Although we intentionally use an ordered workqueue for all tc
> > > > > filter works, the ordering is not guaranteed by RCU work,
> > > > > given that tcf_queue_work() is esstenially a call_rcu().
> > > > >
> > > > > This problem is demostrated by Thomas:
> > > > >
> > > > > CPU 0:
> > > > > tcf_queue_work()
> > > > > tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
> > > > >
> > > > > -> Migration to CPU 1
> > > > >
> > > > > CPU 1:
> > > > > tcf_queue_work(&p->rwork, tcindex_destroy_work);
> > > > >
> > > > > so the 2nd work could be queued before the 1st one, which leads
> > > > > to a free-after-free.
> > > > >
> > > > > Enforcing this order in RCU work is hard as it requires to change
> > > > > RCU code too. Fortunately we can workaround this problem in tcindex
> > > > > filter by taking a temporary refcnt, we only refcnt it right before
> > > > > we begin to destroy it. This simplifies the code a lot as a full
> > > > > refcnt requires much more changes in tcindex_set_parms().
> > > > >
> > > > > Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com
> > > > > Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()")
> > > > > Cc: Thomas Gleixner <tglx@linutronix.de>
> > > > > Cc: Paul E. McKenney <paulmck@kernel.org>
> > > > > Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> > > > > Cc: Jiri Pirko <jiri@resnulli.us>
> > > > > Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> > > >
> > > > Looks plausible, but what did you do to verify that the structures
> > > > were in fact being freed? See below for more detail.
> > >
> > > I ran the syzbot reproducer for about 20 minutes, there was no
> > > memory leak reported after scanning.
> >
> > And if you (say) set the initial reference count to two instead of one,
> > there is a memory leak reported, correct?
>
> No, I didn't do an A/B test. I just added a printk right before the kfree(),
> if it helps to convince you, here is one portion of the kernel log:
>
> [ 39.159298] a.out (703) used greatest stack depth: 11624 bytes left
> [ 39.166365] a.out (701) used greatest stack depth: 11352 bytes left
> [ 39.453257] freeing struct tcindex_data.
> [ 39.573554] freeing struct tcindex_data.
> [ 39.681540] freeing struct tcindex_data.
> [ 39.781158] freeing struct tcindex_data.
> [ 39.877726] freeing struct tcindex_data.
> [ 39.985515] freeing struct tcindex_data.
> [ 40.097687] freeing struct tcindex_data.
> [ 40.213691] freeing struct tcindex_data.
> [ 40.271465] device bridge_slave_1 left promiscuous mode
> [ 40.274078] bridge0: port 2(bridge_slave_1) entered disabled state
> [ 40.297258] device bridge_slave_0 left promiscuous mode
> [ 40.299377] bridge0: port 1(bridge_slave_0) entered disabled state
> [ 40.733355] device hsr_slave_0 left promiscuous mode
> [ 40.749322] device hsr_slave_1 left promiscuous mode
> [ 40.784220] team0 (unregistering): Port device team_slave_1 removed
> [ 40.792641] team0 (unregistering): Port device team_slave_0 removed
> [ 40.806302] bond0 (unregistering): (slave bond_slave_1): Releasing
> backup interface
> [ 40.836972] bond0 (unregistering): (slave bond_slave_0): Releasing
> backup interface
> [ 40.931688] bond0 (unregistering): Released all slaves
> [ 44.149970] freeing struct tcindex_data.
> [ 44.159568] freeing struct tcindex_data.
> [ 44.172786] freeing struct tcindex_data.
> [ 44.813214] freeing struct tcindex_data.
> [ 44.821857] freeing struct tcindex_data.
> [ 44.825064] freeing struct tcindex_data.
> [ 44.826889] freeing struct tcindex_data.
> [ 45.294254] freeing struct tcindex_data.
> [ 45.297980] freeing struct tcindex_data.
> [ 45.300623] freeing struct tcindex_data.
>
> And no memory leak of course:
>
> [root@localhost tmp]# echo scan > /sys/kernel/debug/kmemleak
> [root@localhost tmp]# echo scan > /sys/kernel/debug/kmemleak
> [root@localhost tmp]# cat /sys/kernel/debug/kmemleak
Much more convincing, thank you!
Feel free to add:
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Thanx, Paul
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [Patch net] net_sched: add a temporary refcnt for struct tcindex_data
2020-03-28 19:12 [Patch net] net_sched: add a temporary refcnt for struct tcindex_data Cong Wang
2020-03-30 21:35 ` Paul E. McKenney
@ 2020-04-01 18:07 ` David Miller
1 sibling, 0 replies; 7+ messages in thread
From: David Miller @ 2020-04-01 18:07 UTC (permalink / raw)
To: xiyou.wangcong
Cc: netdev, syzbot+46f513c3033d592409d2, tglx, paulmck, jhs, jiri
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Sat, 28 Mar 2020 12:12:59 -0700
> Although we intentionally use an ordered workqueue for all tc
> filter works, the ordering is not guaranteed by RCU work,
> given that tcf_queue_work() is esstenially a call_rcu().
>
> This problem is demostrated by Thomas:
>
> CPU 0:
> tcf_queue_work()
> tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
>
> -> Migration to CPU 1
>
> CPU 1:
> tcf_queue_work(&p->rwork, tcindex_destroy_work);
>
> so the 2nd work could be queued before the 1st one, which leads
> to a free-after-free.
>
> Enforcing this order in RCU work is hard as it requires to change
> RCU code too. Fortunately we can workaround this problem in tcindex
> filter by taking a temporary refcnt, we only refcnt it right before
> we begin to destroy it. This simplifies the code a lot as a full
> refcnt requires much more changes in tcindex_set_parms().
>
> Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com
> Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()")
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Paul E. McKenney <paulmck@kernel.org>
> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> Cc: Jiri Pirko <jiri@resnulli.us>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Applied and queued up for -stable, thanks.
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2020-04-01 18:07 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-28 19:12 [Patch net] net_sched: add a temporary refcnt for struct tcindex_data Cong Wang
2020-03-30 21:35 ` Paul E. McKenney
2020-03-30 23:24 ` Cong Wang
2020-03-31 2:30 ` Paul E. McKenney
2020-03-31 2:54 ` Cong Wang
2020-03-31 13:30 ` Paul E. McKenney
2020-04-01 18:07 ` David Miller
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.