All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/1] ipset patch for nf
@ 2016-02-24 20:19 Jozsef Kadlecsik
  2016-02-24 20:19 ` [PATCH 1/1] netfilter: ipset: Fix set:list type crash when flush/dump set in parallel Jozsef Kadlecsik
  0 siblings, 1 reply; 7+ messages in thread
From: Jozsef Kadlecsik @ 2016-02-24 20:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: Pablo Neira Ayuso

Hi Pablo,

Please apply the next patch against the nf tree:

- Deniz Eren reported that parallel flush/dump of list:set type of sets
  can lead to kernel crash. The bug was due to non-RCU compatible flushing,
  listing in the set type, fixed by me.

The following changes since commit 5cc6ce9ff27565949a1001a2889a8dd9fd09e772:

  netfilter: nft_counter: fix erroneous return values (2016-02-08 13:05:02 +0100)

are available in the git repository at:

  git://blackhole.kfki.hu/nf master

for you to fetch changes up to 45040978c8994d1401baf5cc5ac71c1495d4e120:

  netfilter: ipset: Fix set:list type crash when flush/dump set in parallel (2016-02-24 20:32:21 +0100)

----------------------------------------------------------------
Jozsef Kadlecsik (1):
      netfilter: ipset: Fix set:list type crash when flush/dump set in parallel

 net/netfilter/ipset/ip_set_core.c     |  3 +++
 net/netfilter/ipset/ip_set_list_set.c | 55 ++++++++++++++++++++++--------------------------
 2 files changed, 28 insertions(+), 30 deletions(-)

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/1] netfilter: ipset: Fix set:list type crash when flush/dump set in parallel
  2016-02-24 20:19 [PATCH 0/1] ipset patch for nf Jozsef Kadlecsik
@ 2016-02-24 20:19 ` Jozsef Kadlecsik
  2016-02-29 12:22   ` Pablo Neira Ayuso
  0 siblings, 1 reply; 7+ messages in thread
From: Jozsef Kadlecsik @ 2016-02-24 20:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: Pablo Neira Ayuso

Flushing/listing entries was not RCU safe, so parallel flush/dump
could lead to kernel crash. Bug reported by Deniz Eren.

Fixes netfilter bugzilla id #1050.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 net/netfilter/ipset/ip_set_core.c     |  3 ++
 net/netfilter/ipset/ip_set_list_set.c | 55 ++++++++++++++++-------------------
 2 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 95db43f..7e6568c 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -985,6 +985,9 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
 	if (unlikely(protocol_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
+	/* Must wait for flush to be really finished in list:set */
+	rcu_barrier();
+
 	/* Commands are serialized and references are
 	 * protected by the ip_set_ref_lock.
 	 * External systems (i.e. xt_set) must call
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index bbede95..24c6c19 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -30,6 +30,7 @@ MODULE_ALIAS("ip_set_list:set");
 struct set_elem {
 	struct rcu_head rcu;
 	struct list_head list;
+	struct ip_set *set;	/* Sigh, in order to cleanup reference */
 	ip_set_id_t id;
 } __aligned(__alignof__(u64));
 
@@ -151,30 +152,29 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
 /* Userspace interfaces: we are protected by the nfnl mutex */
 
 static void
-__list_set_del(struct ip_set *set, struct set_elem *e)
+__list_set_del_rcu(struct rcu_head * rcu)
 {
+	struct set_elem *e = container_of(rcu, struct set_elem, rcu);
+	struct ip_set *set = e->set;
 	struct list_set *map = set->data;
 
 	ip_set_put_byindex(map->net, e->id);
-	/* We may call it, because we don't have a to be destroyed
-	 * extension which is used by the kernel.
-	 */
 	ip_set_ext_destroy(set, e);
-	kfree_rcu(e, rcu);
+	kfree(e);
 }
 
 static inline void
 list_set_del(struct ip_set *set, struct set_elem *e)
 {
 	list_del_rcu(&e->list);
-	__list_set_del(set, e);
+	call_rcu(&e->rcu, __list_set_del_rcu);
 }
 
 static inline void
-list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
+list_set_replace(struct set_elem *e, struct set_elem *old)
 {
 	list_replace_rcu(&old->list, &e->list);
-	__list_set_del(set, old);
+	call_rcu(&old->rcu, __list_set_del_rcu);
 }
 
 static void
@@ -244,9 +244,6 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	struct set_elem *e, *n, *prev, *next;
 	bool flag_exist = flags & IPSET_FLAG_EXIST;
 
-	if (SET_WITH_TIMEOUT(set))
-		set_cleanup_entries(set);
-
 	/* Find where to add the new entry */
 	n = prev = next = NULL;
 	list_for_each_entry(e, &map->members, list) {
@@ -301,10 +298,11 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	if (!e)
 		return -ENOMEM;
 	e->id = d->id;
+	e->set = set;
 	INIT_LIST_HEAD(&e->list);
 	list_set_init_extensions(set, ext, e);
 	if (n)
-		list_set_replace(set, e, n);
+		list_set_replace(e, n);
 	else if (next)
 		list_add_tail_rcu(&e->list, &next->list);
 	else if (prev)
@@ -431,6 +429,7 @@ list_set_destroy(struct ip_set *set)
 
 	if (SET_WITH_TIMEOUT(set))
 		del_timer_sync(&map->gc);
+
 	list_for_each_entry_safe(e, n, &map->members, list) {
 		list_del(&e->list);
 		ip_set_put_byindex(map->net, e->id);
@@ -450,8 +449,10 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
 	struct set_elem *e;
 	u32 n = 0;
 
-	list_for_each_entry(e, &map->members, list)
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &map->members, list)
 		n++;
+	rcu_read_unlock();
 
 	nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
 	if (!nested)
@@ -483,33 +484,25 @@ list_set_list(const struct ip_set *set,
 	atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
 	if (!atd)
 		return -EMSGSIZE;
-	list_for_each_entry(e, &map->members, list) {
-		if (i == first)
-			break;
-		i++;
-	}
 
 	rcu_read_lock();
-	list_for_each_entry_from(e, &map->members, list) {
-		i++;
-		if (SET_WITH_TIMEOUT(set) &&
-		    ip_set_timeout_expired(ext_timeout(e, set)))
+	list_for_each_entry_rcu(e, &map->members, list) {
+		if (i < first ||
+		    (SET_WITH_TIMEOUT(set) &&
+		     ip_set_timeout_expired(ext_timeout(e, set)))) {
+			i++;
 			continue;
+		}
 		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
-		if (!nested) {
-			if (i == first) {
-				nla_nest_cancel(skb, atd);
-				ret = -EMSGSIZE;
-				goto out;
-			}
+		if (!nested)
 			goto nla_put_failure;
-		}
 		if (nla_put_string(skb, IPSET_ATTR_NAME,
 				   ip_set_name_byindex(map->net, e->id)))
 			goto nla_put_failure;
 		if (ip_set_put_extensions(skb, set, e, true))
 			goto nla_put_failure;
 		ipset_nest_end(skb, nested);
+		i++;
 	}
 
 	ipset_nest_end(skb, atd);
@@ -520,10 +513,12 @@ list_set_list(const struct ip_set *set,
 nla_put_failure:
 	nla_nest_cancel(skb, nested);
 	if (unlikely(i == first)) {
+		nla_nest_cancel(skb, atd);
 		cb->args[IPSET_CB_ARG0] = 0;
 		ret = -EMSGSIZE;
+	} else {
+		cb->args[IPSET_CB_ARG0] = i;
 	}
-	cb->args[IPSET_CB_ARG0] = i - 1;
 	ipset_nest_end(skb, atd);
 out:
 	rcu_read_unlock();
-- 
1.8.5.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/1] netfilter: ipset: Fix set:list type crash when flush/dump set in parallel
  2016-02-24 20:19 ` [PATCH 1/1] netfilter: ipset: Fix set:list type crash when flush/dump set in parallel Jozsef Kadlecsik
@ 2016-02-29 12:22   ` Pablo Neira Ayuso
  2016-02-29 12:47     ` Jozsef Kadlecsik
  0 siblings, 1 reply; 7+ messages in thread
From: Pablo Neira Ayuso @ 2016-02-29 12:22 UTC (permalink / raw)
  To: Jozsef Kadlecsik; +Cc: netfilter-devel

On Wed, Feb 24, 2016 at 09:19:26PM +0100, Jozsef Kadlecsik wrote:
> Flushing/listing entries was not RCU safe, so parallel flush/dump
> could lead to kernel crash. Bug reported by Deniz Eren.
> 
> Fixes netfilter bugzilla id #1050.
> 
> Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
> ---
>  net/netfilter/ipset/ip_set_core.c     |  3 ++
>  net/netfilter/ipset/ip_set_list_set.c | 55 ++++++++++++++++-------------------
>  2 files changed, 28 insertions(+), 30 deletions(-)
> 
> diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
> index 95db43f..7e6568c 100644
> --- a/net/netfilter/ipset/ip_set_core.c
> +++ b/net/netfilter/ipset/ip_set_core.c
> @@ -985,6 +985,9 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
>  	if (unlikely(protocol_failed(attr)))
>  		return -IPSET_ERR_PROTOCOL;
>  
> +	/* Must wait for flush to be really finished in list:set */
> +	rcu_barrier();

Jozsef, are you sure you need this rcu_barrier()?

This is waiting for rcu callback completion (ie. decrement the set
reference counter and releasing the extension and object itself).

The rcu read side should be safe when accessing old copies from the
dumping path when using call_rcu().

Let me know, thanks!

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/1] netfilter: ipset: Fix set:list type crash when flush/dump set in parallel
  2016-02-29 12:22   ` Pablo Neira Ayuso
@ 2016-02-29 12:47     ` Jozsef Kadlecsik
  2016-03-05 12:32       ` Pablo Neira Ayuso
  0 siblings, 1 reply; 7+ messages in thread
From: Jozsef Kadlecsik @ 2016-02-29 12:47 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel

On Mon, 29 Feb 2016, Pablo Neira Ayuso wrote:

> On Wed, Feb 24, 2016 at 09:19:26PM +0100, Jozsef Kadlecsik wrote:
> > Flushing/listing entries was not RCU safe, so parallel flush/dump
> > could lead to kernel crash. Bug reported by Deniz Eren.
> > 
> > Fixes netfilter bugzilla id #1050.
> > 
> > Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
> > ---
> >  net/netfilter/ipset/ip_set_core.c     |  3 ++
> >  net/netfilter/ipset/ip_set_list_set.c | 55 ++++++++++++++++-------------------
> >  2 files changed, 28 insertions(+), 30 deletions(-)
> > 
> > diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
> > index 95db43f..7e6568c 100644
> > --- a/net/netfilter/ipset/ip_set_core.c
> > +++ b/net/netfilter/ipset/ip_set_core.c
> > @@ -985,6 +985,9 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
> >  	if (unlikely(protocol_failed(attr)))
> >  		return -IPSET_ERR_PROTOCOL;
> >  
> > +	/* Must wait for flush to be really finished in list:set */
> > +	rcu_barrier();
> 
> Jozsef, are you sure you need this rcu_barrier()?
> 
> This is waiting for rcu callback completion (ie. decrement the set
> reference counter and releasing the extension and object itself).

Yes, that's exactly what it's waiting for, in the case of sets which are 
elements in set:list type of sets.

> The rcu read side should be safe when accessing old copies from the
> dumping path when using call_rcu().

Three competing actions play role here: flush/delete, destroy, dumping. 
The rcu_barrier() above waits for the flush/delete actions. The rcu read 
side has access to the set id only and must lookup the set by id in order 
to get the name, which is protected by a reference counter. The function 
ip_set_name_byindex() intentionally checks that the reference counter 
cannot be equal to zero and I believe it's an important internal sanity 
checking and I don't want to remove it. The problem was that the reference 
counter of the set was first decremented at flush/delete and so we could 
get a BUG_ON() crash in ip_set_name_byindex() when an ongoing dumping was 
processed. So the reference counter must be released last for a successful 
parallel dumping. If rcu_barrier() was not added to the destroy path, a 
"destroy" immediately following a "flush" can lead to the error "set is in 
use, cannot delete". I tested it and couldn't find a simpler way.

It is the destroy path for sets, so it should not be performance critical.

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/1] netfilter: ipset: Fix set:list type crash when flush/dump set in parallel
  2016-02-29 12:47     ` Jozsef Kadlecsik
@ 2016-03-05 12:32       ` Pablo Neira Ayuso
  2016-03-08 19:27         ` Jozsef Kadlecsik
  0 siblings, 1 reply; 7+ messages in thread
From: Pablo Neira Ayuso @ 2016-03-05 12:32 UTC (permalink / raw)
  To: Jozsef Kadlecsik; +Cc: netfilter-devel

Hi Jozsef,

On Mon, Feb 29, 2016 at 01:47:59PM +0100, Jozsef Kadlecsik wrote:
> On Mon, 29 Feb 2016, Pablo Neira Ayuso wrote:
> 
> > On Wed, Feb 24, 2016 at 09:19:26PM +0100, Jozsef Kadlecsik wrote:
> > > Flushing/listing entries was not RCU safe, so parallel flush/dump
> > > could lead to kernel crash. Bug reported by Deniz Eren.
> > > 
> > > Fixes netfilter bugzilla id #1050.
> > > 
> > > Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
> > > ---
> > >  net/netfilter/ipset/ip_set_core.c     |  3 ++
> > >  net/netfilter/ipset/ip_set_list_set.c | 55 ++++++++++++++++-------------------
> > >  2 files changed, 28 insertions(+), 30 deletions(-)
> > > 
> > > diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
> > > index 95db43f..7e6568c 100644
> > > --- a/net/netfilter/ipset/ip_set_core.c
> > > +++ b/net/netfilter/ipset/ip_set_core.c
> > > @@ -985,6 +985,9 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
> > >  	if (unlikely(protocol_failed(attr)))
> > >  		return -IPSET_ERR_PROTOCOL;
> > >  
> > > +	/* Must wait for flush to be really finished in list:set */
> > > +	rcu_barrier();
> > 
> > Jozsef, are you sure you need this rcu_barrier()?
> > 
> > This is waiting for rcu callback completion (ie. decrement the set
> > reference counter and releasing the extension and object itself).
> 
> Yes, that's exactly what it's waiting for, in the case of sets which are 
> elements in set:list type of sets.
> 
> > The rcu read side should be safe when accessing old copies from the
> > dumping path when using call_rcu().
> 
> Three competing actions play role here: flush/delete, destroy, dumping. 
> The rcu_barrier() above waits for the flush/delete actions. The rcu read 
> side has access to the set id only and must lookup the set by id in order 
> to get the name, which is protected by a reference counter. The function 
> ip_set_name_byindex() intentionally checks that the reference counter 
> cannot be equal to zero and I believe it's an important internal sanity 
> checking and I don't want to remove it. The problem was that the reference 
> counter of the set was first decremented at flush/delete and so we could 
> get a BUG_ON() crash in ip_set_name_byindex() when an ongoing dumping was 
> processed. So the reference counter must be released last for a successful 
> parallel dumping. If rcu_barrier() was not added to the destroy path, a 
> "destroy" immediately following a "flush" can lead to the error "set is in 
> use, cannot delete". I tested it and couldn't find a simpler way.

ip_set_destroy() calls ip_set_destroy_set() which releases the set
object and its content without waiting for the rcu grace period. I
think readers (dump path) may still be walking on the set by when
you're releasing this object.

Our nfnetlink dump path is lockless (rcu read side protected), so we
unpublish the set object and release objects via rcu callback when
operating from the control plane.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/1] netfilter: ipset: Fix set:list type crash when flush/dump set in parallel
  2016-03-05 12:32       ` Pablo Neira Ayuso
@ 2016-03-08 19:27         ` Jozsef Kadlecsik
  2016-03-10 18:27           ` Pablo Neira Ayuso
  0 siblings, 1 reply; 7+ messages in thread
From: Jozsef Kadlecsik @ 2016-03-08 19:27 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel

Hi Pablo,

On Sat, 5 Mar 2016, Pablo Neira Ayuso wrote:

> On Mon, Feb 29, 2016 at 01:47:59PM +0100, Jozsef Kadlecsik wrote:
> > On Mon, 29 Feb 2016, Pablo Neira Ayuso wrote:
> > 
> > > On Wed, Feb 24, 2016 at 09:19:26PM +0100, Jozsef Kadlecsik wrote:
> > > > Flushing/listing entries was not RCU safe, so parallel flush/dump
> > > > could lead to kernel crash. Bug reported by Deniz Eren.
> > > > 
> > > > Fixes netfilter bugzilla id #1050.
> > > > 
> > > > Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
> > > > ---
> > > >  net/netfilter/ipset/ip_set_core.c     |  3 ++
> > > >  net/netfilter/ipset/ip_set_list_set.c | 55 ++++++++++++++++-------------------
> > > >  2 files changed, 28 insertions(+), 30 deletions(-)
> > > > 
> > > > diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
> > > > index 95db43f..7e6568c 100644
> > > > --- a/net/netfilter/ipset/ip_set_core.c
> > > > +++ b/net/netfilter/ipset/ip_set_core.c
> > > > @@ -985,6 +985,9 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
> > > >  	if (unlikely(protocol_failed(attr)))
> > > >  		return -IPSET_ERR_PROTOCOL;
> > > >  
> > > > +	/* Must wait for flush to be really finished in list:set */
> > > > +	rcu_barrier();
> > > 
> > > Jozsef, are you sure you need this rcu_barrier()?
> > > 
> > > This is waiting for rcu callback completion (ie. decrement the set
> > > reference counter and releasing the extension and object itself).
> > 
> > Yes, that's exactly what it's waiting for, in the case of sets which are 
> > elements in set:list type of sets.
> > 
> > > The rcu read side should be safe when accessing old copies from the
> > > dumping path when using call_rcu().
> > 
> > Three competing actions play role here: flush/delete, destroy, dumping. 
> > The rcu_barrier() above waits for the flush/delete actions. The rcu read 
> > side has access to the set id only and must lookup the set by id in order 
> > to get the name, which is protected by a reference counter. The function 
> > ip_set_name_byindex() intentionally checks that the reference counter 
> > cannot be equal to zero and I believe it's an important internal sanity 
> > checking and I don't want to remove it. The problem was that the reference 
> > counter of the set was first decremented at flush/delete and so we could 
> > get a BUG_ON() crash in ip_set_name_byindex() when an ongoing dumping was 
> > processed. So the reference counter must be released last for a successful 
> > parallel dumping. If rcu_barrier() was not added to the destroy path, a 
> > "destroy" immediately following a "flush" can lead to the error "set is in 
> > use, cannot delete". I tested it and couldn't find a simpler way.
> 
> ip_set_destroy() calls ip_set_destroy_set() which releases the set
> object and its content without waiting for the rcu grace period. I
> think readers (dump path) may still be walking on the set by when
> you're releasing this object.
> 
> Our nfnetlink dump path is lockless (rcu read side protected), so we
> unpublish the set object and release objects via rcu callback when
> operating from the control plane.

Yes, but we are dealing with flush and then destroy, when the set we are 
flushing is a list:set type of set where the elements are sets. It is not 
possible to unpublish the flushed objects, because those are totally valid 
sets.

Before I sent my patch I tested it without rcu_barrier() and the destroy 
command failed (quite rightly) because 'Set is in use' - destroy has to 
wait for flush to be completed anyway.

After your email I have been thinking on it a lot and couldn't figure out 
a better way. Would it be OK for you to limit rcu_barrier() to the 
list:set case with something like:

	mutex_lock(&module_mutex);
	struct module *m = find_module("ip_set_list_set");
	if (m && module_refcount(m) > 0)
		rcu_barrier();
	mutex_unlock(&module_mutex);

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 1/1] netfilter: ipset: Fix set:list type crash when flush/dump set in parallel
  2016-03-08 19:27         ` Jozsef Kadlecsik
@ 2016-03-10 18:27           ` Pablo Neira Ayuso
  0 siblings, 0 replies; 7+ messages in thread
From: Pablo Neira Ayuso @ 2016-03-10 18:27 UTC (permalink / raw)
  To: Jozsef Kadlecsik; +Cc: netfilter-devel

On Tue, Mar 08, 2016 at 08:27:30PM +0100, Jozsef Kadlecsik wrote:
> Hi Pablo,
> 
> On Sat, 5 Mar 2016, Pablo Neira Ayuso wrote:
> 
> > On Mon, Feb 29, 2016 at 01:47:59PM +0100, Jozsef Kadlecsik wrote:
> > > On Mon, 29 Feb 2016, Pablo Neira Ayuso wrote:
> > > 
> > > > On Wed, Feb 24, 2016 at 09:19:26PM +0100, Jozsef Kadlecsik wrote:
> > > > > Flushing/listing entries was not RCU safe, so parallel flush/dump
> > > > > could lead to kernel crash. Bug reported by Deniz Eren.
> > > > > 
> > > > > Fixes netfilter bugzilla id #1050.
> > > > > 
> > > > > Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
> > > > > ---
> > > > >  net/netfilter/ipset/ip_set_core.c     |  3 ++
> > > > >  net/netfilter/ipset/ip_set_list_set.c | 55 ++++++++++++++++-------------------
> > > > >  2 files changed, 28 insertions(+), 30 deletions(-)
> > > > > 
> > > > > diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
> > > > > index 95db43f..7e6568c 100644
> > > > > --- a/net/netfilter/ipset/ip_set_core.c
> > > > > +++ b/net/netfilter/ipset/ip_set_core.c
> > > > > @@ -985,6 +985,9 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
> > > > >  	if (unlikely(protocol_failed(attr)))
> > > > >  		return -IPSET_ERR_PROTOCOL;
> > > > >  
> > > > > +	/* Must wait for flush to be really finished in list:set */
> > > > > +	rcu_barrier();
> > > > 
> > > > Jozsef, are you sure you need this rcu_barrier()?
> > > > 
> > > > This is waiting for rcu callback completion (ie. decrement the set
> > > > reference counter and releasing the extension and object itself).
> > > 
> > > Yes, that's exactly what it's waiting for, in the case of sets which are 
> > > elements in set:list type of sets.
> > > 
> > > > The rcu read side should be safe when accessing old copies from the
> > > > dumping path when using call_rcu().
> > > 
> > > Three competing actions play role here: flush/delete, destroy, dumping. 
> > > The rcu_barrier() above waits for the flush/delete actions. The rcu read 
> > > side has access to the set id only and must lookup the set by id in order 
> > > to get the name, which is protected by a reference counter. The function 
> > > ip_set_name_byindex() intentionally checks that the reference counter 
> > > cannot be equal to zero and I believe it's an important internal sanity 
> > > checking and I don't want to remove it. The problem was that the reference 
> > > counter of the set was first decremented at flush/delete and so we could 
> > > get a BUG_ON() crash in ip_set_name_byindex() when an ongoing dumping was 
> > > processed. So the reference counter must be released last for a successful 
> > > parallel dumping. If rcu_barrier() was not added to the destroy path, a 
> > > "destroy" immediately following a "flush" can lead to the error "set is in 
> > > use, cannot delete". I tested it and couldn't find a simpler way.
> > 
> > ip_set_destroy() calls ip_set_destroy_set() which releases the set
> > object and its content without waiting for the rcu grace period. I
> > think readers (dump path) may still be walking on the set by when
> > you're releasing this object.
> > 
> > Our nfnetlink dump path is lockless (rcu read side protected), so we
> > unpublish the set object and release objects via rcu callback when
> > operating from the control plane.
> 
> Yes, but we are dealing with flush and then destroy, when the set we are 
> flushing is a list:set type of set where the elements are sets. It is not 
> possible to unpublish the flushed objects, because those are totally valid 
> sets.
> 
> Before I sent my patch I tested it without rcu_barrier() and the destroy 
> command failed (quite rightly) because 'Set is in use' - destroy has to 
> wait for flush to be completed anyway.

OK, thanks for explaining. I don't find any better way at this moment
to handle the interdependencies that the set:list type creates with
this elements that are actually sets, so this rcu_barrier() guarantees
the in-order set releasing that you need not to trigger the bug.
Sorry, I misunderstood the goal of this and I just wanted to make sure
this rcu_barrier() was not actually hiding a real problem.

I have pulled this into the nf-next tree, thanks for your patience.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2016-03-10 18:27 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-02-24 20:19 [PATCH 0/1] ipset patch for nf Jozsef Kadlecsik
2016-02-24 20:19 ` [PATCH 1/1] netfilter: ipset: Fix set:list type crash when flush/dump set in parallel Jozsef Kadlecsik
2016-02-29 12:22   ` Pablo Neira Ayuso
2016-02-29 12:47     ` Jozsef Kadlecsik
2016-03-05 12:32       ` Pablo Neira Ayuso
2016-03-08 19:27         ` Jozsef Kadlecsik
2016-03-10 18:27           ` Pablo Neira Ayuso

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.