* [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases
@ 2017-07-26  0:09 Florian Westphal
  2017-07-26 10:34 ` Eric Dumazet
  2017-07-26 11:15 ` Pablo Neira Ayuso
  0 siblings, 2 replies; 5+ messages in thread
From: Florian Westphal @ 2017-07-26  0:09 UTC (permalink / raw)
  To: netfilter-devel; +Cc: Florian Westphal

Switch to a lockless lookup. The write side now also increments a
sequence counter.  On lookup, sample the counter value and only take
the lock if we did not find a match and the counter has changed.

This avoids the need to write to the private area in the normal
(lookup) case.

Note that we use the non-blocking variant (raw_seqcount_begin), i.e.
the read side will not wait for a writer to finish.

If we did not find a result, we fall back to taking the read lock.

The read lock is also used during dumps to ensure a consistent tree
walk.

A similar technique (rbtree + seqlock) was used by David Howells in
rxrpc.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nft_set_rbtree.c | 42 +++++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index bce5382f1d49..dee75fe8d862 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -19,8 +19,9 @@
 #include <net/netfilter/nf_tables.h>
 
 struct nft_rbtree {
-	rwlock_t		lock;
 	struct rb_root		root;
+	rwlock_t		lock;
+	seqcount_t		count;
 };
 
 struct nft_rbtree_elem {
@@ -40,8 +41,8 @@ static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
 	return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
 }
 
-static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
-			      const u32 *key, const struct nft_set_ext **ext)
+static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+				const u32 *key, const struct nft_set_ext **ext)
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
 	const struct nft_rbtree_elem *rbe, *interval = NULL;
@@ -50,15 +51,14 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
 	const void *this;
 	int d;
 
-	read_lock_bh(&priv->lock);
-	parent = priv->root.rb_node;
+	parent = rcu_dereference_raw(priv->root.rb_node);
 	while (parent != NULL) {
 		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
 
 		this = nft_set_ext_key(&rbe->ext);
 		d = memcmp(this, key, set->klen);
 		if (d < 0) {
-			parent = parent->rb_left;
+			parent = rcu_dereference_raw(parent->rb_left);
 			if (interval &&
 			    nft_rbtree_equal(set, this, interval) &&
 			    nft_rbtree_interval_end(this) &&
@@ -66,15 +66,14 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
 				continue;
 			interval = rbe;
 		} else if (d > 0)
-			parent = parent->rb_right;
+			parent = rcu_dereference_raw(parent->rb_right);
 		else {
 			if (!nft_set_elem_active(&rbe->ext, genmask)) {
-				parent = parent->rb_left;
+				parent = rcu_dereference_raw(parent->rb_left);
 				continue;
 			}
 			if (nft_rbtree_interval_end(rbe))
 				goto out;
-			read_unlock_bh(&priv->lock);
 
 			*ext = &rbe->ext;
 			return true;
@@ -84,15 +83,31 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
 	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
 	    nft_set_elem_active(&interval->ext, genmask) &&
 	    !nft_rbtree_interval_end(interval)) {
-		read_unlock_bh(&priv->lock);
 		*ext = &interval->ext;
 		return true;
 	}
 out:
-	read_unlock_bh(&priv->lock);
 	return false;
 }
 
+static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+			      const u32 *key, const struct nft_set_ext **ext)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+	unsigned int seq = raw_seqcount_begin(&priv->count);
+	bool ret;
+
+	ret = __nft_rbtree_lookup(net, set, key, ext);
+	if (ret || !read_seqcount_retry(&priv->count, seq))
+		return ret;
+
+	read_lock_bh(&priv->lock);
+	ret = __nft_rbtree_lookup(net, set, key, ext);
+	read_unlock_bh(&priv->lock);
+
+	return ret;
+}
+
 static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			       struct nft_rbtree_elem *new,
 			       struct nft_set_ext **ext)
@@ -144,7 +159,9 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 	int err;
 
 	write_lock_bh(&priv->lock);
+	write_seqcount_begin(&priv->count);
 	err = __nft_rbtree_insert(net, set, rbe, ext);
+	write_seqcount_end(&priv->count);
 	write_unlock_bh(&priv->lock);
 
 	return err;
@@ -158,7 +175,9 @@ static void nft_rbtree_remove(const struct net *net,
 	struct nft_rbtree_elem *rbe = elem->priv;
 
 	write_lock_bh(&priv->lock);
+	write_seqcount_begin(&priv->count);
 	rb_erase(&rbe->node, &priv->root);
+	write_seqcount_end(&priv->count);
 	write_unlock_bh(&priv->lock);
 }
 
@@ -264,6 +283,7 @@ static int nft_rbtree_init(const struct nft_set *set,
 	struct nft_rbtree *priv = nft_set_priv(set);
 
 	rwlock_init(&priv->lock);
+	seqcount_init(&priv->count);
 	priv->root = RB_ROOT;
 	return 0;
 }
-- 
2.13.0



* Re: [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases
  2017-07-26  0:09 [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases Florian Westphal
@ 2017-07-26 10:34 ` Eric Dumazet
  2017-07-26 11:04   ` Florian Westphal
  2017-07-26 11:15 ` Pablo Neira Ayuso
  1 sibling, 1 reply; 5+ messages in thread
From: Eric Dumazet @ 2017-07-26 10:34 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel

On Wed, 2017-07-26 at 02:09 +0200, Florian Westphal wrote:
> Switch to a lockless lookup. The write side now also increments a
> sequence counter.  On lookup, sample the counter value and only take
> the lock if we did not find a match and the counter has changed.
> 
> This avoids the need to write to the private area in the normal
> (lookup) case.
> 
> Note that we use the non-blocking variant (raw_seqcount_begin), i.e.
> the read side will not wait for a writer to finish.
> 
> If we did not find a result, we fall back to taking the read lock.
> 
> The read lock is also used during dumps to ensure a consistent tree
> walk.
> 
> A similar technique (rbtree + seqlock) was used by David Howells in
> rxrpc.


Please note that in commit b145425f269a17ed344d737f746b844dfac60c82
("inetpeer: remove AVL implementation in favor of RB tree")

I chose to also pass the sequence so that the lookup could abort.

I am not sure whether, during rb tree write operations, some nodes could
be left in some kind of loop.
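
A minimal sketch of that idea, using made-up foo_* names rather than the
inetpeer code: the sampled sequence is passed into the walk so the reader
can bail out as soon as a writer may have touched the tree (e.g. left a
transient cycle during a rotation), instead of only noticing after the
lookup has failed.

#include <linux/rbtree.h>
#include <linux/rcupdate.h>
#include <linux/seqlock.h>
#include <linux/types.h>

struct foo_node {
	struct rb_node	node;
	u32		key;
};

/* Returns NULL both on "not found" and on "writer interfered"; the
 * caller re-checks the seqcount and retries under the read lock in
 * the latter case.
 */
static struct foo_node *foo_lookup_rcu(struct rb_root *root,
				       const seqcount_t *count,
				       unsigned int seq, u32 key)
{
	struct rb_node *parent = rcu_dereference_raw(root->rb_node);

	while (parent) {
		struct foo_node *cur = rb_entry(parent, struct foo_node, node);

		/* abort mid-walk once a writer is detected */
		if (read_seqcount_retry(count, seq))
			return NULL;

		if (key < cur->key)
			parent = rcu_dereference_raw(parent->rb_left);
		else if (key > cur->key)
			parent = rcu_dereference_raw(parent->rb_right);
		else
			return cur;
	}
	return NULL;
}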


* Re: [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases
  2017-07-26 10:34 ` Eric Dumazet
@ 2017-07-26 11:04   ` Florian Westphal
  0 siblings, 0 replies; 5+ messages in thread
From: Florian Westphal @ 2017-07-26 11:04 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Florian Westphal, netfilter-devel

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2017-07-26 at 02:09 +0200, Florian Westphal wrote:
> > Switch to a lockless lookup. The write side now also increments a
> > sequence counter.  On lookup, sample the counter value and only take
> > the lock if we did not find a match and the counter has changed.
> > 
> > This avoids the need to write to the private area in the normal
> > (lookup) case.
> > 
> > Note that we use the non-blocking variant (raw_seqcount_begin), i.e.
> > the read side will not wait for a writer to finish.
> > 
> > If we did not find a result, we fall back to taking the read lock.
> > 
> > The read lock is also used during dumps to ensure a consistent tree
> > walk.
> > 
> > A similar technique (rbtree + seqlock) was used by David Howells in
> > rxrpc.
> 
> Please note that in commit b145425f269a17ed344d737f746b844dfac60c82
> ("inetpeer: remove AVL implementation in favor of RB tree")
> 
> I chose to also pass the sequence so that the lookup could abort.
> I am not sure whether, during rb tree write operations, some nodes could
> be left in some kind of loop.

I see.

Ok, I will spin a v2 and will pass the sequence too, thanks Eric.

If we have to abort on seqretry anyway, then I can also use
read_seqcount_begin to force readers to wait until the writer is done,
so I will change that as well.
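
Roughly, the v2 read side would then look like this (just a sketch, not
the final patch; it assumes __nft_rbtree_lookup() gains a seq argument
and checks read_seqcount_retry() inside its walk):

static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
			      const u32 *key, const struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq;
	bool ret;

	/* unlike raw_seqcount_begin(), this waits for an in-flight
	 * writer to finish before sampling the counter
	 */
	seq = read_seqcount_begin(&priv->count);

	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return ret;

	/* slow path: writers are excluded by the rwlock, so the
	 * re-sampled counter stays stable during this second walk
	 */
	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	read_unlock_bh(&priv->lock);

	return ret;
}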


* Re: [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases
  2017-07-26  0:09 [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases Florian Westphal
  2017-07-26 10:34 ` Eric Dumazet
@ 2017-07-26 11:15 ` Pablo Neira Ayuso
  2017-07-26 11:25   ` Pablo Neira Ayuso
  1 sibling, 1 reply; 5+ messages in thread
From: Pablo Neira Ayuso @ 2017-07-26 11:15 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel

On Wed, Jul 26, 2017 at 02:09:41AM +0200, Florian Westphal wrote:
[...]
> @@ -144,7 +159,9 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
>  	int err;
>  
>  	write_lock_bh(&priv->lock);
> +	write_seqcount_begin(&priv->count);
>  	err = __nft_rbtree_insert(net, set, rbe, ext);
> +	write_seqcount_end(&priv->count);
>  	write_unlock_bh(&priv->lock);
>  
>  	return err;
> @@ -158,7 +175,9 @@ static void nft_rbtree_remove(const struct net *net,
>  	struct nft_rbtree_elem *rbe = elem->priv;
>  
>  	write_lock_bh(&priv->lock);

Do we need the spinlock anymore? This is protected by the mutex from
userspace, and we have support for neither timeouts nor dynamic set
population from the packet path yet.

> +	write_seqcount_begin(&priv->count);
>  	rb_erase(&rbe->node, &priv->root);
> +	write_seqcount_end(&priv->count);
>  	write_unlock_bh(&priv->lock);
>  }
>  


* Re: [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases
  2017-07-26 11:15 ` Pablo Neira Ayuso
@ 2017-07-26 11:25   ` Pablo Neira Ayuso
  0 siblings, 0 replies; 5+ messages in thread
From: Pablo Neira Ayuso @ 2017-07-26 11:25 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel

On Wed, Jul 26, 2017 at 01:15:06PM +0200, Pablo Neira Ayuso wrote:
> On Wed, Jul 26, 2017 at 02:09:41AM +0200, Florian Westphal wrote:
> [...]
> > @@ -144,7 +159,9 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
> >  	int err;
> >  
> >  	write_lock_bh(&priv->lock);
> > +	write_seqcount_begin(&priv->count);
> >  	err = __nft_rbtree_insert(net, set, rbe, ext);
> > +	write_seqcount_end(&priv->count);
> >  	write_unlock_bh(&priv->lock);
> >  
> >  	return err;
> > @@ -158,7 +175,9 @@ static void nft_rbtree_remove(const struct net *net,
> >  	struct nft_rbtree_elem *rbe = elem->priv;
> >  
> >  	write_lock_bh(&priv->lock);
> 
> Do we need the spinlock anymore? This is protected by the mutex from
> userspace, and we have support for neither timeouts nor dynamic set
> population from the packet path yet.

Forget this, I overlooked that we have a slow path in case the seq mismatches.


