* [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases
@ 2017-07-26 0:09 Florian Westphal
2017-07-26 10:34 ` Eric Dumazet
2017-07-26 11:15 ` Pablo Neira Ayuso
0 siblings, 2 replies; 5+ messages in thread
From: Florian Westphal @ 2017-07-26 0:09 UTC (permalink / raw)
To: netfilter-devel; +Cc: Florian Westphal
Switch to a lockless lookup. The write side now also increments a sequence
counter. On lookup, sample the counter value and only take the lock if we
did not find a match and the counter has changed.

This avoids the need to write to the set's private area in the normal
(lookup) case.

Note that we use the non-blocking variant (raw_seqcount_begin), i.e. the
read side will not wait for a writer to finish.

If we did not find a result, we fall back to taking the read lock.

The read lock is also used during dumps to ensure we get a consistent
tree walk.

A similar technique (rbtree+seqlock) was used by David Howells in rxrpc.
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/nft_set_rbtree.c | 42 +++++++++++++++++++++++++++++++-----------
1 file changed, 31 insertions(+), 11 deletions(-)
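For reference, the lookup fast path below reduces to the following pattern
(a condensed sketch of nft_rbtree_lookup() from the diff; search() is a
hypothetical stand-in for __nft_rbtree_lookup()):

        unsigned int seq;
        bool ret;

        /* Sample the counter without waiting for a concurrent writer. */
        seq = raw_seqcount_begin(&priv->count);

        ret = search(net, set, key, ext);
        if (ret || !read_seqcount_retry(&priv->count, seq))
                return ret;     /* hit, or no writer interfered */

        /* Miss while a writer was active: retry under the read lock. */
        read_lock_bh(&priv->lock);
        ret = search(net, set, key, ext);
        read_unlock_bh(&priv->lock);
        return ret;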
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index bce5382f1d49..dee75fe8d862 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -19,8 +19,9 @@
#include <net/netfilter/nf_tables.h>
struct nft_rbtree {
- rwlock_t lock;
struct rb_root root;
+ rwlock_t lock;
+ seqcount_t count;
};
struct nft_rbtree_elem {
@@ -40,8 +41,8 @@ static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
}
-static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
{
struct nft_rbtree *priv = nft_set_priv(set);
const struct nft_rbtree_elem *rbe, *interval = NULL;
@@ -50,15 +51,14 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
const void *this;
int d;
- read_lock_bh(&priv->lock);
- parent = priv->root.rb_node;
+ parent = rcu_dereference_raw(priv->root.rb_node);
while (parent != NULL) {
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
this = nft_set_ext_key(&rbe->ext);
d = memcmp(this, key, set->klen);
if (d < 0) {
- parent = parent->rb_left;
+ parent = rcu_dereference_raw(parent->rb_left);
if (interval &&
nft_rbtree_equal(set, this, interval) &&
nft_rbtree_interval_end(this) &&
@@ -66,15 +66,14 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
continue;
interval = rbe;
} else if (d > 0)
- parent = parent->rb_right;
+ parent = rcu_dereference_raw(parent->rb_right);
else {
if (!nft_set_elem_active(&rbe->ext, genmask)) {
- parent = parent->rb_left;
+ parent = rcu_dereference_raw(parent->rb_left);
continue;
}
if (nft_rbtree_interval_end(rbe))
goto out;
- read_unlock_bh(&priv->lock);
*ext = &rbe->ext;
return true;
@@ -84,15 +83,31 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
nft_set_elem_active(&interval->ext, genmask) &&
!nft_rbtree_interval_end(interval)) {
- read_unlock_bh(&priv->lock);
*ext = &interval->ext;
return true;
}
out:
- read_unlock_bh(&priv->lock);
return false;
}
+static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ unsigned int seq = raw_seqcount_begin(&priv->count);
+ bool ret;
+
+ ret = __nft_rbtree_lookup(net, set, key, ext);
+ if (ret || !read_seqcount_retry(&priv->count, seq))
+ return ret;
+
+ read_lock_bh(&priv->lock);
+ ret = __nft_rbtree_lookup(net, set, key, ext);
+ read_unlock_bh(&priv->lock);
+
+ return ret;
+}
+
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
struct nft_rbtree_elem *new,
struct nft_set_ext **ext)
@@ -144,7 +159,9 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
int err;
write_lock_bh(&priv->lock);
+ write_seqcount_begin(&priv->count);
err = __nft_rbtree_insert(net, set, rbe, ext);
+ write_seqcount_end(&priv->count);
write_unlock_bh(&priv->lock);
return err;
@@ -158,7 +175,9 @@ static void nft_rbtree_remove(const struct net *net,
struct nft_rbtree_elem *rbe = elem->priv;
write_lock_bh(&priv->lock);
+ write_seqcount_begin(&priv->count);
rb_erase(&rbe->node, &priv->root);
+ write_seqcount_end(&priv->count);
write_unlock_bh(&priv->lock);
}
@@ -264,6 +283,7 @@ static int nft_rbtree_init(const struct nft_set *set,
struct nft_rbtree *priv = nft_set_priv(set);
rwlock_init(&priv->lock);
+ seqcount_init(&priv->count);
priv->root = RB_ROOT;
return 0;
}
--
2.13.0
* Re: [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases
2017-07-26 0:09 [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases Florian Westphal
@ 2017-07-26 10:34 ` Eric Dumazet
2017-07-26 11:04 ` Florian Westphal
2017-07-26 11:15 ` Pablo Neira Ayuso
1 sibling, 1 reply; 5+ messages in thread
From: Eric Dumazet @ 2017-07-26 10:34 UTC (permalink / raw)
To: Florian Westphal; +Cc: netfilter-devel
On Wed, 2017-07-26 at 02:09 +0200, Florian Westphal wrote:
> Switch to a lockless lookup. The write side now also increments a sequence
> counter. On lookup, sample the counter value and only take the lock if we
> did not find a match and the counter has changed.
>
> This avoids the need to write to the set's private area in the normal
> (lookup) case.
>
> Note that we use the non-blocking variant (raw_seqcount_begin), i.e. the
> read side will not wait for a writer to finish.
>
> If we did not find a result, we fall back to taking the read lock.
>
> The read lock is also used during dumps to ensure we get a consistent
> tree walk.
>
> A similar technique (rbtree+seqlock) was used by David Howells in rxrpc.
Please note that in commit b145425f269a17ed344d737f746b844dfac60c82
("inetpeer: remove AVL implementation in favor of RB tree")
I chose to also pass the sequence so that the lookup could abort.
I am not sure whether, during rb tree write operations, some nodes could
be left with some kind of loop.
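For context, the inetpeer lookup referenced here aborts its unlocked walk
roughly as follows (a condensed sketch of lookup() from that commit;
declarations, refcounting and GC handling omitted):

        /* Walk the tree without locks; bail out as soon as a concurrent
         * writer bumps the sequence, since a rotation in progress could
         * leave a transient loop that would trap the reader.
         */
        while (*pp) {
                parent = rcu_dereference_raw(*pp);
                p = rb_entry(parent, struct inet_peer, rb_node);
                cmp = inetpeer_addr_cmp(daddr, &p->daddr);
                if (cmp == 0)
                        return p;
                if (unlikely(read_seqretry(&base->lock, seq)))
                        break;  /* caller falls back to the locked path */
                pp = (cmp == -1) ? &parent->rb_left : &parent->rb_right;
        }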
* Re: [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases
2017-07-26 10:34 ` Eric Dumazet
@ 2017-07-26 11:04 ` Florian Westphal
0 siblings, 0 replies; 5+ messages in thread
From: Florian Westphal @ 2017-07-26 11:04 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Florian Westphal, netfilter-devel
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2017-07-26 at 02:09 +0200, Florian Westphal wrote:
> > Switch to a lockless lookup. The write side now also increments a sequence
> > counter. On lookup, sample the counter value and only take the lock if we
> > did not find a match and the counter has changed.
> >
> > This avoids the need to write to the set's private area in the normal
> > (lookup) case.
> >
> > Note that we use the non-blocking variant (raw_seqcount_begin), i.e. the
> > read side will not wait for a writer to finish.
> >
> > If we did not find a result, we fall back to taking the read lock.
> >
> > The read lock is also used during dumps to ensure we get a consistent
> > tree walk.
> >
> > A similar technique (rbtree+seqlock) was used by David Howells in rxrpc.
>
> Please note that in commit b145425f269a17ed344d737f746b844dfac60c82
> ("inetpeer: remove AVL implementation in favor of RB tree")
>
> I chose to also pass the sequence so that the lookup could abort.
> I am not sure whether, during rb tree write operations, some nodes could
> be left with some kind of loop.
I see.

Ok, I will spin a v2 and pass the sequence too, thanks Eric.

If we have to abort on seqretry anyway, then I can also use
read_seqcount_begin to force readers to wait until the writer is done, so
I will change that as well.
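For readers following the thread, the difference between the two read-side
primitives is roughly this (a sketch, not part of the patch):

        /* read_seqcount_begin() spins while the sequence is odd, i.e.
         * the reader waits for an in-flight writer to finish before
         * starting the walk:
         */
        seq = read_seqcount_begin(&priv->count);

        /* raw_seqcount_begin() never waits: it masks off the low bit,
         * so if a writer was active when we sampled, the closing
         * read_seqcount_retry() is guaranteed to fail and we take the
         * locked slow path instead:
         */
        seq = raw_seqcount_begin(&priv->count);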
* Re: [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases
2017-07-26 0:09 [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases Florian Westphal
2017-07-26 10:34 ` Eric Dumazet
@ 2017-07-26 11:15 ` Pablo Neira Ayuso
2017-07-26 11:25 ` Pablo Neira Ayuso
1 sibling, 1 reply; 5+ messages in thread
From: Pablo Neira Ayuso @ 2017-07-26 11:15 UTC (permalink / raw)
To: Florian Westphal; +Cc: netfilter-devel
On Wed, Jul 26, 2017 at 02:09:41AM +0200, Florian Westphal wrote:
[...]
> @@ -144,7 +159,9 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
> int err;
>
> write_lock_bh(&priv->lock);
> + write_seqcount_begin(&priv->count);
> err = __nft_rbtree_insert(net, set, rbe, ext);
> + write_seqcount_end(&priv->count);
> write_unlock_bh(&priv->lock);
>
> return err;
> @@ -158,7 +175,9 @@ static void nft_rbtree_remove(const struct net *net,
> struct nft_rbtree_elem *rbe = elem->priv;
>
> write_lock_bh(&priv->lock);
Do we still need the rwlock? This is protected by the mutex from
userspace, and we have no support for either timeouts or dynamic set
population from the packet path yet.
> + write_seqcount_begin(&priv->count);
> rb_erase(&rbe->node, &priv->root);
> + write_seqcount_end(&priv->count);
> write_unlock_bh(&priv->lock);
> }
>
* Re: [PATCH nf-next] netfilter: nft_set_rbtree: use seqcount to avoid lock in most cases
2017-07-26 11:15 ` Pablo Neira Ayuso
@ 2017-07-26 11:25 ` Pablo Neira Ayuso
0 siblings, 0 replies; 5+ messages in thread
From: Pablo Neira Ayuso @ 2017-07-26 11:25 UTC (permalink / raw)
To: Florian Westphal; +Cc: netfilter-devel
On Wed, Jul 26, 2017 at 01:15:06PM +0200, Pablo Neira Ayuso wrote:
> On Wed, Jul 26, 2017 at 02:09:41AM +0200, Florian Westphal wrote:
> [...]
> > @@ -144,7 +159,9 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
> > int err;
> >
> > write_lock_bh(&priv->lock);
> > + write_seqcount_begin(&priv->count);
> > err = __nft_rbtree_insert(net, set, rbe, ext);
> > + write_seqcount_end(&priv->count);
> > write_unlock_bh(&priv->lock);
> >
> > return err;
> > @@ -158,7 +175,9 @@ static void nft_rbtree_remove(const struct net *net,
> > struct nft_rbtree_elem *rbe = elem->priv;
> >
> > write_lock_bh(&priv->lock);
>
> Do we still need the rwlock? This is protected by the mutex from
> userspace, and we have no support for either timeouts or dynamic set
> population from the packet path yet.
Forget this, I overlooked that we have a slow path in case the seq
mismatches.