* [PATCH 1/5] netfilter: nf_tables: fix suspicious RCU usage in nft_chain_stats_replace()
2018-12-13 1:06 [PATCH 0/5] Netfilter fixes for net Pablo Neira Ayuso
@ 2018-12-13 1:06 ` Pablo Neira Ayuso
2018-12-13 1:06 ` [PATCH 2/5] netfilter: seqadj: re-load tcp header pointer after possible head reallocation Pablo Neira Ayuso
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Pablo Neira Ayuso @ 2018-12-13 1:06 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
From: Taehee Yoo <ap420073@gmail.com>
basechain->stats is rcu protected data which is updated from
nft_chain_stats_replace(). This function is executed from the commit
phase which holds the pernet nf_tables commit mutex - not the global
nfnetlink subsystem mutex.
Test commands to reproduce the problem are:
%iptables-nft -I INPUT
%iptables-nft -Z
%iptables-nft -Z
This patch uses RCU calls to handle basechain->stats updates to fix a
splat that looks like:
[89279.358755] =============================
[89279.363656] WARNING: suspicious RCU usage
[89279.368458] 4.20.0-rc2+ #44 Tainted: G W L
[89279.374661] -----------------------------
[89279.379542] net/netfilter/nf_tables_api.c:1404 suspicious rcu_dereference_protected() usage!
[...]
[89279.406556] 1 lock held by iptables-nft/5225:
[89279.411728] #0: 00000000bf45a000 (&net->nft.commit_mutex){+.+.}, at: nf_tables_valid_genid+0x1f/0x70 [nf_tables]
[89279.424022] stack backtrace:
[89279.429236] CPU: 0 PID: 5225 Comm: iptables-nft Tainted: G W L 4.20.0-rc2+ #44
[89279.430135] Call Trace:
[89279.430135] dump_stack+0xc9/0x16b
[89279.430135] ? show_regs_print_info+0x5/0x5
[89279.430135] ? lockdep_rcu_suspicious+0x117/0x160
[89279.430135] nft_chain_commit_update+0x4ea/0x640 [nf_tables]
[89279.430135] ? sched_clock_local+0xd4/0x140
[89279.430135] ? check_flags.part.35+0x440/0x440
[89279.430135] ? __rhashtable_remove_fast.constprop.67+0xec0/0xec0 [nf_tables]
[89279.430135] ? sched_clock_cpu+0x126/0x170
[89279.430135] ? find_held_lock+0x39/0x1c0
[89279.430135] ? hlock_class+0x140/0x140
[89279.430135] ? is_bpf_text_address+0x5/0xf0
[89279.430135] ? check_flags.part.35+0x440/0x440
[89279.430135] ? __lock_is_held+0xb4/0x140
[89279.430135] nf_tables_commit+0x2555/0x39c0 [nf_tables]
Fixes: f102d66b335a4 ("netfilter: nf_tables: use dedicated mutex to guard transactions")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/linux/netfilter/nfnetlink.h | 12 ------------
net/netfilter/nf_tables_api.c | 21 +++++++++++++--------
net/netfilter/nf_tables_core.c | 2 +-
3 files changed, 14 insertions(+), 21 deletions(-)
diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 4a520d3304a2..cf09ab37b45b 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -62,18 +62,6 @@ static inline bool lockdep_nfnl_is_held(__u8 subsys_id)
}
#endif /* CONFIG_PROVE_LOCKING */
-/*
- * nfnl_dereference - fetch RCU pointer when updates are prevented by subsys mutex
- *
- * @p: The pointer to read, prior to dereferencing
- * @ss: The nfnetlink subsystem ID
- *
- * Return the value of the specified RCU-protected pointer, but omit
- * the READ_ONCE(), because caller holds the NFNL subsystem mutex.
- */
-#define nfnl_dereference(p, ss) \
- rcu_dereference_protected(p, lockdep_nfnl_is_held(ss))
-
#define MODULE_ALIAS_NFNL_SUBSYS(subsys) \
MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys))
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 2e61aab6ed73..6e548d7c9f67 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1216,7 +1216,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name))
goto nla_put_failure;
- if (basechain->stats && nft_dump_stats(skb, basechain->stats))
+ if (rcu_access_pointer(basechain->stats) &&
+ nft_dump_stats(skb, rcu_dereference(basechain->stats)))
goto nla_put_failure;
}
@@ -1392,7 +1393,8 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
return newstats;
}
-static void nft_chain_stats_replace(struct nft_base_chain *chain,
+static void nft_chain_stats_replace(struct net *net,
+ struct nft_base_chain *chain,
struct nft_stats __percpu *newstats)
{
struct nft_stats __percpu *oldstats;
@@ -1400,8 +1402,9 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain,
if (newstats == NULL)
return;
- if (chain->stats) {
- oldstats = nfnl_dereference(chain->stats, NFNL_SUBSYS_NFTABLES);
+ if (rcu_access_pointer(chain->stats)) {
+ oldstats = rcu_dereference_protected(chain->stats,
+ lockdep_commit_lock_is_held(net));
rcu_assign_pointer(chain->stats, newstats);
synchronize_rcu();
free_percpu(oldstats);
@@ -1439,9 +1442,10 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
struct nft_base_chain *basechain = nft_base_chain(chain);
module_put(basechain->type->owner);
- free_percpu(basechain->stats);
- if (basechain->stats)
+ if (rcu_access_pointer(basechain->stats)) {
static_branch_dec(&nft_counters_enabled);
+ free_percpu(rcu_dereference_raw(basechain->stats));
+ }
kfree(chain->name);
kfree(basechain);
} else {
@@ -1590,7 +1594,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
kfree(basechain);
return PTR_ERR(stats);
}
- basechain->stats = stats;
+ rcu_assign_pointer(basechain->stats, stats);
static_branch_inc(&nft_counters_enabled);
}
@@ -6180,7 +6184,8 @@ static void nft_chain_commit_update(struct nft_trans *trans)
return;
basechain = nft_base_chain(trans->ctx.chain);
- nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans));
+ nft_chain_stats_replace(trans->ctx.net, basechain,
+ nft_trans_chain_stats(trans));
switch (nft_trans_chain_policy(trans)) {
case NF_DROP:
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 3fbce3b9c5ec..a50500232b0a 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -101,7 +101,7 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
struct nft_stats *stats;
base_chain = nft_base_chain(chain);
- if (!base_chain->stats)
+ if (!rcu_access_pointer(base_chain->stats))
return;
local_bh_disable();
--
2.11.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 2/5] netfilter: seqadj: re-load tcp header pointer after possible head reallocation
2018-12-13 1:06 [PATCH 0/5] Netfilter fixes for net Pablo Neira Ayuso
2018-12-13 1:06 ` [PATCH 1/5] netfilter: nf_tables: fix suspicious RCU usage in nft_chain_stats_replace() Pablo Neira Ayuso
@ 2018-12-13 1:06 ` Pablo Neira Ayuso
2018-12-13 1:06 ` [PATCH 3/5] netfilter: ipset: do not call ipset_nest_end after nla_nest_cancel Pablo Neira Ayuso
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Pablo Neira Ayuso @ 2018-12-13 1:06 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
From: Florian Westphal <fw@strlen.de>
When adjusting sack block sequence numbers, skb_make_writable() gets
called to make sure tcp options are all in the linear area, and buffer
is not shared.
This can cause tcp header pointer to get reallocated, so we must
reaload it to avoid memory corruption.
This bug pre-dates git history.
Reported-by: Neel Mehta <nmehta@google.com>
Reported-by: Shane Huntley <shuntley@google.com>
Reported-by: Heather Adkins <argv@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_conntrack_seqadj.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index a975efd6b8c3..9da303461069 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -115,12 +115,12 @@ static void nf_ct_sack_block_adjust(struct sk_buff *skb,
/* TCP SACK sequence number adjustment */
static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
unsigned int protoff,
- struct tcphdr *tcph,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo)
{
- unsigned int dir, optoff, optend;
+ struct tcphdr *tcph = (void *)skb->data + protoff;
struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ unsigned int dir, optoff, optend;
optoff = protoff + sizeof(struct tcphdr);
optend = protoff + tcph->doff * 4;
@@ -128,6 +128,7 @@ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
if (!skb_make_writable(skb, optend))
return 0;
+ tcph = (void *)skb->data + protoff;
dir = CTINFO2DIR(ctinfo);
while (optoff < optend) {
@@ -207,7 +208,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
ntohl(newack));
tcph->ack_seq = newack;
- res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo);
+ res = nf_ct_sack_adjust(skb, protoff, ct, ctinfo);
out:
spin_unlock_bh(&ct->lock);
--
2.11.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 3/5] netfilter: ipset: do not call ipset_nest_end after nla_nest_cancel
2018-12-13 1:06 [PATCH 0/5] Netfilter fixes for net Pablo Neira Ayuso
2018-12-13 1:06 ` [PATCH 1/5] netfilter: nf_tables: fix suspicious RCU usage in nft_chain_stats_replace() Pablo Neira Ayuso
2018-12-13 1:06 ` [PATCH 2/5] netfilter: seqadj: re-load tcp header pointer after possible head reallocation Pablo Neira Ayuso
@ 2018-12-13 1:06 ` Pablo Neira Ayuso
2018-12-13 1:06 ` [PATCH 4/5] netfilter: nat: can't use dst_hold on noref dst Pablo Neira Ayuso
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Pablo Neira Ayuso @ 2018-12-13 1:06 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
From: Pan Bian <bianpan2016@163.com>
In the error handling block, nla_nest_cancel(skb, atd) is called to
cancel the nest operation. But then, ipset_nest_end(skb, atd) is
unexpected called to end the nest operation. This patch calls the
ipset_nest_end only on the branch that nla_nest_cancel is not called.
Fixes: 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel")
Signed-off-by: Pan Bian <bianpan2016@163.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/ipset/ip_set_list_set.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 4eef55da0878..8da228da53ae 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -531,8 +531,8 @@ list_set_list(const struct ip_set *set,
ret = -EMSGSIZE;
} else {
cb->args[IPSET_CB_ARG0] = i;
+ ipset_nest_end(skb, atd);
}
- ipset_nest_end(skb, atd);
out:
rcu_read_unlock();
return ret;
--
2.11.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 4/5] netfilter: nat: can't use dst_hold on noref dst
2018-12-13 1:06 [PATCH 0/5] Netfilter fixes for net Pablo Neira Ayuso
` (2 preceding siblings ...)
2018-12-13 1:06 ` [PATCH 3/5] netfilter: ipset: do not call ipset_nest_end after nla_nest_cancel Pablo Neira Ayuso
@ 2018-12-13 1:06 ` Pablo Neira Ayuso
2018-12-13 1:06 ` [PATCH 5/5] netfilter: nf_conncount: use rb_link_node_rcu() instead of rb_link_node() Pablo Neira Ayuso
2018-12-13 5:37 ` [PATCH 0/5] Netfilter fixes for net David Miller
5 siblings, 0 replies; 7+ messages in thread
From: Pablo Neira Ayuso @ 2018-12-13 1:06 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
From: Florian Westphal <fw@strlen.de>
The dst entry might already have a zero refcount, waiting on rcu list
to be free'd. Using dst_hold() transitions its reference count to 1, and
next dst release will try to free it again -- resulting in a double free:
WARNING: CPU: 1 PID: 0 at include/net/dst.h:239 nf_xfrm_me_harder+0xe7/0x130 [nf_nat]
RIP: 0010:nf_xfrm_me_harder+0xe7/0x130 [nf_nat]
Code: 48 8b 5c 24 60 65 48 33 1c 25 28 00 00 00 75 53 48 83 c4 68 5b 5d 41 5c c3 85 c0 74 0d 8d 48 01 f0 0f b1 0a 74 86 85 c0 75 f3 <0f> 0b e9 7b ff ff ff 29 c6 31 d2 b9 20 00 48 00 4c 89 e7 e8 31 27
Call Trace:
nf_nat_ipv4_out+0x78/0x90 [nf_nat_ipv4]
nf_hook_slow+0x36/0xd0
ip_output+0x9f/0xd0
ip_forward+0x328/0x440
ip_rcv+0x8a/0xb0
Use dst_hold_safe instead and bail out if we cannot take a reference.
Fixes: a4c2fd7f7891 ("net: remove DST_NOCACHE flag")
Reported-by: Martin Zaharinov <micron10@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_nat_core.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index e2b196054dfc..2268b10a9dcf 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -117,7 +117,8 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
dst = skb_dst(skb);
if (dst->xfrm)
dst = ((struct xfrm_dst *)dst)->route;
- dst_hold(dst);
+ if (!dst_hold_safe(dst))
+ return -EHOSTUNREACH;
if (sk && !net_eq(net, sock_net(sk)))
sk = NULL;
--
2.11.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 5/5] netfilter: nf_conncount: use rb_link_node_rcu() instead of rb_link_node()
2018-12-13 1:06 [PATCH 0/5] Netfilter fixes for net Pablo Neira Ayuso
` (3 preceding siblings ...)
2018-12-13 1:06 ` [PATCH 4/5] netfilter: nat: can't use dst_hold on noref dst Pablo Neira Ayuso
@ 2018-12-13 1:06 ` Pablo Neira Ayuso
2018-12-13 5:37 ` [PATCH 0/5] Netfilter fixes for net David Miller
5 siblings, 0 replies; 7+ messages in thread
From: Pablo Neira Ayuso @ 2018-12-13 1:06 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
From: Taehee Yoo <ap420073@gmail.com>
rbnode in insert_tree() is rcu protected pointer.
So, in order to handle this pointer, _rcu function should be used.
rb_link_node_rcu() is a rcu version of rb_link_node().
Fixes: 34848d5c896e ("netfilter: nf_conncount: Split insert and traversal")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_conncount.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index b6d0f6deea86..9cd180bda092 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -427,7 +427,7 @@ insert_tree(struct net *net,
count = 1;
rbconn->list.count = count;
- rb_link_node(&rbconn->node, parent, rbnode);
+ rb_link_node_rcu(&rbconn->node, parent, rbnode);
rb_insert_color(&rbconn->node, root);
out_unlock:
spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
--
2.11.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH 0/5] Netfilter fixes for net
2018-12-13 1:06 [PATCH 0/5] Netfilter fixes for net Pablo Neira Ayuso
` (4 preceding siblings ...)
2018-12-13 1:06 ` [PATCH 5/5] netfilter: nf_conncount: use rb_link_node_rcu() instead of rb_link_node() Pablo Neira Ayuso
@ 2018-12-13 5:37 ` David Miller
5 siblings, 0 replies; 7+ messages in thread
From: David Miller @ 2018-12-13 5:37 UTC (permalink / raw)
Cc: netfilter-devel, netdev
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 13 Dec 2018 02:06:26 +0100
> The following patchset contains Netfilter fixes for net:
>
> 1) Fix warnings suspicious rcu usage when handling base chain
> statistics, from Taehee Yoo.
>
> 2) Refetch pointer to tcp header from nf_ct_sack_adjust() since
> skb_make_writable() may reallocate data area, reported by Google
> folks patch from Florian.
>
> 3) Incorrect netlink nest end after previous cancellation from error
> path in ipset, from Pan Bian.
>
> 4) Use dst_hold_safe() from nf_xfrm_me_harder(), from Florian.
>
> 5) Use rb_link_node_rcu() for rcu-protected rbtree node in
> nf_conncount, from Taehee Yoo.
>
> You can pull these changes from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git
Pulled, thanks.
^ permalink raw reply [flat|nested] 7+ messages in thread