All of lore.kernel.org
 help / color / mirror / Atom feed
* cgroup matches in INPUT chain
@ 2015-03-19 18:41 Daniel Mack
  2015-03-19 18:58 ` Florian Westphal
  0 siblings, 1 reply; 8+ messages in thread
From: Daniel Mack @ 2015-03-19 18:41 UTC (permalink / raw)
  To: Daniel Borkmann, Alexey Perevalov; +Cc: Pablo Neira Ayuso, netdev

[-- Attachment #1: Type: text/plain, Size: 1281 bytes --]

Hi,

I'm currently looking into the netclass CGroup controller and its
netfilter module in order to build a per-application firewall with it.
I'm having trouble understanding the commit log of a00e76349f35
("netfilter: x_tables: allow to use cgroup match for LOCAL_IN nf
hooks"), especially the following paragraph:

> It's possible to get classified sk_buff after PREROUTING, due to
> socket lookup being done in early_demux (tcp_v4_early_demux). Also
> it works for udp as well.

What is "after PREROUTING" supposed to mean exactly? After all, the
examples in the commit log put the rules into the "INPUT" chain.

In my tests, however, NF_INET_LOCAL_IN is iterated before early_demux()
is called, and for skbs that do not have a socket assigned, the cgroup
match code bails out early, making the rules ineffective. Hence,
NF_INET_LOCAL_IN can't work reliably for these matches IMO, as the
cgroup rules don't apply to at least every first packet in a TCP stream.
Am I missing something?

It would also be possible to do something similar to what the "socket"
module does, and look up a listening socket directly from cgroup_mt() in
case skb->sk == NULL. I've attached a patch that implements that and
which works for me, but I'm not sure if that's a sane way to go.


Thanks,
Daniel

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-netfilter-x_tables-implement-matching-for-skb-sk-NUL.patch --]
[-- Type: text/x-diff; name="0001-netfilter-x_tables-implement-matching-for-skb-sk-NUL.patch", Size: 5676 bytes --]

From 7562d97b6090a36395f0c95b54f5d5ecd6bfc01f Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Fri, 13 Mar 2015 15:48:41 +0100
Subject: [PATCH RFC] netfilter: x_tables: implement cgroup matching for skb->sk == NULL

For skbs which do not have a socket assigned (which is at least the case
for every first packet in a stream), the cgroup matching code currently
bails out early, which makes cgroup rules ineffective. Only subsequently
received packets of the stream (if any) are caught.

In order to use this type of match for a per-application firewall, we
need to make sure to catch all packets, including the first one.

This patch adds code to look up listening TCP and UDP sockets in case
the skb does not have a socket assigned yet. If one is found, the
cgroup match is done against the class ID of the found socket. As this
makes the implementation specific to tcp/udp, the module now has to
implement match functions for both IPv4 and IPv6.

Signed-off-by: Daniel Mack <daniel@zonque.org>
---
 net/netfilter/xt_cgroup.c | 141 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 124 insertions(+), 17 deletions(-)

diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 7198d66..766732f 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -16,7 +16,10 @@
 #include <linux/module.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_cgroup.h>
+#include <net/inet6_hashtables.h>
 #include <net/sock.h>
+#include <net/tcp.h>
+#include <net/udp.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
@@ -35,37 +38,141 @@ static int cgroup_mt_check(const struct xt_mtchk_param *par)
 }
 
 static bool
-cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
+cgroup_mt_ipv4(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_cgroup_info *info = par->matchinfo;
+	struct sock *sk = skb->sk;
+	bool ret;
 
-	if (skb->sk == NULL)
-		return false;
+	if (!sk) {
+		const struct iphdr *iph = ip_hdr(skb);
+		struct udphdr hdr, *hp;
 
-	return (info->id == skb->sk->sk_classid) ^ info->invert;
+		hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(hdr), &hdr);
+		if (!hp)
+			return false;
+
+		switch (iph->protocol) {
+		case IPPROTO_UDP:
+			sk = udp4_lib_lookup(dev_net(skb->dev),
+					     iph->saddr, hp->source,
+					     iph->daddr, hp->dest,
+					     par->in->ifindex);
+			break;
+
+		case IPPROTO_TCP:
+			sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo,
+					   iph->saddr, hp->source,
+					   iph->daddr, hp->dest,
+					   par->in->ifindex);
+			break;
+
+		default:
+			break;
+		}
+
+		if (!sk)
+			return false;
+	}
+
+	ret = (info->id == sk->sk_classid) ^ info->invert;
+
+	if (sk != skb->sk)
+		sock_gen_put(sk);
+
+	return ret;
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static bool
+cgroup_mt_ipv6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_cgroup_info *info = par->matchinfo;
+	struct sock *sk = skb->sk;
+	bool ret;
+
+	if (!sk) {
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+		struct udphdr hdr, *hp;
+		int tproto, thoff = 0;
+
+		tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
+		if (tproto < 0)
+			return false;
+
+		hp = skb_header_pointer(skb, thoff, sizeof(hdr), &hdr);
+		if (!hp)
+			return false;
+
+		switch (tproto) {
+		case IPPROTO_UDP:
+			sk = udp6_lib_lookup(dev_net(skb->dev),
+					     &iph->saddr, hp->source,
+					     &iph->daddr, hp->dest,
+					     par->in->ifindex);
+			break;
+
+		case IPPROTO_TCP:
+			sk = inet6_lookup(dev_net(skb->dev), &tcp_hashinfo,
+					  &iph->saddr, hp->source,
+					  &iph->daddr, hp->dest,
+					  par->in->ifindex);
+			break;
+
+		default:
+			break;
+		}
+
+		if (!sk)
+			return false;
+	}
+
+	ret = (info->id == sk->sk_classid) ^ info->invert;
+
+	if (sk != skb->sk)
+		sock_gen_put(sk);
+
+	return ret;
 }
+#endif
 
-static struct xt_match cgroup_mt_reg __read_mostly = {
-	.name       = "cgroup",
-	.revision   = 0,
-	.family     = NFPROTO_UNSPEC,
-	.checkentry = cgroup_mt_check,
-	.match      = cgroup_mt,
-	.matchsize  = sizeof(struct xt_cgroup_info),
-	.me         = THIS_MODULE,
-	.hooks      = (1 << NF_INET_LOCAL_OUT) |
-		      (1 << NF_INET_POST_ROUTING) |
-		      (1 << NF_INET_LOCAL_IN),
+static struct xt_match cgroup_mt_reg[] __read_mostly = {
+	{
+		.name       = "cgroup",
+		.revision   = 0,
+		.family     = NFPROTO_IPV4,
+		.checkentry = cgroup_mt_check,
+		.match      = cgroup_mt_ipv4,
+		.matchsize  = sizeof(struct xt_cgroup_info),
+		.me         = THIS_MODULE,
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+			      (1 << NF_INET_POST_ROUTING) |
+			      (1 << NF_INET_LOCAL_IN),
+	},
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+	{
+		.name       = "cgroup",
+		.revision   = 0,
+		.family     = NFPROTO_IPV6,
+		.checkentry = cgroup_mt_check,
+		.match      = cgroup_mt_ipv6,
+		.matchsize  = sizeof(struct xt_cgroup_info),
+		.me         = THIS_MODULE,
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+			      (1 << NF_INET_POST_ROUTING) |
+			      (1 << NF_INET_LOCAL_IN),
+	},
+#endif
 };
 
 static int __init cgroup_mt_init(void)
 {
-	return xt_register_match(&cgroup_mt_reg);
+	return xt_register_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg));
 }
 
 static void __exit cgroup_mt_exit(void)
 {
-	xt_unregister_match(&cgroup_mt_reg);
+	xt_unregister_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg));
 }
 
 module_init(cgroup_mt_init);
-- 
2.3.2


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: cgroup matches in INPUT chain
  2015-03-19 18:41 cgroup matches in INPUT chain Daniel Mack
@ 2015-03-19 18:58 ` Florian Westphal
  2015-03-20 13:57   ` Daniel Mack
  0 siblings, 1 reply; 8+ messages in thread
From: Florian Westphal @ 2015-03-19 18:58 UTC (permalink / raw)
  To: Daniel Mack; +Cc: Daniel Borkmann, Alexey Perevalov, Pablo Neira Ayuso, netdev

Daniel Mack <daniel@zonque.org> wrote:
> In my tests, however, NF_INET_LOCAL_IN is iterated before early_demux()
> is called,

Early demux occurs after PRE_ROUTING but before LOCAL_IN.

Otherwise edemux would make little sense since it's used to avoid the
routing lookup that decides whether the skb has to be locally delivered
or forwarded. IOW, in NF_INET_LOCAL_IN we've already decided on local
delivery and would not need an 'early' socket lookup any more.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: cgroup matches in INPUT chain
  2015-03-19 18:58 ` Florian Westphal
@ 2015-03-20 13:57   ` Daniel Mack
  2015-03-20 16:11     ` Florian Westphal
  0 siblings, 1 reply; 8+ messages in thread
From: Daniel Mack @ 2015-03-20 13:57 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Daniel Borkmann, Alexey Perevalov, Pablo Neira Ayuso, netdev

Hi,

On 03/19/2015 07:58 PM, Florian Westphal wrote:
> Daniel Mack <daniel@zonque.org> wrote:
>> In my tests, however, NF_INET_LOCAL_IN is iterated before early_demux()
>> is called,
> 
> Early demux occurs after PRE_ROUTING but before LOCAL_IN.

Hmm, you're right, except it isn't in my case. I'm not familiar with
that code, so please bear with me :)

In my simple test setup, when skbs are dequeued by process_backlog(),
they have skb->_skb_refdst set, and hence ip_rcv_finish() does not call
into early_demux() prior to iterating the INPUT chain:

ip_rcv_finish()
    if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL)
        ...
        ipprot->early_demux(skb);
        ...

Therefore, cgroup_mt() in xt_cgroup.c will be called with skb->sk ==
NULL, which makes the match callback ineffective. From looking at the
code, I assume xt_owner has the same problem.

However, when the skb is processed directly from the NIC's interrupt
handler, early_demux() is called as expected, and the match succeeds.

Any pointers how this can be solved would be greatly appreciated.


Thanks,
Daniel

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: cgroup matches in INPUT chain
  2015-03-20 13:57   ` Daniel Mack
@ 2015-03-20 16:11     ` Florian Westphal
  2015-03-20 16:21       ` Daniel Mack
  0 siblings, 1 reply; 8+ messages in thread
From: Florian Westphal @ 2015-03-20 16:11 UTC (permalink / raw)
  To: Daniel Mack
  Cc: Florian Westphal, Daniel Borkmann, Alexey Perevalov,
	Pablo Neira Ayuso, netdev

Daniel Mack <daniel@zonque.org> wrote:
> On 03/19/2015 07:58 PM, Florian Westphal wrote:
> > Daniel Mack <daniel@zonque.org> wrote:
> >> In my tests, however, NF_INET_LOCAL_IN is iterated before early_demux()
> >> is called,
> > 
> > Early demux occurs after PRE_ROUTING but before LOCAL_IN.
> 
> Hmm, you're right, except it isn't in my case. I'm not familiar with
> that code, so please bear with me :)
> 
> In my simple test setup, when skbs are dequeued by process_backlog(),
> they have skb->_skb_refdst set, and hence ip_rcv_finish() does not call
> into early_demux() prior to iterating the INPUT chain:

Yes, because we already have a route set.

Are we talking about loopback?
What are you trying to do?

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: cgroup matches in INPUT chain
  2015-03-20 16:11     ` Florian Westphal
@ 2015-03-20 16:21       ` Daniel Mack
  2015-03-20 20:18         ` Daniel Borkmann
  2015-03-20 20:55         ` Cong Wang
  0 siblings, 2 replies; 8+ messages in thread
From: Daniel Mack @ 2015-03-20 16:21 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Daniel Borkmann, Alexey Perevalov, Pablo Neira Ayuso, netdev

On 03/20/2015 05:11 PM, Florian Westphal wrote:
> Daniel Mack <daniel@zonque.org> wrote:
>> In my simple test setup, when skbs are dequeued by process_backlog(),
>> they have skb->_skb_refdst set, and hence ip_rcv_finish() does not call
>> into early_demux() prior to iterating the INPUT chain:
> 
> Yes, because we already have a route set.
> 
> Are we talking about loopback?

I'm testing this on the loopback device, but I've seen similar behavior
on external interfaces too. However, I fail to see a pattern in that.

> What are you trying to do?

Basically, I have a simple server that listens to a TCP port, accepts a
connection, writes out a short string and closes the connection again.
The process is put into a netcls cgroup controller, and a classid is
assigned to it, and I'm trying to catch all traffic sent to it (regardless
of the interface in use) with a netfilter rule.

However, that doesn't work, because under the described circumstances,
the match callback of the cgroup netfilter module is always called with
an skb that has no sk set.


Thanks for your help!


Daniel

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: cgroup matches in INPUT chain
  2015-03-20 16:21       ` Daniel Mack
@ 2015-03-20 20:18         ` Daniel Borkmann
  2015-03-20 20:55         ` Cong Wang
  1 sibling, 0 replies; 8+ messages in thread
From: Daniel Borkmann @ 2015-03-20 20:18 UTC (permalink / raw)
  To: Daniel Mack, Florian Westphal; +Cc: Alexey Perevalov, Pablo Neira Ayuso, netdev

On 03/20/2015 05:21 PM, Daniel Mack wrote:
> On 03/20/2015 05:11 PM, Florian Westphal wrote:
>> Daniel Mack <daniel@zonque.org> wrote:
>>> In my simple test setup, when skbs are dequeued by process_backlog(),
>>> they have skb->_skb_refdst set, and hence ip_rcv_finish() does not call
>>> into early_demux() prior to iterating the INPUT chain:
>>
>> Yes, because we already have a route set.
>>
>> Are we talking about loopback?
>
> I'm testing this on the loopback device, but I've seen similar behavior
> on external interfaces too. However, I fail to see a pattern in that.
>
>> What are you trying to do?
>
> Basically, I have a simple server that listens to a TCP port, accepts a
> connection, writes out a short string and closes the connection again.
> The process is put into a netcls cgroup controller, and a classid is
> assigned to it, and I'm trying catch all traffic sent to it (regardless
> of the interface in use) with a netfilter rule.
>
> However, that doesn't work, because under the described circumstances,
> the match callback of the cgroup netfilter module is always called with
> an skb that has no sk set.

Thanks for the report Daniel. I see the same here, so let me look
closer into it on Monday and get back to you. Looks like commit
a00e76349f3564bb is not sufficient.

Cheers,
Daniel

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: cgroup matches in INPUT chain
  2015-03-20 16:21       ` Daniel Mack
  2015-03-20 20:18         ` Daniel Borkmann
@ 2015-03-20 20:55         ` Cong Wang
  2015-03-20 22:07           ` Daniel Mack
  1 sibling, 1 reply; 8+ messages in thread
From: Cong Wang @ 2015-03-20 20:55 UTC (permalink / raw)
  To: Daniel Mack
  Cc: Florian Westphal, Daniel Borkmann, Alexey Perevalov,
	Pablo Neira Ayuso, netdev

On Fri, Mar 20, 2015 at 9:21 AM, Daniel Mack <daniel@zonque.org> wrote:
> On 03/20/2015 05:11 PM, Florian Westphal wrote:
>> Daniel Mack <daniel@zonque.org> wrote:
>>> In my simple test setup, when skbs are dequeued by process_backlog(),
>>> they have skb->_skb_refdst set, and hence ip_rcv_finish() does not call
>>> into early_demux() prior to iterating the INPUT chain:
>>
>> Yes, because we already have a route set.
>>
>> Are we talking about loopback?
>
> I'm testing this on the loopback device, but I've seen similar behavior
> on external interfaces too. However, I fail to see a pattern in that.
>

Loopback is special because the skb->dst is kept across TX and RX.

Is it possible that your external interface sets dst for the packets?
Are you using a tunnel device or you have some other setup you didn't mention?

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: cgroup matches in INPUT chain
  2015-03-20 20:55         ` Cong Wang
@ 2015-03-20 22:07           ` Daniel Mack
  0 siblings, 0 replies; 8+ messages in thread
From: Daniel Mack @ 2015-03-20 22:07 UTC (permalink / raw)
  To: Cong Wang
  Cc: Florian Westphal, Daniel Borkmann, Alexey Perevalov,
	Pablo Neira Ayuso, netdev

On 03/20/2015 09:55 PM, Cong Wang wrote:
> On Fri, Mar 20, 2015 at 9:21 AM, Daniel Mack <daniel@zonque.org> wrote:
>> I'm testing this on the loopback device, but I've seen similar behavior
>> on external interfaces too. However, I fail to see a pattern in that.
> 
> Loopback is special because the skb->dst is kept across TX and RX.

Ok, but that alone means we need special treatment in netfilter modules
that want to make a verdict on incoming packets based on information
stored in skb->sk, at least in case the packet happens to arrive on the
loopback device. Daniel Borkmann gave me a heads-up that xt_owner is
only for OUTPUT, so it's not affected by this issue. And xt_socket
implements its own socket lookup, so AFAICS the only module that's left
is xt_cgroup.

> How possible is your external interface sets dst for the packets?

That's what I don't know either, but my knowledge on the network core
details is admittedly limited.

> Are you using a tunnel device or you have some other setup you didn't mention?

Nope. I'm running my test with VirtualBox and do port forwarding from
the host into the VM. No tunnel devices or otherwise unusual network
setup is in place.

Inside the VM, I'm starting a very simple server that listens to a TCP
socket and I install a dummy netfilter rule for a cgroup into the INPUT
chain, just to make the match callback fire.

When I connect to that port from the host (via port forwarding), in the
netfilter callbacks skb->sk is NULL, skb->_skb_refdst is non-NULL, and a
stack trace produced by a WARN_ON(!skb->sk) in cgroup_mt() looks like this:

  <IRQ>  [<ffffffff8170fb51>] dump_stack+0x45/0x57
  [<ffffffff8109820a>] warn_slowpath_common+0x8a/0xc0
  [<ffffffff8109833a>] warn_slowpath_null+0x1a/0x20
  [<ffffffffa01b90b3>] cgroup_mt+0x93/0x95 [xt_cgroup]
  [<ffffffff81696965>] ipt_do_table+0x2a5/0x730
  [<ffffffff8163f6a0>] ? ip_rcv_finish+0x320/0x320
  [<ffffffff81698e54>] iptable_filter_hook+0x34/0x70
  [<ffffffff8163614a>] nf_iterate+0xaa/0xc0
  [<ffffffff8163f6a0>] ? ip_rcv_finish+0x320/0x320
  [<ffffffff816361e4>] nf_hook_slow+0x84/0x130
  [<ffffffff8163f6a0>] ? ip_rcv_finish+0x320/0x320
  [<ffffffff8163fa57>] ip_local_deliver+0x77/0x90
  [<ffffffff8163f3fa>] ip_rcv_finish+0x7a/0x320
  [<ffffffff8163fd08>] ip_rcv+0x298/0x390
  [<ffffffff8160050c>] __netif_receive_skb_core+0x1bc/0x9e0
  [<ffffffff81105274>] ? run_posix_cpu_timers+0x54/0x590
  [<ffffffff81600d48>] __netif_receive_skb+0x18/0x60
  [<ffffffff81600dd0>] netif_receive_skb_internal+0x40/0xc0
  [<ffffffff81601a48>] napi_gro_receive+0xc8/0x100
  [<ffffffffa0012144>] e1000_clean_rx_irq+0x164/0x520 [e1000]
  [<ffffffffa0013fa8>] e1000_clean+0x288/0x910 [e1000]
  [<ffffffff8104b92d>] ? lapic_next_event+0x1d/0x30
  [<ffffffff81718b56>] ? smp_apic_timer_interrupt+0x46/0x60
  [<ffffffff816012da>] net_rx_action+0x1ca/0x2f0
  [<ffffffff8109c5ab>] __do_softirq+0x10b/0x2d0
  [<ffffffff8109c9b5>] irq_exit+0x145/0x150
  [<ffffffff81718a78>] do_IRQ+0x58/0xf0
  [<ffffffff817169ad>] common_interrupt+0x6d/0x6d
  <EOI>  [<ffffffff811105e0>] ? tick_nohz_idle_exit+0xc0/0x140
  [<ffffffff811105d9>] ? tick_nohz_idle_exit+0xb9/0x140
  [<ffffffff810da160>] cpu_startup_entry+0x180/0x430
  [<ffffffff81706957>] rest_init+0x77/0x80
  [<ffffffff81d22025>] start_kernel+0x486/0x4a7
  [<ffffffff81d21120>] ? early_idt_handlers+0x120/0x120
  [<ffffffff81d21339>] x86_64_start_reservations+0x2a/0x2c
  [<ffffffff81d2149c>] x86_64_start_kernel+0x161/0x184
 ---[ end trace b96fff2079da6cf9 ]---


Thanks for looking into this,
Daniel

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2015-03-20 22:07 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-19 18:41 cgroup matches in INPUT chain Daniel Mack
2015-03-19 18:58 ` Florian Westphal
2015-03-20 13:57   ` Daniel Mack
2015-03-20 16:11     ` Florian Westphal
2015-03-20 16:21       ` Daniel Mack
2015-03-20 20:18         ` Daniel Borkmann
2015-03-20 20:55         ` Cong Wang
2015-03-20 22:07           ` Daniel Mack

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.