All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] net: reflect the fwmark for replies with no socket
@ 2014-03-11 11:15 Lorenzo Colitti
  2014-03-11 20:52 ` Julian Anastasov
  2014-03-12 20:17 ` David Miller
  0 siblings, 2 replies; 12+ messages in thread
From: Lorenzo Colitti @ 2014-03-11 11:15 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Lorenzo Colitti

Kernel-originated IP packets that have no user socket associated
with them (e.g., ICMP errors and echo replies, TCP RSTs, etc.)
are emitted with a mark of zero. Instead, make them have the
same mark as the packet they are replying to.

This is consistent with TOS, which is also reflected from the
incoming packet, and it allows the administrator to use
mark-based routing, firewalling, etc. for these replies by
marking the original packets inbound.

Also fix the IPv6 code to reflect the tclass in replies like the
IPv4 code does.

Change-Id: Ifd8dd75016e60dc982e7860f720d45c27dcaf04c
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
---
 net/ipv4/icmp.c      | 10 ++++++++--
 net/ipv4/ip_output.c |  2 +-
 net/ipv6/tcp_ipv6.c  |  7 +++++--
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0134663..3b101a4 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -349,6 +349,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	icmp_param->data.icmph.checksum = 0;
 
 	inet->tos = ip_hdr(skb)->tos;
+	sk->sk_mark = skb->mark;
 	daddr = ipc.addr = ip_hdr(skb)->saddr;
 	saddr = fib_compute_spec_dst(skb);
 	ipc.opt = NULL;
@@ -364,6 +365,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.daddr = daddr;
 	fl4.saddr = saddr;
+	fl4.flowi4_mark = skb->mark;
 	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
 	fl4.flowi4_proto = IPPROTO_ICMP;
 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -382,7 +384,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 					struct flowi4 *fl4,
 					struct sk_buff *skb_in,
 					const struct iphdr *iph,
-					__be32 saddr, u8 tos,
+					__be32 saddr, u8 tos, u32 mark,
 					int type, int code,
 					struct icmp_bxm *param)
 {
@@ -394,6 +396,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->daddr = (param->replyopts.opt.opt.srr ?
 		      param->replyopts.opt.opt.faddr : iph->saddr);
 	fl4->saddr = saddr;
+	fl4->flowi4_mark = mark;
 	fl4->flowi4_tos = RT_TOS(tos);
 	fl4->flowi4_proto = IPPROTO_ICMP;
 	fl4->fl4_icmp_type = type;
@@ -491,6 +494,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	struct flowi4 fl4;
 	__be32 saddr;
 	u8  tos;
+	u32 mark;
 	struct net *net;
 	struct sock *sk;
 
@@ -592,6 +596,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
 					   IPTOS_PREC_INTERNETCONTROL) :
 					  iph->tos;
+	mark = skb_in->mark;
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
 		goto out_unlock;
@@ -608,13 +613,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	icmp_param->skb	  = skb_in;
 	icmp_param->offset = skb_network_offset(skb_in);
 	inet_sk(sk)->tos = tos;
+	sk->sk_mark = mark;
 	ipc.addr = iph->saddr;
 	ipc.opt = &icmp_param->replyopts.opt;
 	ipc.tx_flags = 0;
 	ipc.ttl = 0;
 	ipc.tos = -1;
 
-	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos,
+	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
 			       type, code, icmp_param);
 	if (IS_ERR(rt))
 		goto out_unlock;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 1a0755f..a6039b1 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1501,7 +1501,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
 			daddr = replyopts.opt.opt.faddr;
 	}
 
-	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
+	flowi4_init_output(&fl4, arg->bound_dev_if, skb->mark,
 			   RT_TOS(arg->tos),
 			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
 			   ip_reply_arg_flowi_flags(arg),
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3277680..b4f0388 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -802,6 +802,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
 	fl6.flowi6_proto = IPPROTO_TCP;
 	if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL)
 		fl6.flowi6_oif = inet6_iif(skb);
+	fl6.flowi6_mark = skb->mark;
 	fl6.fl6_dport = t1->dest;
 	fl6.fl6_sport = t1->source;
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
@@ -828,6 +829,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 	const struct tcphdr *th = tcp_hdr(skb);
 	u32 seq = 0, ack_seq = 0;
 	struct tcp_md5sig_key *key = NULL;
+	u8 tclass;
 #ifdef CONFIG_TCP_MD5SIG
 	const __u8 *hash_location = NULL;
 	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
@@ -878,7 +880,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 		ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
 			  (th->doff << 2);
 
-	tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, key, 1, 0, 0);
+	tclass = ipv6_get_dsfield(ipv6_hdr(skb));
+	tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, key, 1, tclass, 0);
 
 #ifdef CONFIG_TCP_MD5SIG
 release_sk1:
@@ -918,7 +921,7 @@ static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 	tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1,
 			req->rcv_wnd, tcp_time_stamp, req->ts_recent,
 			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
-			0, 0);
+			ipv6_get_dsfield(ipv6_hdr(skb)), 0);
 }
 
 
-- 
1.9.0.279.gdc9e3eb

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] net: reflect the fwmark for replies with no socket
  2014-03-11 11:15 [PATCH] net: reflect the fwmark for replies with no socket Lorenzo Colitti
@ 2014-03-11 20:52 ` Julian Anastasov
  2014-03-12  1:36   ` Lorenzo Colitti
  2014-03-12 20:17 ` David Miller
  1 sibling, 1 reply; 12+ messages in thread
From: Julian Anastasov @ 2014-03-11 20:52 UTC (permalink / raw)
  To: Lorenzo Colitti; +Cc: netdev, David Miller


	Hello,

On Tue, 11 Mar 2014, Lorenzo Colitti wrote:

> Kernel-originated IP packets that have no user socket associated
> with them (e.g., ICMP errors and echo replies, TCP RSTs, etc.)
> are emitted with a mark of zero. Instead, make them have the
> same mark as the packet they are replying to.
> 
> This is consistent with TOS, which is also reflected from the
> incoming packet, and it allows the administrator to use
> mark-based routing, firewalling, etc. for these replies by
> marking the original packets inbound.

	If you can do it with CONNMARK save+restore,
do not change behavior for existing setups. It is
hard to predict how the mark is used in ip and
POST ROUTING rules.

> Also fix the IPv6 code to reflect the tclass in replies like the
> IPv4 code does.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] net: reflect the fwmark for replies with no socket
  2014-03-11 20:52 ` Julian Anastasov
@ 2014-03-12  1:36   ` Lorenzo Colitti
  2014-03-12  8:22     ` Julian Anastasov
  0 siblings, 1 reply; 12+ messages in thread
From: Lorenzo Colitti @ 2014-03-12  1:36 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: netdev, David Miller

On Wed, Mar 12, 2014 at 5:52 AM, Julian Anastasov <ja@ssi.bg> wrote:
>         If you can do it with CONNMARK save+restore,
> do not change behavior for existing setups. It is
> hard to predict how the mark is used in ip and
> POST ROUTING rules.

I think CONNMARK save/restore can do some of this but not all of it.

For example, consider the case of mark-based routing ("ip rule add
fwmark 123 lookup 456"). Setting the mark when emitting the packet
from protocol code (i.e., this patch), will cause the right route to
be selected. This will properly follow the routing policy chain
(including match routes like "unreachable" and "prohibit" if present),
use the right MTU, select the right source address for ICMP errors,
pick the right MSS/rwin for inclusion in SYN+ACK packets, and so on.

Some of this can be done after the fact using a combination of the
route target, snat/masquerade, MSS rewriting, and so on. However, that
essentially requires duplicating routing and source address selection
into iptables rules. This is brittle and particularly difficult when
dealing with IPv6, where routes can come from autoconf and source
address selection is more complex than in IPv4. It also requires using
snat/masquerade for IPv6, which is something we should avoid if
possible. In general rule it seems to me that it's better to have the
kernel emit the correct packet the first time around than having to
use iptables CONNTRACK to fix it up after the fact.

How much of a concern is modifying existing behaviour? Personally I
think reflecting the mark is better than just using 0 all the time,
but of course, it's possible that people are explicitly matching mark
0 in firewall rules or routing tables. Would it be better if this was
a sysctl?

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] net: reflect the fwmark for replies with no socket
  2014-03-12  1:36   ` Lorenzo Colitti
@ 2014-03-12  8:22     ` Julian Anastasov
  2014-04-03  8:55       ` Lorenzo Colitti
  0 siblings, 1 reply; 12+ messages in thread
From: Julian Anastasov @ 2014-03-12  8:22 UTC (permalink / raw)
  To: Lorenzo Colitti; +Cc: netdev, David Miller


	Hello,

On Wed, 12 Mar 2014, Lorenzo Colitti wrote:

> On Wed, Mar 12, 2014 at 5:52 AM, Julian Anastasov <ja@ssi.bg> wrote:
> >         If you can do it with CONNMARK save+restore,
> > do not change behavior for existing setups. It is
> > hard to predict how the mark is used in ip and
> > POST ROUTING rules.
> 
> I think CONNMARK save/restore can do some of this but not all of it.
> 
> For example, consider the case of mark-based routing ("ip rule add
> fwmark 123 lookup 456"). Setting the mark when emitting the packet
> from protocol code (i.e., this patch), will cause the right route to
> be selected. This will properly follow the routing policy chain
> (including match routes like "unreachable" and "prohibit" if present),
> use the right MTU, select the right source address for ICMP errors,
> pick the right MSS/rwin for inclusion in SYN+ACK packets, and so on.

	I agree that CONNMARK will do second route lookup
in the OUTPUT hook, ipt_mangle_out will detect the mark
change, so it is a slower alternative. And limited as
you pointed out.

> Some of this can be done after the fact using a combination of the
> route target, snat/masquerade, MSS rewriting, and so on. However, that
> essentially requires duplicating routing and source address selection
> into iptables rules. This is brittle and particularly difficult when
> dealing with IPv6, where routes can come from autoconf and source
> address selection is more complex than in IPv4. It also requires using
> snat/masquerade for IPv6, which is something we should avoid if
> possible. In general rule it seems to me that it's better to have the
> kernel emit the correct packet the first time around than having to
> use iptables CONNTRACK to fix it up after the fact.
> 
> How much of a concern is modifying existing behaviour? Personally I
> think reflecting the mark is better than just using 0 all the time,

	Problem should be for setups that use fwmark
for different purposes in both directions. There can
be many examples, one is for asymmetric routing,
ip rule that delivers locally the marked packets:

1. PRE_ROUTING: set mark for dport 80
2. ip rule ... fwmark N table to_local;
   ip route add local 0/0 dev lo table to_local
3. IPVS direct routing to many proxies on LAN

	What we can say is that such ip rule with fwmark
will start to work for the output traffic unexpectedly.
The TOS replying suffers from the same problem, I guess
routing by TOS is not used much and nobody complained.

> but of course, it's possible that people are explicitly matching mark
> 0 in firewall rules or routing tables. Would it be better if this was
> a sysctl?

	sysctl looks better to me. For example,
ip_reply_fwmark(net, fwmark) that will return 0 or
fwmark depending on net->ipv4.sysctl_reply_fwmark boolean.
Similar for ip6_reply_fwmark() and net->ipv6.sysctl_reply_fwmark.
You can select the right names.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] net: reflect the fwmark for replies with no socket
  2014-03-11 11:15 [PATCH] net: reflect the fwmark for replies with no socket Lorenzo Colitti
  2014-03-11 20:52 ` Julian Anastasov
@ 2014-03-12 20:17 ` David Miller
  2014-03-17  5:24   ` Lorenzo Colitti
  1 sibling, 1 reply; 12+ messages in thread
From: David Miller @ 2014-03-12 20:17 UTC (permalink / raw)
  To: lorenzo; +Cc: netdev

From: Lorenzo Colitti <lorenzo@google.com>
Date: Tue, 11 Mar 2014 20:15:41 +0900

> Kernel-originated IP packets that have no user socket associated
> with them (e.g., ICMP errors and echo replies, TCP RSTs, etc.)
> are emitted with a mark of zero. Instead, make them have the
> same mark as the packet they are replying to.
> 
> This is consistent with TOS, which is also reflected from the
> incoming packet, and it allows the administrator to use
> mark-based routing, firewalling, etc. for these replies by
> marking the original packets inbound.
> 
> Also fix the IPv6 code to reflect the tclass in replies like the
> IPv4 code does.
> 
> Change-Id: Ifd8dd75016e60dc982e7860f720d45c27dcaf04c
> Signed-off-by: Lorenzo Colitti <lorenzo@google.com>

I don't think it's safe to change this behavior after all of this
time.

And incoming marks absolutely do not have any automatic relation
to what an administartor might want to use for output packets.

I'm not applying this, sorry.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] net: reflect the fwmark for replies with no socket
  2014-03-12 20:17 ` David Miller
@ 2014-03-17  5:24   ` Lorenzo Colitti
  2014-03-17 18:55     ` David Miller
  0 siblings, 1 reply; 12+ messages in thread
From: Lorenzo Colitti @ 2014-03-17  5:24 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Thu, Mar 13, 2014 at 5:17 AM, David Miller <davem@davemloft.net> wrote:
>> Kernel-originated IP packets that have no user socket associated
>> with them (e.g., ICMP errors and echo replies, TCP RSTs, etc.)
>> are emitted with a mark of zero. Instead, make them have the
>> same mark as the packet they are replying to.
>>
>> This is consistent with TOS, which is also reflected from the
>> incoming packet, and it allows the administrator to use
>> mark-based routing, firewalling, etc. for these replies by
>> marking the original packets inbound.
>>
>> Also fix the IPv6 code to reflect the tclass in replies like the
>> IPv4 code does.
>>
>> Change-Id: Ifd8dd75016e60dc982e7860f720d45c27dcaf04c
>> Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
>
> I don't think it's safe to change this behavior after all of this
> time.

Fair enough. My bad.

> And incoming marks absolutely do not have any automatic relation
> to what an administartor might want to use for output packets.

In general, that's true. Still - currently the mark on these packets
is 0, period, and the administrator can't do anything about that at
all. The idea was that some control is better than zero control, and
since these packets are always in reply to other packets, then marking
responses based on the packets that provoked them made sense, since
the input marking can be as flexible as the administrator desires.

If this was optional behaviour (e.g., controlled by
/proc/sys/net/ipv{4,6}/reflect_fwmark), would that be acceptable?
That way, even if it doesn't work for everyone, at least some
environments will be able to use it. I'm happy to send in a new patch
that does that.

If not, do you see another way to do this? If you use mark-based
routing, there doesn't currently seem to be a good way of sending
these packets where they need to go. iptables doesn't really work,
because it can only touch the packet after a lot of it has been
decided. You can override some of those decisions via NAT / MSS
rewrite / ..., but it gets messy, and it seems better to send the
right packet off the bat rather than send the wrong one and then
rewrite it without updating any of the socket structures.

Regards,
Lorenzo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] net: reflect the fwmark for replies with no socket
  2014-03-17  5:24   ` Lorenzo Colitti
@ 2014-03-17 18:55     ` David Miller
  0 siblings, 0 replies; 12+ messages in thread
From: David Miller @ 2014-03-17 18:55 UTC (permalink / raw)
  To: lorenzo; +Cc: netdev

From: Lorenzo Colitti <lorenzo@google.com>
Date: Mon, 17 Mar 2014 14:24:57 +0900

> In general, that's true. Still - currently the mark on these packets
> is 0, period, and the administrator can't do anything about that at
> all. The idea was that some control is better than zero control, and
> since these packets are always in reply to other packets, then marking
> responses based on the packets that provoked them made sense, since
> the input marking can be as flexible as the administrator desires.

I do not agree with this assesment at all, a default of 0 is better
than an potentially undesirable default.

The incoming fwmark is controlled by external entities, and most people
are usually unhappy with external entities having influence upon their
machine's behavior.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] net: reflect the fwmark for replies with no socket
  2014-03-12  8:22     ` Julian Anastasov
@ 2014-04-03  8:55       ` Lorenzo Colitti
  2014-04-03  9:00         ` [PATCH] net: add a sysctl to reflect the fwmark on replies Lorenzo Colitti
  2014-04-03  9:06         ` [PATCH] net: reflect the fwmark for replies with no socket Lorenzo Colitti
  0 siblings, 2 replies; 12+ messages in thread
From: Lorenzo Colitti @ 2014-04-03  8:55 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: netdev, David Miller

On Wed, Mar 12, 2014 at 5:22 PM, Julian Anastasov <ja@ssi.bg> wrote:
>         sysctl looks better to me. For example,
> ip_reply_fwmark(net, fwmark) that will return 0 or
> fwmark depending on net->ipv4.sysctl_reply_fwmark boolean.
> Similar for ip6_reply_fwmark() and net->ipv6.sysctl_reply_fwmark.
> You can select the right names.

Ack. I'll shortly post a patch that does this. It uses one
per-namespace sysctl - net.ipv4.fwmark_reflect - for both IPv4 and
IPv6. Using only one sysctl is consistent with things like ping and
tcp, and it marginally shortens the code, but I can change this to two
if desired.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] net: add a sysctl to reflect the fwmark on replies
  2014-04-03  8:55       ` Lorenzo Colitti
@ 2014-04-03  9:00         ` Lorenzo Colitti
  2014-04-03 12:10           ` Hannes Frederic Sowa
  2014-04-03  9:06         ` [PATCH] net: reflect the fwmark for replies with no socket Lorenzo Colitti
  1 sibling, 1 reply; 12+ messages in thread
From: Lorenzo Colitti @ 2014-04-03  9:00 UTC (permalink / raw)
  To: netdev; +Cc: davem, ja, Lorenzo Colitti

Kernel-originated IP packets that have no user socket associated
with them (e.g., ICMP errors and echo replies, TCP RSTs, etc.)
are emitted with a mark of zero. Add a sysctl to make them have
the same mark as the packet they are replying to.

This allows an administrator that wishes to do so to use
mark-based routing, firewalling, etc. for these replies by
marking the original packets inbound.

Tested using user-mode linux:
 - ICMP/ICMPv6 echo replies and errors.
 - TCP RST packets (IPv4 and IPv6).

Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
---
 include/net/ip.h           |  3 +++
 include/net/netns/ipv4.h   |  2 ++
 net/ipv4/icmp.c            | 11 +++++++++--
 net/ipv4/ip_output.c       |  3 ++-
 net/ipv4/sysctl_net_ipv4.c |  7 +++++++
 net/ipv6/icmp.c            |  6 ++++++
 net/ipv6/tcp_ipv6.c        |  1 +
 7 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 25064c2..2c0fc74 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -238,6 +238,9 @@ void ipfrag_init(void);
 
 void ip_static_sysctl_init(void);
 
+#define IP_REPLY_MARK(net, mark) \
+	((net)->ipv4.sysctl_fwmark_reflect ? (mark) : 0)
+
 static inline bool ip_is_fragment(const struct iphdr *iph)
 {
 	return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 80f500a..8e1a9c0 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -72,6 +72,8 @@ struct netns_ipv4 {
 	int sysctl_ip_no_pmtu_disc;
 	int sysctl_ip_fwd_use_pmtu;
 
+	int sysctl_fwmark_reflect;
+
 	kgid_t sysctl_ping_group_range[2];
 
 	atomic_t dev_addr_genid;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0134663..e6128cd 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -337,6 +337,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct sock *sk;
 	struct inet_sock *inet;
 	__be32 daddr, saddr;
+	u32 mark = IP_REPLY_MARK(net, skb->mark);
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
@@ -349,6 +350,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	icmp_param->data.icmph.checksum = 0;
 
 	inet->tos = ip_hdr(skb)->tos;
+	sk->sk_mark = mark;
 	daddr = ipc.addr = ip_hdr(skb)->saddr;
 	saddr = fib_compute_spec_dst(skb);
 	ipc.opt = NULL;
@@ -364,6 +366,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.daddr = daddr;
 	fl4.saddr = saddr;
+	fl4.flowi4_mark = mark;
 	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
 	fl4.flowi4_proto = IPPROTO_ICMP;
 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -382,7 +385,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 					struct flowi4 *fl4,
 					struct sk_buff *skb_in,
 					const struct iphdr *iph,
-					__be32 saddr, u8 tos,
+					__be32 saddr, u8 tos, u32 mark,
 					int type, int code,
 					struct icmp_bxm *param)
 {
@@ -394,6 +397,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->daddr = (param->replyopts.opt.opt.srr ?
 		      param->replyopts.opt.opt.faddr : iph->saddr);
 	fl4->saddr = saddr;
+	fl4->flowi4_mark = mark;
 	fl4->flowi4_tos = RT_TOS(tos);
 	fl4->flowi4_proto = IPPROTO_ICMP;
 	fl4->fl4_icmp_type = type;
@@ -491,6 +495,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	struct flowi4 fl4;
 	__be32 saddr;
 	u8  tos;
+	u32 mark;
 	struct net *net;
 	struct sock *sk;
 
@@ -592,6 +597,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
 					   IPTOS_PREC_INTERNETCONTROL) :
 					  iph->tos;
+	mark = IP_REPLY_MARK(net, skb_in->mark);
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
 		goto out_unlock;
@@ -608,13 +614,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	icmp_param->skb	  = skb_in;
 	icmp_param->offset = skb_network_offset(skb_in);
 	inet_sk(sk)->tos = tos;
+	sk->sk_mark = mark;
 	ipc.addr = iph->saddr;
 	ipc.opt = &icmp_param->replyopts.opt;
 	ipc.tx_flags = 0;
 	ipc.ttl = 0;
 	ipc.tos = -1;
 
-	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos,
+	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
 			       type, code, icmp_param);
 	if (IS_ERR(rt))
 		goto out_unlock;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 1a0755f..be93317 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1501,7 +1501,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
 			daddr = replyopts.opt.opt.faddr;
 	}
 
-	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
+	flowi4_init_output(&fl4, arg->bound_dev_if,
+			   IP_REPLY_MARK(net, skb->mark),
 			   RT_TOS(arg->tos),
 			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
 			   ip_reply_arg_flowi_flags(arg),
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 44eba05..e40a738 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -838,6 +838,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "fwmark_reflect",
+		.data		= &init_net.ipv4.sysctl_fwmark_reflect,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ }
 };
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 7b32652..63e96841 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -400,6 +400,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
 	int len;
 	int hlimit;
 	int err = 0;
+	u32 mark = IP_REPLY_MARK(net, skb->mark);
 
 	if ((u8 *)hdr < skb->head ||
 	    (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb))
@@ -466,6 +467,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
 	fl6.daddr = hdr->saddr;
 	if (saddr)
 		fl6.saddr = *saddr;
+	fl6.flowi6_mark = mark;
 	fl6.flowi6_oif = iif;
 	fl6.fl6_icmp_type = type;
 	fl6.fl6_icmp_code = code;
@@ -474,6 +476,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
 	sk = icmpv6_xmit_lock(net);
 	if (sk == NULL)
 		return;
+	sk->sk_mark = mark;
 	np = inet6_sk(sk);
 
 	if (!icmpv6_xrlim_allow(sk, type, &fl6))
@@ -556,6 +559,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	int err = 0;
 	int hlimit;
 	u8 tclass;
+	u32 mark = IP_REPLY_MARK(net, skb->mark);
 
 	saddr = &ipv6_hdr(skb)->daddr;
 
@@ -574,11 +578,13 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 		fl6.saddr = *saddr;
 	fl6.flowi6_oif = skb->dev->ifindex;
 	fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
+	fl6.flowi6_mark = mark;
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
 
 	sk = icmpv6_xmit_lock(net);
 	if (sk == NULL)
 		return;
+	sk->sk_mark = mark;
 	np = inet6_sk(sk);
 
 	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5ca56ce..0315c59 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -802,6 +802,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
 		fl6.flowi6_oif = inet6_iif(skb);
 	else
 		fl6.flowi6_oif = oif;
+	fl6.flowi6_mark = IP_REPLY_MARK(net, skb->mark);
 	fl6.fl6_dport = t1->dest;
 	fl6.fl6_sport = t1->source;
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] net: reflect the fwmark for replies with no socket
  2014-04-03  8:55       ` Lorenzo Colitti
  2014-04-03  9:00         ` [PATCH] net: add a sysctl to reflect the fwmark on replies Lorenzo Colitti
@ 2014-04-03  9:06         ` Lorenzo Colitti
  1 sibling, 0 replies; 12+ messages in thread
From: Lorenzo Colitti @ 2014-04-03  9:06 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: netdev, David Miller

On Thu, Apr 3, 2014 at 5:55 PM, Lorenzo Colitti <lorenzo@google.com> wrote:
> Ack. I'll shortly post a patch that does this. It uses one
> per-namespace sysctl - net.ipv4.fwmark_reflect - for both IPv4 and
> IPv6.

http://patchwork.ozlabs.org/patch/336553/

David, does it look better to you now it's an option?

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] net: add a sysctl to reflect the fwmark on replies
  2014-04-03  9:00         ` [PATCH] net: add a sysctl to reflect the fwmark on replies Lorenzo Colitti
@ 2014-04-03 12:10           ` Hannes Frederic Sowa
  2014-04-04 15:21             ` Lorenzo Colitti
  0 siblings, 1 reply; 12+ messages in thread
From: Hannes Frederic Sowa @ 2014-04-03 12:10 UTC (permalink / raw)
  To: Lorenzo Colitti; +Cc: netdev, davem, ja

On Thu, Apr 03, 2014 at 06:00:01PM +0900, Lorenzo Colitti wrote:
> Kernel-originated IP packets that have no user socket associated
> with them (e.g., ICMP errors and echo replies, TCP RSTs, etc.)
> are emitted with a mark of zero. Add a sysctl to make them have
> the same mark as the packet they are replying to.
> 
> This allows an administrator that wishes to do so to use
> mark-based routing, firewalling, etc. for these replies by
> marking the original packets inbound.
> 
> Tested using user-mode linux:
>  - ICMP/ICMPv6 echo replies and errors.
>  - TCP RST packets (IPv4 and IPv6).

Hmm, maybe add two sysctls to distinguish between IPv4 and IPv6?

We share the sysctl namespace for tcp with ipv4, but otherwise the knobs
are isolated if they touch ip-only behaviour.

Also net-next is currently closed, so the patch would have to wait until it
opens back up in a few weeks(?).

Bye,

  Hannes

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] net: add a sysctl to reflect the fwmark on replies
  2014-04-03 12:10           ` Hannes Frederic Sowa
@ 2014-04-04 15:21             ` Lorenzo Colitti
  0 siblings, 0 replies; 12+ messages in thread
From: Lorenzo Colitti @ 2014-04-04 15:21 UTC (permalink / raw)
  To: Lorenzo Colitti, netdev, David Miller, Julian Anastasov

On Thu, Apr 3, 2014 at 9:10 PM, Hannes Frederic Sowa
<hannes@stressinduktion.org> wrote:
> We share the sysctl namespace for tcp with ipv4, but otherwise the knobs
> are isolated if they touch ip-only behaviour.
>
> Also net-next is currently closed, so the patch would have to wait until it
> opens back up in a few weeks(?).

Ack, thanks. I have an updated patch with two sysctls, but I'll wait
before posting it.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2014-04-04 15:21 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-03-11 11:15 [PATCH] net: reflect the fwmark for replies with no socket Lorenzo Colitti
2014-03-11 20:52 ` Julian Anastasov
2014-03-12  1:36   ` Lorenzo Colitti
2014-03-12  8:22     ` Julian Anastasov
2014-04-03  8:55       ` Lorenzo Colitti
2014-04-03  9:00         ` [PATCH] net: add a sysctl to reflect the fwmark on replies Lorenzo Colitti
2014-04-03 12:10           ` Hannes Frederic Sowa
2014-04-04 15:21             ` Lorenzo Colitti
2014-04-03  9:06         ` [PATCH] net: reflect the fwmark for replies with no socket Lorenzo Colitti
2014-03-12 20:17 ` David Miller
2014-03-17  5:24   ` Lorenzo Colitti
2014-03-17 18:55     ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.