netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next] route: allow to route in a peer netns via lwt framework
@ 2015-07-23 14:22 Nicolas Dichtel
  2015-07-23 15:01 ` roopa
  2015-07-24 14:28 ` [PATCH net-next] " David Ahern
  0 siblings, 2 replies; 18+ messages in thread
From: Nicolas Dichtel @ 2015-07-23 14:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, roopa, tgraf, Nicolas Dichtel

This patch takes advantage of the newly added lwtunnel framework to
allow the user to set routes that point to a peer netns.

Packets are injected to the peer netns via the loopback device. It works
only when the output device is 'lo'.

Example:
ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 drivers/net/loopback.c        | 16 +++++++++++++
 include/net/lwtunnel.h        | 23 +++++++++++++++++++
 include/uapi/linux/lwtunnel.h |  1 +
 net/core/net_namespace.c      | 52 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 92 insertions(+)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index c76283c2f84a..758d02f592f9 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -57,6 +57,7 @@
 #include <linux/percpu.h>
 #include <net/net_namespace.h>
 #include <linux/u64_stats_sync.h>
+#include <net/lwtunnel.h>
 
 struct pcpu_lstats {
 	u64			packets;
@@ -71,9 +72,23 @@ struct pcpu_lstats {
 static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 				 struct net_device *dev)
 {
+	int nsid = skb_lwt_netns_info(skb);
 	struct pcpu_lstats *lb_stats;
 	int len;
 
+	if (nsid >= 0) {
+		struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid);
+
+		if (!peernet) {
+			kfree_skb(skb);
+			goto end;
+		}
+
+		dev_forward_skb(peernet->loopback_dev, skb);
+		put_net(peernet);
+		goto end;
+	}
+
 	skb_orphan(skb);
 
 	/* Before queueing this packet to netif_rx(),
@@ -94,6 +109,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 		u64_stats_update_end(&lb_stats->syncp);
 	}
 
+end:
 	return NETDEV_TX_OK;
 }
 
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 918e03c1dafa..cc05ce3c1aae 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -5,7 +5,9 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/types.h>
+#include <linux/net_namespace.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 
 #define LWTUNNEL_HASH_BITS   7
 #define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)
@@ -141,4 +143,25 @@ static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
 
 #endif
 
+static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
+{
+	return (u32 *)lwtstate->data;
+}
+
+static inline int skb_lwt_netns_info(struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct rtable *rt = (struct rtable *)skb_dst(skb);
+
+		if (rt && rt->rt_lwtstate)
+			return *lwt_netns_info(rt->rt_lwtstate);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+		if (rt6 && rt6->rt6i_lwtstate)
+			return *lwt_netns_info(rt6->rt6i_lwtstate);
+	}
+
+	return NETNSA_NSID_NOT_ASSIGNED;
+}
 #endif /* __NET_LWTUNNEL_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 31377bbea3f8..6715e7a1b335 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -7,6 +7,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_NONE,
 	LWTUNNEL_ENCAP_MPLS,
 	LWTUNNEL_ENCAP_IP,
+	LWTUNNEL_ENCAP_NETNS,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..c1267aac373d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -20,6 +20,7 @@
 #include <net/netlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/lwtunnel.h>
 
 /*
  *	Our network namespace constructor/destructor lists
@@ -725,6 +726,56 @@ out:
 	rtnl_set_sk_err(net, RTNLGRP_NSID, err);
 }
 
+static int lwt_netns_build_state(struct net_device *dev, struct nlattr *nla,
+				 struct lwtunnel_state **ts)
+{
+	struct nlattr *tb[NETNSA_MAX + 1];
+	struct lwtunnel_state *newts;
+	int *nsid;
+	int ret;
+
+	ret = nla_parse_nested(tb, NETNSA_MAX, nla, rtnl_net_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[NETNSA_NSID])
+		return -EINVAL;
+
+	newts = lwtunnel_state_alloc(sizeof(*nsid));
+	if (!newts)
+		return -ENOMEM;
+
+	newts->len = sizeof(*nsid);
+	nsid = lwt_netns_info(newts);
+	*nsid = nla_get_s32(tb[NETNSA_NSID]);
+	newts->type = LWTUNNEL_ENCAP_NETNS;
+
+	*ts = newts;
+	return 0;
+}
+
+static int lwt_netns_fill_encap_info(struct sk_buff *skb,
+				     struct lwtunnel_state *lwtstate)
+{
+	int *nsid = lwt_netns_info(lwtstate);
+
+	if (nla_put_s32(skb, NETNSA_NSID, *nsid))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int lwt_netns_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	return nla_total_size(4);	/* NETNSA_NSID */
+}
+
+static const struct lwtunnel_encap_ops lwt_netns_ops = {
+	.build_state = lwt_netns_build_state,
+	.fill_encap = lwt_netns_fill_encap_info,
+	.get_encap_size = lwt_netns_encap_nlsize,
+};
+
 static int __init net_ns_init(void)
 {
 	struct net_generic *ng;
@@ -762,6 +813,7 @@ static int __init net_ns_init(void)
 	rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
 		      NULL);
 
+	lwtunnel_encap_add_ops(&lwt_netns_ops, LWTUNNEL_ENCAP_NETNS);
 	return 0;
 }
 
-- 
2.4.2

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next] route: allow to route in a peer netns via lwt framework
  2015-07-23 14:22 [PATCH net-next] route: allow to route in a peer netns via lwt framework Nicolas Dichtel
@ 2015-07-23 15:01 ` roopa
  2015-07-23 15:25   ` Nicolas Dichtel
  2015-07-24 14:28 ` [PATCH net-next] " David Ahern
  1 sibling, 1 reply; 18+ messages in thread
From: roopa @ 2015-07-23 15:01 UTC (permalink / raw)
  To: Nicolas Dichtel; +Cc: davem, netdev, tgraf

On 7/23/15, 7:22 AM, Nicolas Dichtel wrote:
> This patch takes advantage of the newly added lwtunnel framework to
> allow the user to set routes that points to a peer netns.
>
> Packets are injected to the peer netns via the loopback device. It works
> only when the output device is 'lo'.
>
> Example:
> ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo
>
> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
> ---
>   drivers/net/loopback.c        | 16 +++++++++++++
>   include/net/lwtunnel.h        | 23 +++++++++++++++++++
>   include/uapi/linux/lwtunnel.h |  1 +
>   net/core/net_namespace.c      | 52 +++++++++++++++++++++++++++++++++++++++++++
>   4 files changed, 92 insertions(+)
>
> diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
> index c76283c2f84a..758d02f592f9 100644
> --- a/drivers/net/loopback.c
> +++ b/drivers/net/loopback.c
> @@ -57,6 +57,7 @@
>   #include <linux/percpu.h>
>   #include <net/net_namespace.h>
>   #include <linux/u64_stats_sync.h>
> +#include <net/lwtunnel.h>
>   
>   struct pcpu_lstats {
>   	u64			packets;
> @@ -71,9 +72,23 @@ struct pcpu_lstats {
>   static netdev_tx_t loopback_xmit(struct sk_buff *skb,
>   				 struct net_device *dev)
>   {
> +	int nsid = skb_lwt_netns_info(skb);
>   	struct pcpu_lstats *lb_stats;
>   	int len;
>   
> +	if (nsid >= 0) {
> +		struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid);
> +
> +		if (!peernet) {
> +			kfree_skb(skb);
> +			goto end;
> +		}
> +
> +		dev_forward_skb(peernet->loopback_dev, skb);
> +		put_net(peernet);
> +		goto end;
> +	}
> +
>   	skb_orphan(skb);
>   
>   	/* Before queueing this packet to netif_rx(),
> @@ -94,6 +109,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
>   		u64_stats_update_end(&lb_stats->syncp);
>   	}
>   
> +end:
>   	return NETDEV_TX_OK;
>   }
>   
> diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
> index 918e03c1dafa..cc05ce3c1aae 100644
> --- a/include/net/lwtunnel.h
> +++ b/include/net/lwtunnel.h
> @@ -5,7 +5,9 @@
>   #include <linux/netdevice.h>
>   #include <linux/skbuff.h>
>   #include <linux/types.h>
> +#include <linux/net_namespace.h>
>   #include <net/route.h>
> +#include <net/ip6_fib.h>
>   
>   #define LWTUNNEL_HASH_BITS   7
>   #define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)
> @@ -141,4 +143,25 @@ static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
>   
>   #endif
>   
> +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
> +{
> +	return (u32 *)lwtstate->data;
> +}
> +
> +static inline int skb_lwt_netns_info(struct sk_buff *skb)
> +{
> +	if (skb->protocol == htons(ETH_P_IP)) {
> +		struct rtable *rt = (struct rtable *)skb_dst(skb);
> +
> +		if (rt && rt->rt_lwtstate)
> +			return *lwt_netns_info(rt->rt_lwtstate);
> +	} else if (skb->protocol == htons(ETH_P_IPV6)) {
> +		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
> +
> +		if (rt6 && rt6->rt6i_lwtstate)
> +			return *lwt_netns_info(rt6->rt6i_lwtstate);
> +	}
> +
> +	return NETNSA_NSID_NOT_ASSIGNED;
> +}
>   #endif /* __NET_LWTUNNEL_H */
Since these APIs don't have to be netns-specific,
can they just be named lwtunnel_get_state_data and skb_lwtunnel_state?

Also, it seems like they should be declared for both CONFIG_LWTUNNEL 'y' and 'n'.

Thanks,
Roopa

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next] route: allow to route in a peer netns via lwt framework
  2015-07-23 15:01 ` roopa
@ 2015-07-23 15:25   ` Nicolas Dichtel
  2015-07-23 15:50     ` roopa
  2015-07-24 14:16     ` [PATCH net-next v2] " Nicolas Dichtel
  0 siblings, 2 replies; 18+ messages in thread
From: Nicolas Dichtel @ 2015-07-23 15:25 UTC (permalink / raw)
  To: roopa; +Cc: davem, netdev, tgraf

Le 23/07/2015 17:01, roopa a écrit :
> On 7/23/15, 7:22 AM, Nicolas Dichtel wrote:
[snip]
>> +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
>> +{
>> +    return (u32 *)lwtstate->data;
>> +}
>> +
>> +static inline int skb_lwt_netns_info(struct sk_buff *skb)
>> +{
>> +    if (skb->protocol == htons(ETH_P_IP)) {
>> +        struct rtable *rt = (struct rtable *)skb_dst(skb);
>> +
>> +        if (rt && rt->rt_lwtstate)
>> +            return *lwt_netns_info(rt->rt_lwtstate);
>> +    } else if (skb->protocol == htons(ETH_P_IPV6)) {
>> +        struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
>> +
>> +        if (rt6 && rt6->rt6i_lwtstate)
>> +            return *lwt_netns_info(rt6->rt6i_lwtstate);
>> +    }
>> +
>> +    return NETNSA_NSID_NOT_ASSIGNED;
>> +}
>>   #endif /* __NET_LWTUNNEL_H */
> since these apis' don't have to be netns specific,
> Can they just be named lwtunnel_get_state_data and skb_lwtunnel_state ?
They are specific to netns because lwtstate->data is interpreted as a u32 *.
But I agree that a test is missing against lwtstate->type to ensure that data
will be a nsid.

>
> and seems like they should be declared for both CONFIG_LWTUNNEL 'y' and 'n'.
It is outside the "#ifdef CONFIG_LWTUNNEL". I can successfully compile with and
without CONFIG_LWTUNNEL.

Thank you,
Nicolas

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next] route: allow to route in a peer netns via lwt framework
  2015-07-23 15:25   ` Nicolas Dichtel
@ 2015-07-23 15:50     ` roopa
  2015-07-24 12:24       ` Nicolas Dichtel
  2015-07-24 14:16     ` [PATCH net-next v2] " Nicolas Dichtel
  1 sibling, 1 reply; 18+ messages in thread
From: roopa @ 2015-07-23 15:50 UTC (permalink / raw)
  To: nicolas.dichtel; +Cc: davem, netdev, tgraf

On 7/23/15, 8:25 AM, Nicolas Dichtel wrote:
> Le 23/07/2015 17:01, roopa a écrit :
>> On 7/23/15, 7:22 AM, Nicolas Dichtel wrote:
> [snip]
>>> +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
>>> +{
>>> +    return (u32 *)lwtstate->data;
>>> +}
>>> +
>>> +static inline int skb_lwt_netns_info(struct sk_buff *skb)
>>> +{
>>> +    if (skb->protocol == htons(ETH_P_IP)) {
>>> +        struct rtable *rt = (struct rtable *)skb_dst(skb);
>>> +
>>> +        if (rt && rt->rt_lwtstate)
>>> +            return *lwt_netns_info(rt->rt_lwtstate);
>>> +    } else if (skb->protocol == htons(ETH_P_IPV6)) {
>>> +        struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
>>> +
>>> +        if (rt6 && rt6->rt6i_lwtstate)
>>> +            return *lwt_netns_info(rt6->rt6i_lwtstate);
>>> +    }
>>> +
>>> +    return NETNSA_NSID_NOT_ASSIGNED;
>>> +}
>>>   #endif /* __NET_LWTUNNEL_H */
>> since these apis' don't have to be netns specific,
>> Can they just be named lwtunnel_get_state_data and skb_lwtunnel_state ?
> They are specific to netns because lwtstate->data is interpreted as an 
> u32 *.
> But I agree that a test is missing against lwtstate->type to ensure 
> that data
> will be a nsid.
>
Oh, OK... the APIs in lwtunnel.h today are not specific to an encap type.
They are generic, so skb_lwtunnel_state(), which returns struct
lwtunnel_state, could go here.
The encap-specific ones can go in the respective callers. Recently
Thomas added a similar
skb_tunnel_info() for IP tunnels. I'd like to have a generic version
of your skb_lwt_netns_info in lwtunnel.h. I could use it in my mpls
output func too.


>>
>> and seems like they should be declared for both CONFIG_LWTUNNEL 'y' 
>> and 'n'.
> It is outside the "#ifdef CONFIG_LWTUNNEL". I can successfully compile 
> with and
> without CONFIG_LWTUNNEL.
ok,

thanks,
Roopa

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next] route: allow to route in a peer netns via lwt framework
  2015-07-23 15:50     ` roopa
@ 2015-07-24 12:24       ` Nicolas Dichtel
  2015-07-24 13:50         ` roopa
  0 siblings, 1 reply; 18+ messages in thread
From: Nicolas Dichtel @ 2015-07-24 12:24 UTC (permalink / raw)
  To: roopa; +Cc: davem, netdev, tgraf

Le 23/07/2015 17:50, roopa a écrit :
> On 7/23/15, 8:25 AM, Nicolas Dichtel wrote:
>> Le 23/07/2015 17:01, roopa a écrit :
>>> On 7/23/15, 7:22 AM, Nicolas Dichtel wrote:
>> [snip]
>>>> +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
>>>> +{
>>>> +    return (u32 *)lwtstate->data;
>>>> +}
>>>> +
>>>> +static inline int skb_lwt_netns_info(struct sk_buff *skb)
>>>> +{
>>>> +    if (skb->protocol == htons(ETH_P_IP)) {
>>>> +        struct rtable *rt = (struct rtable *)skb_dst(skb);
>>>> +
>>>> +        if (rt && rt->rt_lwtstate)
>>>> +            return *lwt_netns_info(rt->rt_lwtstate);
>>>> +    } else if (skb->protocol == htons(ETH_P_IPV6)) {
>>>> +        struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
>>>> +
>>>> +        if (rt6 && rt6->rt6i_lwtstate)
>>>> +            return *lwt_netns_info(rt6->rt6i_lwtstate);
>>>> +    }
>>>> +
>>>> +    return NETNSA_NSID_NOT_ASSIGNED;
>>>> +}
>>>>   #endif /* __NET_LWTUNNEL_H */
>>> since these apis' don't have to be netns specific,
>>> Can they just be named lwtunnel_get_state_data and skb_lwtunnel_state ?
>> They are specific to netns because lwtstate->data is interpreted as an u32 *.
>> But I agree that a test is missing against lwtstate->type to ensure that data
>> will be a nsid.
>>
> o ok..., the api's in lwtunnel.h today are not specific to an encap type.
> they are generic, so skb_lwtunnel_state() which returns struct lwtunnel_state
> could go here.
> the encap specific ones can go in the respective callers. Recently thomas added
> a similar
> skb_tunnel_info() for ip tunnels. I did  like to have a generic version of your
> skb_lwt_netns_info in lwtunnel.h. I could use it in my mpls output func too.
Sure, but my goal was to not create a new .h file just for these two helpers.
It's related to lwtunnel, thus I was thinking they can go here.


Regards,
Nicolas

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next] route: allow to route in a peer netns via lwt framework
  2015-07-24 12:24       ` Nicolas Dichtel
@ 2015-07-24 13:50         ` roopa
  2015-07-24 14:11           ` Nicolas Dichtel
  0 siblings, 1 reply; 18+ messages in thread
From: roopa @ 2015-07-24 13:50 UTC (permalink / raw)
  To: nicolas.dichtel; +Cc: davem, netdev, tgraf

On 7/24/15, 5:24 AM, Nicolas Dichtel wrote:
> Sure, but my goal was to not create a new .h file just for these two 
> helpers.
> It's related to lwtunnel, thus I was thinking they can go here.
ok..., since your lwt namespace functions went into net_namespace.c, I 
was thinking
these should really go into net_namespace.h. Does that work for you ?
If that does not, then yes, they could live here.

Thanks,
Roopa

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next] route: allow to route in a peer netns via lwt framework
  2015-07-24 13:50         ` roopa
@ 2015-07-24 14:11           ` Nicolas Dichtel
  0 siblings, 0 replies; 18+ messages in thread
From: Nicolas Dichtel @ 2015-07-24 14:11 UTC (permalink / raw)
  To: roopa; +Cc: davem, netdev, tgraf

Le 24/07/2015 15:50, roopa a écrit :
> On 7/24/15, 5:24 AM, Nicolas Dichtel wrote:
>> Sure, but my goal was to not create a new .h file just for these two helpers.
>> It's related to lwtunnel, thus I was thinking they can go here.
> ok..., since your lwt namespace functions went into net_namespace.c, I was thinking
> these should really go into net_namespace.h. Does that work for you ?
Not so easy: it's a chicken-and-egg problem. If I add this to
net/net_namespace.h, I need to include net/lwtunnel.h but this file already
includes net/net_namespace.h (included directly or indirectly by most of the
network headers).


Regards,
Nicolas

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH net-next v2] route: allow to route in a peer netns via lwt framework
  2015-07-23 15:25   ` Nicolas Dichtel
  2015-07-23 15:50     ` roopa
@ 2015-07-24 14:16     ` Nicolas Dichtel
  2015-07-24 15:39       ` Eric Dumazet
  1 sibling, 1 reply; 18+ messages in thread
From: Nicolas Dichtel @ 2015-07-24 14:16 UTC (permalink / raw)
  To: davem; +Cc: netdev, roopa, tgraf, Nicolas Dichtel

This patch takes advantage of the newly added lwtunnel framework to
allow the user to set routes that point to a peer netns.

Packets are injected to the peer netns via the loopback device. It works
only when the output device is 'lo'.

Example:
ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---

v2: rework loopback handling part (update stats and call skb_dst_force())
    fix ipv6 processing
    check lwtunnel type before converting data to a nsid

 drivers/net/loopback.c        | 33 +++++++++++++++++++++------
 include/net/lwtunnel.h        | 27 ++++++++++++++++++++++
 include/uapi/linux/lwtunnel.h |  1 +
 net/core/net_namespace.c      | 52 +++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/route.c              |  9 ++++++--
 5 files changed, 113 insertions(+), 9 deletions(-)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index c76283c2f84a..4358256ff94e 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -57,6 +57,7 @@
 #include <linux/percpu.h>
 #include <net/net_namespace.h>
 #include <linux/u64_stats_sync.h>
+#include <net/lwtunnel.h>
 
 struct pcpu_lstats {
 	u64			packets;
@@ -71,29 +72,47 @@ struct pcpu_lstats {
 static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 				 struct net_device *dev)
 {
+	int nsid = skb_lwt_netns_info(skb);
 	struct pcpu_lstats *lb_stats;
-	int len;
-
-	skb_orphan(skb);
+	struct net *peernet = NULL;
+	int len, ret;
 
 	/* Before queueing this packet to netif_rx(),
 	 * make sure dst is refcounted.
 	 */
 	skb_dst_force(skb);
 
-	skb->protocol = eth_type_trans(skb, dev);
+	if (nsid != NETNSA_NSID_NOT_ASSIGNED) {
+		peernet = get_net_ns_by_id(dev_net(dev), nsid);
+		if (!peernet) {
+			kfree_skb(skb);
+			goto end;
+		}
+
+		/* it's OK to use per_cpu_ptr() because BHs are off */
+		lb_stats = this_cpu_ptr(peernet->loopback_dev->lstats);
+		ret = dev_forward_skb(peernet->loopback_dev, skb);
+	} else {
+		skb_orphan(skb);
 
-	/* it's OK to use per_cpu_ptr() because BHs are off */
-	lb_stats = this_cpu_ptr(dev->lstats);
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* it's OK to use per_cpu_ptr() because BHs are off */
+		lb_stats = this_cpu_ptr(dev->lstats);
+		ret = netif_rx(skb);
+	}
 
 	len = skb->len;
-	if (likely(netif_rx(skb) == NET_RX_SUCCESS)) {
+	if (likely(ret == NET_RX_SUCCESS)) {
 		u64_stats_update_begin(&lb_stats->syncp);
 		lb_stats->bytes += len;
 		lb_stats->packets++;
 		u64_stats_update_end(&lb_stats->syncp);
 	}
 
+end:
+	if (peernet)
+		put_net(peernet);
 	return NETDEV_TX_OK;
 }
 
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index b02039081b04..78376da1afa2 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -5,7 +5,9 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/types.h>
+#include <linux/net_namespace.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 
 #define LWTUNNEL_HASH_BITS   7
 #define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)
@@ -147,4 +149,29 @@ static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
 
 #endif
 
+static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
+{
+	return (u32 *)lwtstate->data;
+}
+
+static inline int skb_lwt_netns_info(struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct rtable *rt = (struct rtable *)skb_dst(skb);
+
+		if (rt &&
+		    rt->rt_lwtstate &&
+		    rt->rt_lwtstate->type & LWTUNNEL_ENCAP_NETNS)
+			return *lwt_netns_info(rt->rt_lwtstate);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+		if (rt6 &&
+		    rt6->rt6i_lwtstate &&
+		    rt6->rt6i_lwtstate->type & LWTUNNEL_ENCAP_NETNS)
+			return *lwt_netns_info(rt6->rt6i_lwtstate);
+	}
+
+	return NETNSA_NSID_NOT_ASSIGNED;
+}
 #endif /* __NET_LWTUNNEL_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 31377bbea3f8..6715e7a1b335 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -7,6 +7,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_NONE,
 	LWTUNNEL_ENCAP_MPLS,
 	LWTUNNEL_ENCAP_IP,
+	LWTUNNEL_ENCAP_NETNS,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..c1267aac373d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -20,6 +20,7 @@
 #include <net/netlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/lwtunnel.h>
 
 /*
  *	Our network namespace constructor/destructor lists
@@ -725,6 +726,56 @@ out:
 	rtnl_set_sk_err(net, RTNLGRP_NSID, err);
 }
 
+static int lwt_netns_build_state(struct net_device *dev, struct nlattr *nla,
+				 struct lwtunnel_state **ts)
+{
+	struct nlattr *tb[NETNSA_MAX + 1];
+	struct lwtunnel_state *newts;
+	int *nsid;
+	int ret;
+
+	ret = nla_parse_nested(tb, NETNSA_MAX, nla, rtnl_net_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[NETNSA_NSID])
+		return -EINVAL;
+
+	newts = lwtunnel_state_alloc(sizeof(*nsid));
+	if (!newts)
+		return -ENOMEM;
+
+	newts->len = sizeof(*nsid);
+	nsid = lwt_netns_info(newts);
+	*nsid = nla_get_s32(tb[NETNSA_NSID]);
+	newts->type = LWTUNNEL_ENCAP_NETNS;
+
+	*ts = newts;
+	return 0;
+}
+
+static int lwt_netns_fill_encap_info(struct sk_buff *skb,
+				     struct lwtunnel_state *lwtstate)
+{
+	int *nsid = lwt_netns_info(lwtstate);
+
+	if (nla_put_s32(skb, NETNSA_NSID, *nsid))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int lwt_netns_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	return nla_total_size(4);	/* NETNSA_NSID */
+}
+
+static const struct lwtunnel_encap_ops lwt_netns_ops = {
+	.build_state = lwt_netns_build_state,
+	.fill_encap = lwt_netns_fill_encap_info,
+	.get_encap_size = lwt_netns_encap_nlsize,
+};
+
 static int __init net_ns_init(void)
 {
 	struct net_generic *ng;
@@ -762,6 +813,7 @@ static int __init net_ns_init(void)
 	rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
 		      NULL);
 
+	lwtunnel_encap_add_ops(&lwt_netns_ops, LWTUNNEL_ENCAP_NETNS);
 	return 0;
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c9b2b9fe83fc..894cb18cd8ca 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1796,10 +1796,13 @@ int ip6_route_add(struct fib6_config *cfg)
 	rt->rt6i_metric = cfg->fc_metric;
 
 	/* We cannot add true routes via loopback here,
-	   they would result in kernel looping; promote them to reject routes
+	 * they would result in kernel looping; promote them to reject routes.
+	 * Exception: routes that point to a peer netns.
 	 */
 	if ((cfg->fc_flags & RTF_REJECT) ||
 	    (dev && (dev->flags & IFF_LOOPBACK) &&
+	     (!rt->rt6i_lwtstate ||
+	      rt->rt6i_lwtstate->type != LWTUNNEL_ENCAP_NETNS) &&
 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
 	     !(cfg->fc_flags & RTF_LOCAL))) {
 		/* hold loopback dev/idev if we haven't done so. */
@@ -2880,7 +2883,9 @@ static int rt6_fill_node(struct net *net,
 	}
 	else if (rt->rt6i_flags & RTF_LOCAL)
 		rtm->rtm_type = RTN_LOCAL;
-	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
+	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK) &&
+		(!rt->rt6i_lwtstate ||
+		 rt->rt6i_lwtstate->type != LWTUNNEL_ENCAP_NETNS))
 		rtm->rtm_type = RTN_LOCAL;
 	else
 		rtm->rtm_type = RTN_UNICAST;
-- 
2.4.2

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next] route: allow to route in a peer netns via lwt framework
  2015-07-23 14:22 [PATCH net-next] route: allow to route in a peer netns via lwt framework Nicolas Dichtel
  2015-07-23 15:01 ` roopa
@ 2015-07-24 14:28 ` David Ahern
  2015-07-24 14:32   ` Nicolas Dichtel
  1 sibling, 1 reply; 18+ messages in thread
From: David Ahern @ 2015-07-24 14:28 UTC (permalink / raw)
  To: Nicolas Dichtel, davem; +Cc: netdev, roopa, tgraf

On 7/23/15 8:22 AM, Nicolas Dichtel wrote:
>   static netdev_tx_t loopback_xmit(struct sk_buff *skb,
>   				 struct net_device *dev)
>   {
> +	int nsid = skb_lwt_netns_info(skb);
>   	struct pcpu_lstats *lb_stats;
>   	int len;
>
> +	if (nsid >= 0) {
> +		struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid);
> +
> +		if (!peernet) {

If nsid is > 0 then the peer namespace should exist right? So for this 
failure path why not increment tx_error stat?


> +			kfree_skb(skb);
> +			goto end;
> +		}
> +
> +		dev_forward_skb(peernet->loopback_dev, skb);
> +		put_net(peernet);
> +		goto end;
> +	}
> +
>   	skb_orphan(skb);
>
>   	/* Before queueing this packet to netif_rx(),

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next] route: allow to route in a peer netns via lwt framework
  2015-07-24 14:28 ` [PATCH net-next] " David Ahern
@ 2015-07-24 14:32   ` Nicolas Dichtel
  2015-07-24 15:19     ` David Ahern
  0 siblings, 1 reply; 18+ messages in thread
From: Nicolas Dichtel @ 2015-07-24 14:32 UTC (permalink / raw)
  To: David Ahern, davem; +Cc: netdev, roopa, tgraf

Le 24/07/2015 16:28, David Ahern a écrit :
> On 7/23/15 8:22 AM, Nicolas Dichtel wrote:
>>   static netdev_tx_t loopback_xmit(struct sk_buff *skb,
>>                    struct net_device *dev)
>>   {
>> +    int nsid = skb_lwt_netns_info(skb);
>>       struct pcpu_lstats *lb_stats;
>>       int len;
>>
>> +    if (nsid >= 0) {
>> +        struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid);
>> +
>> +        if (!peernet) {
>
> If nsid is > 0 then the peer namespace should exist right? So for this failure
> path why not increment tx_error stat?
I was not sure about that, because before my patch we increment statistics only
in case of NET_RX_SUCCESS.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next] route: allow to route in a peer netns via lwt framework
  2015-07-24 14:32   ` Nicolas Dichtel
@ 2015-07-24 15:19     ` David Ahern
  2015-07-27 20:07       ` Nicolas Dichtel
  0 siblings, 1 reply; 18+ messages in thread
From: David Ahern @ 2015-07-24 15:19 UTC (permalink / raw)
  To: nicolas.dichtel, davem; +Cc: netdev, roopa, tgraf

On 7/24/15 8:32 AM, Nicolas Dichtel wrote:
> Le 24/07/2015 16:28, David Ahern a écrit :
>> On 7/23/15 8:22 AM, Nicolas Dichtel wrote:
>>>   static netdev_tx_t loopback_xmit(struct sk_buff *skb,
>>>                    struct net_device *dev)
>>>   {
>>> +    int nsid = skb_lwt_netns_info(skb);
>>>       struct pcpu_lstats *lb_stats;
>>>       int len;
>>>
>>> +    if (nsid >= 0) {
>>> +        struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid);
>>> +
>>> +        if (!peernet) {
>>
>> If nsid is > 0 then the peer namespace should exist right? So for this
>> failure
>> path why not increment tx_error stat?
> I was not sure about that, because before my patch we increment
> statistics only
> in case of NET_RX_SUCCESS.

In this case you are knowingly dropping packets. Would be nice to have a 
counter showing that.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next v2] route: allow to route in a peer netns via lwt framework
  2015-07-24 14:16     ` [PATCH net-next v2] " Nicolas Dichtel
@ 2015-07-24 15:39       ` Eric Dumazet
  2015-07-24 16:27         ` Alexei Starovoitov
  2015-07-27 19:56         ` Nicolas Dichtel
  0 siblings, 2 replies; 18+ messages in thread
From: Eric Dumazet @ 2015-07-24 15:39 UTC (permalink / raw)
  To: Nicolas Dichtel; +Cc: davem, netdev, roopa, tgraf


On Fri, 2015-07-24 at 16:16 +0200, Nicolas Dichtel wrote:
> This patch takes advantage of the newly added lwtunnel framework to
> allow the user to set routes that point to a peer netns.
> 
> Packets are injected to the peer netns via the loopback device. It works
> only when the output device is 'lo'.
> 
> Example:
> ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo
> 

Is this feature so badly wanted to add complexity on lo device ?

> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
> ---
> 
> v2: rework loopback handling part (update stats and call skb_dst_force())
>     fix ipv6 processing
>     check lwtunnel type before converting data to a nsid
> 
>  drivers/net/loopback.c        | 33 +++++++++++++++++++++------
>  include/net/lwtunnel.h        | 27 ++++++++++++++++++++++
>  include/uapi/linux/lwtunnel.h |  1 +
>  net/core/net_namespace.c      | 52 +++++++++++++++++++++++++++++++++++++++++++
>  net/ipv6/route.c              |  9 ++++++--
>  5 files changed, 113 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
> index c76283c2f84a..4358256ff94e 100644
> --- a/drivers/net/loopback.c
> +++ b/drivers/net/loopback.c
> @@ -57,6 +57,7 @@
>  #include <linux/percpu.h>
>  #include <net/net_namespace.h>
>  #include <linux/u64_stats_sync.h>
> +#include <net/lwtunnel.h>
>  
>  struct pcpu_lstats {
>  	u64			packets;
> @@ -71,29 +72,47 @@ struct pcpu_lstats {
>  static netdev_tx_t loopback_xmit(struct sk_buff *skb,
>  				 struct net_device *dev)
>  {
> +	int nsid = skb_lwt_netns_info(skb);
>  	struct pcpu_lstats *lb_stats;
> -	int len;
> -
> -	skb_orphan(skb);
> +	struct net *peernet = NULL;
> +	int len, ret;
>  
>  	/* Before queueing this packet to netif_rx(),
>  	 * make sure dst is refcounted.
>  	 */
>  	skb_dst_force(skb);
>  
> -	skb->protocol = eth_type_trans(skb, dev);
> +	if (nsid != NETNSA_NSID_NOT_ASSIGNED) {
> +		peernet = get_net_ns_by_id(dev_net(dev), nsid);
> +		if (!peernet) {
> +			kfree_skb(skb);
> +			goto end;
> +		}
> +
> +		/* it's OK to use per_cpu_ptr() because BHs are off */
> +		lb_stats = this_cpu_ptr(peernet->loopback_dev->lstats);
> +		ret = dev_forward_skb(peernet->loopback_dev, skb);
> +	} else {
> +		skb_orphan(skb);
>  
> -	/* it's OK to use per_cpu_ptr() because BHs are off */
> -	lb_stats = this_cpu_ptr(dev->lstats);
> +		skb->protocol = eth_type_trans(skb, dev);
> +
> +		/* it's OK to use per_cpu_ptr() because BHs are off */
> +		lb_stats = this_cpu_ptr(dev->lstats);
> +		ret = netif_rx(skb);
> +	}
>  
>  	len = skb->len;

<use after free error>  At this point you no longer can access skb

> -	if (likely(netif_rx(skb) == NET_RX_SUCCESS)) {
> +	if (likely(ret == NET_RX_SUCCESS)) {
>  		u64_stats_update_begin(&lb_stats->syncp);
>  		lb_stats->bytes += len;
>  		lb_stats->packets++;
>  		u64_stats_update_end(&lb_stats->syncp);
>  	}
>  
> +end:
> +	if (peernet)
> +		put_net(peernet);
>  	return NETDEV_TX_OK;
>  }
>  

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next v2] route: allow to route in a peer netns via lwt framework
  2015-07-24 15:39       ` Eric Dumazet
@ 2015-07-24 16:27         ` Alexei Starovoitov
  2015-07-27 19:56         ` Nicolas Dichtel
  1 sibling, 0 replies; 18+ messages in thread
From: Alexei Starovoitov @ 2015-07-24 16:27 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Nicolas Dichtel, davem, netdev, roopa, tgraf

On Fri, Jul 24, 2015 at 05:39:57PM +0200, Eric Dumazet wrote:
> 
> On Fri, 2015-07-24 at 16:16 +0200, Nicolas Dichtel wrote:
> > This patch takes advantage of the newly added lwtunnel framework to
> > allow the user to set routes that point to a peer netns.
> > 
> > Packets are injected to the peer netns via the loopback device. It works
> > only when the output device is 'lo'.
> > 
> > Example:
> > ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo
> > 
> 
> Is this feature so badly wanted to add complexity on lo device ?
...
> >  static netdev_tx_t loopback_xmit(struct sk_buff *skb,
> >  				 struct net_device *dev)
...
> > +	if (nsid != NETNSA_NSID_NOT_ASSIGNED) {
> > +		peernet = get_net_ns_by_id(dev_net(dev), nsid);
> > +		if (!peernet) {
> > +			kfree_skb(skb);
> > +			goto end;
> > +		}
> > +
> > +		/* it's OK to use per_cpu_ptr() because BHs are off */
> > +		lb_stats = this_cpu_ptr(peernet->loopback_dev->lstats);
> > +		ret = dev_forward_skb(peernet->loopback_dev, skb);

I have the same concern as Eric.
Using loopback for this looks wrong.
A netns is supposed to look like a host, but I cannot imagine a host
without NICs seeing packets on loopback from another world.
Then how is the opposite direction supposed to work?
The netns will set up a route to send packets to the loopback of the host?!
The idea of using routing to forward packets to namespaces is great,
but I think we need something else instead of loopback.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next v2] route: allow to route in a peer netns via lwt framework
  2015-07-24 15:39       ` Eric Dumazet
  2015-07-24 16:27         ` Alexei Starovoitov
@ 2015-07-27 19:56         ` Nicolas Dichtel
  2015-07-29 13:16           ` [PATCH net-next v3] " Nicolas Dichtel
  1 sibling, 1 reply; 18+ messages in thread
From: Nicolas Dichtel @ 2015-07-27 19:56 UTC (permalink / raw)
  To: Eric Dumazet, Alexei Starovoitov; +Cc: davem, netdev, roopa, tgraf

Le 24/07/2015 17:39, Eric Dumazet a écrit :
>
> On Fri, 2015-07-24 at 16:16 +0200, Nicolas Dichtel wrote:
>> This patch takes advantage of the newly added lwtunnel framework to
>> allow the user to set routes that point to a peer netns.
>>
>> Packets are injected to the peer netns via the loopback device. It works
>> only when the output device is 'lo'.
>>
>> Example:
>> ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo
>>
>
> Is this feature so badly wanted to add complexity on lo device ?
>
The goal is to be scalable when the number of netns is high (10k or more).
With this patch, we can save two interfaces (veth) per netns, which helps to
reduce memory consumption and the time needed to create a netns.

[snip]
>> +	if (nsid != NETNSA_NSID_NOT_ASSIGNED) {
>> +		peernet = get_net_ns_by_id(dev_net(dev), nsid);
>> +		if (!peernet) {
>> +			kfree_skb(skb);
>> +			goto end;
>> +		}
>> +
>> +		/* it's OK to use per_cpu_ptr() because BHs are off */
>> +		lb_stats = this_cpu_ptr(peernet->loopback_dev->lstats);
>> +		ret = dev_forward_skb(peernet->loopback_dev, skb);
>> +	} else {
>> +		skb_orphan(skb);
>>
>> -	/* it's OK to use per_cpu_ptr() because BHs are off */
>> -	lb_stats = this_cpu_ptr(dev->lstats);
>> +		skb->protocol = eth_type_trans(skb, dev);
>> +
>> +		/* it's OK to use per_cpu_ptr() because BHs are off */
>> +		lb_stats = this_cpu_ptr(dev->lstats);
>> +		ret = netif_rx(skb);
>> +	}
>>
>>   	len = skb->len;
>
> <use after free error>  At this point you no longer can access skb
Right, will fix it.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next] route: allow to route in a peer netns via lwt framework
  2015-07-24 15:19     ` David Ahern
@ 2015-07-27 20:07       ` Nicolas Dichtel
  0 siblings, 0 replies; 18+ messages in thread
From: Nicolas Dichtel @ 2015-07-27 20:07 UTC (permalink / raw)
  To: David Ahern, davem; +Cc: netdev, roopa, tgraf

Le 24/07/2015 17:19, David Ahern a écrit :
> In this case you are knowingly dropping packets. Would be nice to have a counter
> showing that.
Ok.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH net-next v3] route: allow to route in a peer netns via lwt framework
  2015-07-27 19:56         ` Nicolas Dichtel
@ 2015-07-29 13:16           ` Nicolas Dichtel
  2015-07-29 15:20             ` Eric Dumazet
  0 siblings, 1 reply; 18+ messages in thread
From: Nicolas Dichtel @ 2015-07-29 13:16 UTC (permalink / raw)
  To: davem
  Cc: netdev, roopa, tgraf, eric.dumazet, alexei.starovoitov, Nicolas Dichtel

This patch takes advantage of the newly added lwtunnel framework to
allow the user to set routes that point to a peer netns.

Packets are injected to the peer netns via the loopback device. It works
only when the output device is 'lo'.

Example:
ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo

The goal is to be scalable when the number of netns is high (10k or more).
With this patch, we can save two interfaces (veth) per netns, which helps
to reduce memory consumption and the time needed to create a netns.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---

v3: fix skb use after free in loopback_xmit()
    inc err stats if unable to find the peer netns
    fix a checkpatch style report

v2: rework loopback handling part (update stats and call skb_dst_force())
    fix ipv6 processing
    check lwtunnel type before converting data to a nsid

 drivers/net/loopback.c        | 40 +++++++++++++++++++++++++--------
 include/net/lwtunnel.h        | 27 ++++++++++++++++++++++
 include/uapi/linux/lwtunnel.h |  1 +
 net/core/net_namespace.c      | 52 +++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/route.c              |  9 ++++++--
 5 files changed, 118 insertions(+), 11 deletions(-)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index c76283c2f84a..1b83efcbfbb3 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -57,6 +57,7 @@
 #include <linux/percpu.h>
 #include <net/net_namespace.h>
 #include <linux/u64_stats_sync.h>
+#include <net/lwtunnel.h>
 
 struct pcpu_lstats {
 	u64			packets;
@@ -71,29 +72,49 @@ struct pcpu_lstats {
 static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 				 struct net_device *dev)
 {
+	int nsid = skb_lwt_netns_info(skb);
 	struct pcpu_lstats *lb_stats;
-	int len;
-
-	skb_orphan(skb);
+	struct net *peernet = NULL;
+	int len, ret;
 
 	/* Before queueing this packet to netif_rx(),
 	 * make sure dst is refcounted.
 	 */
 	skb_dst_force(skb);
 
-	skb->protocol = eth_type_trans(skb, dev);
-
-	/* it's OK to use per_cpu_ptr() because BHs are off */
-	lb_stats = this_cpu_ptr(dev->lstats);
+	if (nsid != NETNSA_NSID_NOT_ASSIGNED) {
+		peernet = get_net_ns_by_id(dev_net(dev), nsid);
+		if (!peernet) {
+			dev->stats.tx_errors++;
+			kfree_skb(skb);
+			goto end;
+		}
+
+		/* it's OK to use per_cpu_ptr() because BHs are off */
+		lb_stats = this_cpu_ptr(peernet->loopback_dev->lstats);
+		len = skb->len;
+		ret = dev_forward_skb(peernet->loopback_dev, skb);
+	} else {
+		skb_orphan(skb);
+
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* it's OK to use per_cpu_ptr() because BHs are off */
+		lb_stats = this_cpu_ptr(dev->lstats);
+		len = skb->len;
+		ret = netif_rx(skb);
+	}
 
-	len = skb->len;
-	if (likely(netif_rx(skb) == NET_RX_SUCCESS)) {
+	if (likely(ret == NET_RX_SUCCESS)) {
 		u64_stats_update_begin(&lb_stats->syncp);
 		lb_stats->bytes += len;
 		lb_stats->packets++;
 		u64_stats_update_end(&lb_stats->syncp);
 	}
 
+end:
+	if (peernet)
+		put_net(peernet);
 	return NETDEV_TX_OK;
 }
 
@@ -122,6 +143,7 @@ static struct rtnl_link_stats64 *loopback_get_stats64(struct net_device *dev,
 	stats->tx_packets = packets;
 	stats->rx_bytes   = bytes;
 	stats->tx_bytes   = bytes;
+	stats->tx_errors  = dev->stats.tx_errors;
 	return stats;
 }
 
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index b02039081b04..78376da1afa2 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -5,7 +5,9 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/types.h>
+#include <linux/net_namespace.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 
 #define LWTUNNEL_HASH_BITS   7
 #define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)
@@ -147,4 +149,29 @@ static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
 
 #endif
 
+static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
+{
+	return (u32 *)lwtstate->data;
+}
+
+static inline int skb_lwt_netns_info(struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct rtable *rt = (struct rtable *)skb_dst(skb);
+
+		if (rt &&
+		    rt->rt_lwtstate &&
+		    rt->rt_lwtstate->type & LWTUNNEL_ENCAP_NETNS)
+			return *lwt_netns_info(rt->rt_lwtstate);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+		if (rt6 &&
+		    rt6->rt6i_lwtstate &&
+		    rt6->rt6i_lwtstate->type & LWTUNNEL_ENCAP_NETNS)
+			return *lwt_netns_info(rt6->rt6i_lwtstate);
+	}
+
+	return NETNSA_NSID_NOT_ASSIGNED;
+}
 #endif /* __NET_LWTUNNEL_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 31377bbea3f8..6715e7a1b335 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -7,6 +7,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_NONE,
 	LWTUNNEL_ENCAP_MPLS,
 	LWTUNNEL_ENCAP_IP,
+	LWTUNNEL_ENCAP_NETNS,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..c1267aac373d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -20,6 +20,7 @@
 #include <net/netlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/lwtunnel.h>
 
 /*
  *	Our network namespace constructor/destructor lists
@@ -725,6 +726,56 @@ out:
 	rtnl_set_sk_err(net, RTNLGRP_NSID, err);
 }
 
+static int lwt_netns_build_state(struct net_device *dev, struct nlattr *nla,
+				 struct lwtunnel_state **ts)
+{
+	struct nlattr *tb[NETNSA_MAX + 1];
+	struct lwtunnel_state *newts;
+	int *nsid;
+	int ret;
+
+	ret = nla_parse_nested(tb, NETNSA_MAX, nla, rtnl_net_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[NETNSA_NSID])
+		return -EINVAL;
+
+	newts = lwtunnel_state_alloc(sizeof(*nsid));
+	if (!newts)
+		return -ENOMEM;
+
+	newts->len = sizeof(*nsid);
+	nsid = lwt_netns_info(newts);
+	*nsid = nla_get_s32(tb[NETNSA_NSID]);
+	newts->type = LWTUNNEL_ENCAP_NETNS;
+
+	*ts = newts;
+	return 0;
+}
+
+static int lwt_netns_fill_encap_info(struct sk_buff *skb,
+				     struct lwtunnel_state *lwtstate)
+{
+	int *nsid = lwt_netns_info(lwtstate);
+
+	if (nla_put_s32(skb, NETNSA_NSID, *nsid))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int lwt_netns_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	return nla_total_size(4);	/* NETNSA_NSID */
+}
+
+static const struct lwtunnel_encap_ops lwt_netns_ops = {
+	.build_state = lwt_netns_build_state,
+	.fill_encap = lwt_netns_fill_encap_info,
+	.get_encap_size = lwt_netns_encap_nlsize,
+};
+
 static int __init net_ns_init(void)
 {
 	struct net_generic *ng;
@@ -762,6 +813,7 @@ static int __init net_ns_init(void)
 	rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
 		      NULL);
 
+	lwtunnel_encap_add_ops(&lwt_netns_ops, LWTUNNEL_ENCAP_NETNS);
 	return 0;
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 54fccf0d705d..6e77d4b1380d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1796,10 +1796,13 @@ int ip6_route_add(struct fib6_config *cfg)
 	rt->rt6i_metric = cfg->fc_metric;
 
 	/* We cannot add true routes via loopback here,
-	   they would result in kernel looping; promote them to reject routes
+	 * they would result in kernel looping; promote them to reject routes.
+	 * Exception: routes that point to a peer netns.
 	 */
 	if ((cfg->fc_flags & RTF_REJECT) ||
 	    (dev && (dev->flags & IFF_LOOPBACK) &&
+	     (!rt->rt6i_lwtstate ||
+	      rt->rt6i_lwtstate->type != LWTUNNEL_ENCAP_NETNS) &&
 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
 	     !(cfg->fc_flags & RTF_LOCAL))) {
 		/* hold loopback dev/idev if we haven't done so. */
@@ -2880,7 +2883,9 @@ static int rt6_fill_node(struct net *net,
 	}
 	else if (rt->rt6i_flags & RTF_LOCAL)
 		rtm->rtm_type = RTN_LOCAL;
-	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
+	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK) &&
+		 (!rt->rt6i_lwtstate ||
+		  rt->rt6i_lwtstate->type != LWTUNNEL_ENCAP_NETNS))
 		rtm->rtm_type = RTN_LOCAL;
 	else
 		rtm->rtm_type = RTN_UNICAST;
-- 
2.4.2

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next v3] route: allow to route in a peer netns via lwt framework
  2015-07-29 13:16           ` [PATCH net-next v3] " Nicolas Dichtel
@ 2015-07-29 15:20             ` Eric Dumazet
  2015-07-29 21:17               ` Nicolas Dichtel
  0 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2015-07-29 15:20 UTC (permalink / raw)
  To: Nicolas Dichtel; +Cc: davem, netdev, roopa, tgraf, alexei.starovoitov

On Wed, 2015-07-29 at 15:16 +0200, Nicolas Dichtel wrote:
> This patch takes advantage of the newly added lwtunnel framework to
> allow the user to set routes that point to a peer netns.
> 
> Packets are injected to the peer netns via the loopback device. It works
> only when the output device is 'lo'.
> 
> Example:
> ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo
> 
> The goal is to be scalable when the number of netns is high (10k or more).
> With this patch, we can save two interfaces (veth) per netns, which helps
> to reduce memory consumption and the time needed to create a netns.


Really this is a hack Nicolas.

get_net_ns_by_id() was not meant to be used in data (fast ???) path.

Same for get_net() and put_net()

Plumbing like that should not happen in lo start_xmit()

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH net-next v3] route: allow to route in a peer netns via lwt framework
  2015-07-29 15:20             ` Eric Dumazet
@ 2015-07-29 21:17               ` Nicolas Dichtel
  0 siblings, 0 replies; 18+ messages in thread
From: Nicolas Dichtel @ 2015-07-29 21:17 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, netdev, roopa, tgraf, alexei.starovoitov

Le 29/07/2015 17:20, Eric Dumazet a écrit :
> On Wed, 2015-07-29 at 15:16 +0200, Nicolas Dichtel wrote:
>> This patch takes advantage of the newly added lwtunnel framework to
>> allow the user to set routes that point to a peer netns.
>>
>> Packets are injected to the peer netns via the loopback device. It works
>> only when the output device is 'lo'.
>>
>> Example:
>> ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo
>>
>> The goal is to be scalable when the number of netns is high (10k or more).
>> With this patch, we can save two interfaces (veth) per netns, which helps
>> to reduce memory consumption and the time needed to create a netns.
>
>
> Really this is a hack Nicolas.
>
> get_net_ns_by_id() was not meant to be used in data (fast ???) path.
>
> Same for get_net() and put_net()
>
> Plumbing like that should not happen in lo start_xmit()
Yes, I think you're right. I was a bit too enthusiastic about this new framework.
Do you think it would be acceptable if the netns was directly referenced
instead of a nsid?

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2015-07-29 21:17 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-07-23 14:22 [PATCH net-next] route: allow to route in a peer netns via lwt framework Nicolas Dichtel
2015-07-23 15:01 ` roopa
2015-07-23 15:25   ` Nicolas Dichtel
2015-07-23 15:50     ` roopa
2015-07-24 12:24       ` Nicolas Dichtel
2015-07-24 13:50         ` roopa
2015-07-24 14:11           ` Nicolas Dichtel
2015-07-24 14:16     ` [PATCH net-next v2] " Nicolas Dichtel
2015-07-24 15:39       ` Eric Dumazet
2015-07-24 16:27         ` Alexei Starovoitov
2015-07-27 19:56         ` Nicolas Dichtel
2015-07-29 13:16           ` [PATCH net-next v3] " Nicolas Dichtel
2015-07-29 15:20             ` Eric Dumazet
2015-07-29 21:17               ` Nicolas Dichtel
2015-07-24 14:28 ` [PATCH net-next] " David Ahern
2015-07-24 14:32   ` Nicolas Dichtel
2015-07-24 15:19     ` David Ahern
2015-07-27 20:07       ` Nicolas Dichtel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).