All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
@ 2019-03-22 13:06 David Ahern
  2019-03-22 16:14 ` Alexei Starovoitov
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: David Ahern @ 2019-03-22 13:06 UTC (permalink / raw)
  To: davem; +Cc: netdev, David Ahern

From: David Ahern <dsahern@gmail.com>

The number of stubs is growing and has nothing to do with addrconf.
Move the definition of the stubs to a separate header file and update
users. In the move, drop the vxlan specific comment before ipv6_stub.

Code move only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 drivers/infiniband/core/addr.c                  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c |  1 +
 drivers/net/geneve.c                            |  1 +
 drivers/net/usb/cdc_mbim.c                      |  1 +
 drivers/net/vxlan.c                             |  1 +
 include/net/addrconf.h                          | 47 ------------------
 include/net/ipv6_stubs.h                        | 63 +++++++++++++++++++++++++
 include/net/udp_tunnel.h                        |  2 +-
 net/bridge/br_arp_nd_proxy.c                    |  1 +
 net/core/filter.c                               |  1 +
 net/core/lwt_bpf.c                              |  1 +
 net/ipv6/addrconf_core.c                        |  2 +-
 net/ipv6/af_inet6.c                             |  1 +
 net/mpls/af_mpls.c                              |  2 +-
 net/tipc/udp_media.c                            |  2 +-
 15 files changed, 76 insertions(+), 52 deletions(-)
 create mode 100644 include/net/ipv6_stubs.h

diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 0dce94e3c495..2649e0f2ff65 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -42,7 +42,7 @@
 #include <net/neighbour.h>
 #include <net/route.h>
 #include <net/netevent.h>
-#include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
 #include <net/ip6_route.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_sa.h>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index b4967a0ff8c7..6f2852c7365c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -44,6 +44,7 @@
 #include <net/tc_act/tc_pedit.h>
 #include <net/tc_act/tc_csum.h>
 #include <net/arp.h>
+#include <net/ipv6_stubs.h>
 #include "en.h"
 #include "en_rep.h"
 #include "en_tc.h"
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 5583d993480d..384d2dbca324 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/etherdevice.h>
 #include <linux/hash.h>
+#include <net/ipv6_stubs.h>
 #include <net/dst_metadata.h>
 #include <net/gro_cells.h>
 #include <net/rtnetlink.h>
diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c
index 0362acd5cdca..28321aca48fe 100644
--- a/drivers/net/usb/cdc_mbim.c
+++ b/drivers/net/usb/cdc_mbim.c
@@ -23,6 +23,7 @@
 #include <linux/usb/cdc_ncm.h>
 #include <net/ipv6.h>
 #include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
 
 /* alternative VLAN for IP session 0 if not untagged */
 #define MBIM_IPS0_VID	4094
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index d76dfed8d9bb..5994d5415a03 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -20,6 +20,7 @@
 #include <linux/ethtool.h>
 #include <net/arp.h>
 #include <net/ndisc.h>
+#include <net/ipv6_stubs.h>
 #include <net/ip.h>
 #include <net/icmp.h>
 #include <net/rtnetlink.h>
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 269ec27385e9..e4997bbd93b2 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -238,53 +238,6 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
 
 void ipv6_mc_dad_complete(struct inet6_dev *idev);
 
-/* A stub used by vxlan module. This is ugly, ideally these
- * symbols should be built into the core kernel.
- */
-struct ipv6_stub {
-	int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex,
-				 const struct in6_addr *addr);
-	int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex,
-				 const struct in6_addr *addr);
-	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
-			       struct dst_entry **dst, struct flowi6 *fl6);
-	int (*ipv6_route_input)(struct sk_buff *skb);
-
-	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
-	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
-					 struct flowi6 *fl6, int flags);
-	struct fib6_info *(*fib6_table_lookup)(struct net *net,
-					      struct fib6_table *table,
-					      int oif, struct flowi6 *fl6,
-					      int flags);
-	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
-						   struct fib6_info *f6i,
-						   struct flowi6 *fl6, int oif,
-						   const struct sk_buff *skb,
-						   int strict);
-	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
-				 struct in6_addr *saddr);
-
-	void (*udpv6_encap_enable)(void);
-	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
-			      const struct in6_addr *solicited_addr,
-			      bool router, bool solicited, bool override, bool inc_opt);
-	struct neigh_table *nd_tbl;
-};
-extern const struct ipv6_stub *ipv6_stub __read_mostly;
-
-/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */
-struct ipv6_bpf_stub {
-	int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len,
-			  bool force_bind_address_no_port, bool with_lock);
-	struct sock *(*udp6_lib_lookup)(struct net *net,
-					const struct in6_addr *saddr, __be16 sport,
-					const struct in6_addr *daddr, __be16 dport,
-					int dif, int sdif, struct udp_table *tbl,
-					struct sk_buff *skb);
-};
-extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
-
 /*
  * identify MLD packets for MLD filter exceptions
  */
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
new file mode 100644
index 000000000000..d8d9c0b0e8c0
--- /dev/null
+++ b/include/net/ipv6_stubs.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IPV6_STUBS_H
+#define _IPV6_STUBS_H
+
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/neighbour.h>
+#include <net/sock.h>
+
+/* structs from net/ip6_fib.h */
+struct fib6_info;
+
+/* This is ugly, ideally these symbols should be built
+ * into the core kernel.
+ */
+struct ipv6_stub {
+	int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex,
+				 const struct in6_addr *addr);
+	int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex,
+				 const struct in6_addr *addr);
+	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
+			       struct dst_entry **dst, struct flowi6 *fl6);
+	int (*ipv6_route_input)(struct sk_buff *skb);
+
+	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
+	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
+					 struct flowi6 *fl6, int flags);
+	struct fib6_info *(*fib6_table_lookup)(struct net *net,
+					      struct fib6_table *table,
+					      int oif, struct flowi6 *fl6,
+					      int flags);
+	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
+						   struct fib6_info *f6i,
+						   struct flowi6 *fl6, int oif,
+						   const struct sk_buff *skb,
+						   int strict);
+	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
+				 struct in6_addr *saddr);
+
+	void (*udpv6_encap_enable)(void);
+	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
+			      const struct in6_addr *solicited_addr,
+			      bool router, bool solicited, bool override, bool inc_opt);
+	struct neigh_table *nd_tbl;
+};
+extern const struct ipv6_stub *ipv6_stub __read_mostly;
+
+/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */
+struct ipv6_bpf_stub {
+	int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+			  bool force_bind_address_no_port, bool with_lock);
+	struct sock *(*udp6_lib_lookup)(struct net *net,
+				     const struct in6_addr *saddr, __be16 sport,
+				     const struct in6_addr *daddr, __be16 dport,
+				     int dif, int sdif, struct udp_table *tbl,
+				     struct sk_buff *skb);
+};
+extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
+
+#endif
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index b8137953fea3..4b1f95e08307 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -7,7 +7,7 @@
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
-#include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
 #endif
 
 struct udp_port_cfg {
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index 6b78e6351719..724b474ade54 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -21,6 +21,7 @@
 #include <linux/if_vlan.h>
 #include <linux/inetdevice.h>
 #include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ip6_checksum.h>
 #endif
diff --git a/net/core/filter.c b/net/core/filter.c
index 647c63a7b25b..841502f894bc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -74,6 +74,7 @@
 #include <net/seg6.h>
 #include <net/seg6_local.h>
 #include <net/lwtunnel.h>
+#include <net/ipv6_stubs.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 126d31ff5ee3..3c5c24a5d9f5 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -18,6 +18,7 @@
 #include <net/lwtunnel.h>
 #include <net/gre.h>
 #include <net/ip6_route.h>
+#include <net/ipv6_stubs.h>
 
 struct bpf_lwt_prog {
 	struct bpf_prog *prog;
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 6c79af056d9b..945b66e3008f 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -5,7 +5,7 @@
 
 #include <linux/export.h>
 #include <net/ipv6.h>
-#include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
 #include <net/ip.h>
 
 /* if ipv6 module registers this function is used by xfrm to force all
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index fa6b404cbd10..1789bf99c419 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -56,6 +56,7 @@
 #include <net/transp_v6.h>
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
 #include <net/ndisc.h>
 #ifdef CONFIG_IPV6_TUNNEL
 #include <net/ip6_tunnel.h>
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index f7c544592ec8..8120e04f15e4 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -22,7 +22,7 @@
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
 #endif
-#include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
 #include <net/nexthop.h>
 #include "internal.h"
 
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 4d85d71f16e2..6f166fbbfff1 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -44,7 +44,7 @@
 #include <net/sock.h>
 #include <net/ip.h>
 #include <net/udp_tunnel.h>
-#include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
 #include <linux/tipc_netlink.h>
 #include "core.h"
 #include "addr.h"
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-22 13:06 [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file David Ahern
@ 2019-03-22 16:14 ` Alexei Starovoitov
  2019-03-22 16:17   ` David Ahern
  2019-03-24  1:40 ` David Miller
  2019-03-29 17:55 ` David Miller
  2 siblings, 1 reply; 16+ messages in thread
From: Alexei Starovoitov @ 2019-03-22 16:14 UTC (permalink / raw)
  To: David Ahern; +Cc: davem, netdev, David Ahern

On Fri, Mar 22, 2019 at 06:06:09AM -0700, David Ahern wrote:
> From: David Ahern <dsahern@gmail.com>
> 
> The number of stubs is growing and has nothing to do with addrconf.
> Move the definition of the stubs to a separate header file and update
> users. In the move, drop the vxlan specific comment before ipv6_stub.
> 
> Code move only; no functional change intended.
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>

nack.
why this code churn?
I very much prefer to disallow ipv6 as a module.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-22 16:14 ` Alexei Starovoitov
@ 2019-03-22 16:17   ` David Ahern
  2019-03-22 17:04     ` Alexei Starovoitov
  0 siblings, 1 reply; 16+ messages in thread
From: David Ahern @ 2019-03-22 16:17 UTC (permalink / raw)
  To: Alexei Starovoitov, David Ahern; +Cc: davem, netdev

On 3/22/19 5:14 PM, Alexei Starovoitov wrote:
> On Fri, Mar 22, 2019 at 06:06:09AM -0700, David Ahern wrote:
>> From: David Ahern <dsahern@gmail.com>
>>
>> The number of stubs is growing and has nothing to do with addrconf.
>> Move the definition of the stubs to a separate header file and update
>> users. In the move, drop the vxlan specific comment before ipv6_stub.
>>
>> Code move only; no functional change intended.
>>
>> Signed-off-by: David Ahern <dsahern@gmail.com>
> 
> nack.
> why this code churn?

This is a prep patch for nexthops as separate objects. I will be adding
4 more functions to the stubs, and I hit a bit of a header nightmare
with it in addrconf.h.

And this has no impact on ipv6 built in or as a module; it has only to
do with a proper header file for a definition.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-22 16:17   ` David Ahern
@ 2019-03-22 17:04     ` Alexei Starovoitov
  0 siblings, 0 replies; 16+ messages in thread
From: Alexei Starovoitov @ 2019-03-22 17:04 UTC (permalink / raw)
  To: David Ahern; +Cc: David Ahern, davem, netdev

On Fri, Mar 22, 2019 at 05:17:52PM +0100, David Ahern wrote:
> On 3/22/19 5:14 PM, Alexei Starovoitov wrote:
> > On Fri, Mar 22, 2019 at 06:06:09AM -0700, David Ahern wrote:
> >> From: David Ahern <dsahern@gmail.com>
> >>
> >> The number of stubs is growing and has nothing to do with addrconf.
> >> Move the definition of the stubs to a separate header file and update
> >> users. In the move, drop the vxlan specific comment before ipv6_stub.
> >>
> >> Code move only; no functional change intended.
> >>
> >> Signed-off-by: David Ahern <dsahern@gmail.com>
> > 
> > nack.
> > why this code churn?
> 
> This is a prep patch for nexthops as separate objects. I will be adding
> 4 more functions to the stubs, and I hit a bit of a header nightmare
> with it in addrconf.h.

please do not.
I see no value in splitting nexthops either.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-22 13:06 [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file David Ahern
  2019-03-22 16:14 ` Alexei Starovoitov
@ 2019-03-24  1:40 ` David Miller
  2019-03-24  3:55   ` Alexei Starovoitov
  2019-03-29 17:55 ` David Miller
  2 siblings, 1 reply; 16+ messages in thread
From: David Miller @ 2019-03-24  1:40 UTC (permalink / raw)
  To: dsahern; +Cc: netdev, dsahern

From: David Ahern <dsahern@kernel.org>
Date: Fri, 22 Mar 2019 06:06:09 -0700

> From: David Ahern <dsahern@gmail.com>
> 
> The number of stubs is growing and has nothing to do with addrconf.
> Move the definition of the stubs to a separate header file and update
> users. In the move, drop the vxlan specific comment before ipv6_stub.
> 
> Code move only; no functional change intended.
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>

Eric, I fully support David's overall plan to make separate nexthop
objects as it will significantly empower the stack to do more sensible
things when links flap etc.

If you don't like this road he is going down to achieve that, you must
suggest an alternative.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-24  1:40 ` David Miller
@ 2019-03-24  3:55   ` Alexei Starovoitov
  2019-03-24 12:56     ` David Ahern
  0 siblings, 1 reply; 16+ messages in thread
From: Alexei Starovoitov @ 2019-03-24  3:55 UTC (permalink / raw)
  To: David Miller; +Cc: dsahern, netdev, dsahern, edumazet

On Sat, Mar 23, 2019 at 09:40:23PM -0400, David Miller wrote:
> From: David Ahern <dsahern@kernel.org>
> Date: Fri, 22 Mar 2019 06:06:09 -0700
> 
> > From: David Ahern <dsahern@gmail.com>
> > 
> > The number of stubs is growing and has nothing to do with addrconf.
> > Move the definition of the stubs to a separate header file and update
> > users. In the move, drop the vxlan specific comment before ipv6_stub.
> > 
> > Code move only; no functional change intended.
> > 
> > Signed-off-by: David Ahern <dsahern@gmail.com>
> 
> Eric, I fully support David's overall plan to make separate nexthop
> objects as it will significantly empower the stack to do more sensible
> things when links flap etc.

let's agree to disagree.
'link flaps' were not mentioned in the cover letter for:
"net: Improve route scalability via support for nexthop objects"

The _only_ value of 86 patches is to align linux kernel routing
with switch ASICs, because cumulus is trying to reuse iproute2
to manage them.
It was broken model to begin with and it keeps complicating routing
when linux is used as a host while not achieving the goal of iproute2
for switches.
Can anyone use off the shelf linux to manage trident/tomahawk switches? Nope.
brcm sdk is still necessary.
nexthop objects are essential to configure enterprise switches.
Clearly cumulus customers don't like iproute2 style because it's missing
this feature, so David's proposal is to add that to the kernel.
Even after kernel and iproute2 understand nexthop id the kernel is still
not going to be competitive with switching os. The linux kernel is an OS
to run on the host cpu and to run on a control plane cpu of a switch.
That is all great, but the reasons to push routing into the kernel
of control plane cpu were weak. It's not using these routes.
Such architecture allowed temporary reuse of bgp daemons, but it fails to scale.
No need to push route to the kernel when kernel won't use them.
Hence an alternative proposal:
- introduce hooks at netlink layer and steal back and forth messages
from your favorite daemon without populating the kernel
- same for iproute2 netlink interaction


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-24  3:55   ` Alexei Starovoitov
@ 2019-03-24 12:56     ` David Ahern
  2019-03-25  3:26       ` Alexei Starovoitov
  0 siblings, 1 reply; 16+ messages in thread
From: David Ahern @ 2019-03-24 12:56 UTC (permalink / raw)
  To: Alexei Starovoitov, David Miller; +Cc: netdev, edumazet

On 3/23/19 9:55 PM, Alexei Starovoitov wrote:
> On Sat, Mar 23, 2019 at 09:40:23PM -0400, David Miller wrote:
>> From: David Ahern <dsahern@kernel.org>
>> Date: Fri, 22 Mar 2019 06:06:09 -0700
>>
>>> From: David Ahern <dsahern@gmail.com>
>>>
>>> The number of stubs is growing and has nothing to do with addrconf.
>>> Move the definition of the stubs to a separate header file and update
>>> users. In the move, drop the vxlan specific comment before ipv6_stub.
>>>
>>> Code move only; no functional change intended.
>>>
>>> Signed-off-by: David Ahern <dsahern@gmail.com>
>>
>> Eric, I fully support David's overall plan to make separate nexthop
>> objects as it will significantly empower the stack to do more sensible
>> things when links flap etc.
> 
> let's agree to disagree.
> 'link flaps' were not mentioned in the cover letter for:
> "net: Improve route scalability via support for nexthop objects"
> 
> The _only_ value of 86 patches is to align linux kernel routing
> with switch ASICs, because cumulus is trying to reuse iproute2
> to manage them.
> It was broken model to begin with and it keeps complicating routing
> when linux is used as a host while not achieving the goal of iproute2
> for switches.
> Can anyone use off the shelf linux to manage trident/tomahawk switches? Nope.
> brcm sdk is still necessary.
> nexthop objects are essential to configure enterprise switches.
> Clearly cumulus customers don't like iproute2 style because it's missing
> this feature, so David's proposal is to add that to the kernel.
> Even after kernel and iproute2 understand nexthop id the kernel is still
> not going to be competitive with switching os. The linux kernel is an OS
> to run on the host cpu and to run on a control plane cpu of a switch.
> That is all great, but the reasons to push routing into the kernel
> of control plane cpu were weak. It's not using these routes.
> Such architecture allowed temporary reuse of bgp daemons, but it fails to scale.
> No need to push route to the kernel when kernel won't use them.
> Hence an alternative proposal:
> - introduce hooks at netlink layer and steal back and forth messages
> from your favorite daemon without populating the kernel
> - same for iproute2 netlink interaction
> 

The use case here is not just Cumulus or switchdev, but ANY OS using the
Linux API and the kernel to configure and manage its networking state
[1] and that includes XDP based use cases [2] and routing on the
host.[3] This is not about iproute2 driving networking deployments. This
is about continuing to remove the 'fails to scale' notion which *forces*
a NOS architecture away from the kernel databases as the single source
of truth and the kernel's IPC/notification mechanisms, and the
subsequent impacts of that choice which negates the Linux ecosystem
forcing a customization of all of the software running in the control
plane to work in some vendor's custom environment. You should read the
paper I wrote last summer [1].

This current patch set is not just about link flaps, but improving the
overall scaling properties of managing the FIB. This is about leveraging
existing ideas about network models and their scalability properties and
bringing that efficiency to Linux. With nexthops, the time to insert
routes is near constant regardless of the number of nexthops in the
route. So the time to insert a single path route and the time to insert
a route with 2, 4, 8, 16, 32, … nexthops is the same. That is a HUGE
scalability improvement from a simple idea. The “near” constant is
because of the need to expand nexthop definitions in the route
notifications to userspace to enable legacy applications to work with
this new API. In time, a lever can be added to not expand the
definitions and let the RTA_NHID alone point to it, allowing companies
who know that there are no legacy apps that need the nexthop expansion
to gain the full scaling improvements.

This change also enables many other key features:
1. IPv4 multipath routes are not evicted just because 1 hop goes down.
2. IPv6 multipath routes with device only nexthops (e.g., tunnels).
3. IPv6 nexthop with IPv4 route (aka, RFC 5549) which enables a more
natural BGP unnumbered.
4. Lower memory consumption for IPv6 FIB entries which has no sharing at
all like IPv4 does.
5. Allows atomic update of nexthop definitions with a single replace
command as opposed to replacing the N-routes using it.

The list goes on, but 2-5 of the above were in the cover letter I sent
on March 14.

I have spent a lot of time over the last few years not just working on
features like VRF and MPLS, but improving the scaling properties of
Linux and removing this 'fails to scale' notion you and others hold.
This current patch set is just another step in that path.

[1] https://www.files.netdevconf.org/d/f982086fdd6946d9b596/
[2] http://vger.kernel.org/lpc_net2018_talks/dsa-xdp-kernel-tables-paper.pdf
[3]
https://netdevconf.org/1.2/slides/oct7/01_ahern_microservice_net_vrf_on_host.pdf

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-24 12:56     ` David Ahern
@ 2019-03-25  3:26       ` Alexei Starovoitov
  2019-03-25 15:39         ` Stephen Suryaputra
                           ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Alexei Starovoitov @ 2019-03-25  3:26 UTC (permalink / raw)
  To: David Ahern; +Cc: David Miller, netdev, edumazet

On Sun, Mar 24, 2019 at 06:56:42AM -0600, David Ahern wrote:
> 
> This change also enables many other key features:
> 1. IPv4 multipath routes are not evicted just because 1 hop goes down.
> 2. IPv6 multipath routes with device only nexthops (e.g., tunnels).
> 3. IPv6 nexthop with IPv4 route (aka, RFC 5549) which enables a more
> natural BGP unnumbered.
> 4. Lower memory consumption for IPv6 FIB entries which has no sharing at
> all like IPv4 does.
> 5. Allows atomic update of nexthop definitions with a single replace
> command as opposed to replacing the N-routes using it.

Does kernel work as data plane or control plane in any of the above
features ?
Sadly the patches allow it to do both, but cumulus doesn't use it
for data path. The kernel on control plane cpu is merely a database.
And today it doesn't scale when used as a database.
The kernel has to be fast as a dataplane but these extra features
will slow down the routing by making kernel-as-database scale a bit better.
Hence my suggestion in the previous email: use proper database
to store routes, nexthops and whatever else necessary to program the asic.
The kernel doesn't need to hold this information.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-25  3:26       ` Alexei Starovoitov
@ 2019-03-25 15:39         ` Stephen Suryaputra
  2019-03-25 17:02         ` David Ahern
  2019-03-26  2:22         ` David Ahern
  2 siblings, 0 replies; 16+ messages in thread
From: Stephen Suryaputra @ 2019-03-25 15:39 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: David Ahern, David Miller, netdev, edumazet

Just a comment: on the system that I work on, the kernel closely
matches the hardware data plane, so the kernel has both roles as data
and control plane.
Whatever routes are installed on the hardware are installed also in the kernel.

On Sun, Mar 24, 2019 at 11:27 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Sun, Mar 24, 2019 at 06:56:42AM -0600, David Ahern wrote:
> >
> > This change also enables many other key features:
> > 1. IPv4 multipath routes are not evicted just because 1 hop goes down.
> > 2. IPv6 multipath routes with device only nexthops (e.g., tunnels).
> > 3. IPv6 nexthop with IPv4 route (aka, RFC 5549) which enables a more
> > natural BGP unnumbered.
> > 4. Lower memory consumption for IPv6 FIB entries which has no sharing at
> > all like IPv4 does.
> > 5. Allows atomic update of nexthop definitions with a single replace
> > command as opposed to replacing the N-routes using it.
>
> Does kernel work as data plane or control plane in any of the above
> features ?
> Sadly the patches allow it to do both, but cumulus doesn't use it
> for data path. The kernel on control plane cpu is merely a database.
> And today it doesn't scale when used as a database.
> The kernel has to be fast as a dataplane but these extra features
> will slow down the routing by making kernel-as-database scale a bit better.
> Hence my suggestion in the previous email: use proper database
> to store routes, nexthops and whatever else necessary to program the asic.
> The kernel doesn't need to hold this information.
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-25  3:26       ` Alexei Starovoitov
  2019-03-25 15:39         ` Stephen Suryaputra
@ 2019-03-25 17:02         ` David Ahern
  2019-03-26  3:05           ` Alexei Starovoitov
  2019-03-26  2:22         ` David Ahern
  2 siblings, 1 reply; 16+ messages in thread
From: David Ahern @ 2019-03-25 17:02 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: David Miller, netdev, edumazet

On 3/24/19 9:26 PM, Alexei Starovoitov wrote:
> On Sun, Mar 24, 2019 at 06:56:42AM -0600, David Ahern wrote:
>>
>> This change also enables many other key features:
>> 1. IPv4 multipath routes are not evicted just because 1 hop goes down.
>> 2. IPv6 multipath routes with device only nexthops (e.g., tunnels).
>> 3. IPv6 nexthop with IPv4 route (aka, RFC 5549) which enables a more
>> natural BGP unnumbered.
>> 4. Lower memory consumption for IPv6 FIB entries which has no sharing at
>> all like IPv4 does.
>> 5. Allows atomic update of nexthop definitions with a single replace
>> command as opposed to replacing the N-routes using it.
> 
> Does kernel work as data plane or control plane in any of the above
> features ?
> Sadly the patches allow it to do both, but cumulus doesn't use it
> for data path. The kernel on control plane cpu is merely a database.
> And today it doesn't scale when used as a database.
> The kernel has to be fast as a dataplane but these extra features
> will slow down the routing by making kernel-as-database scale a bit better.
> Hence my suggestion in the previous email: use proper database
> to store routes, nexthops and whatever else necessary to program the asic.
> The kernel doesn't need to hold this information.
> 

The first 40 patches align fib_nh and fib6_nh providing more consistency
and alignment between IPv4 and IPv6 and allowing more code re-use
between the protocols. The end result is the ability to have IPv6
gateways with IPv4 routes, a much needed control plane feature other
companies have been harassing me about as well as the internal need for
Cumulus. In the refactoring I have been very careful about changes to
data structure layout and cacheline hits as well as adverse changes to
memory use. I believe at the end of this change set there is no impact
to existing performance - control plane or data plane.

That is followed by refactoring IPv6 again in a direction that makes
IPv4 and IPv6 more consistent and enables changes (outside of the
nexthop sets) that will improve IPv6 for a number of cases by removing
the need to always generate a dst_entry.

After that are a few patches exporting functions for use by nexthop code
and then diving into the refactoring enabling separate nexthop objects.
Again, impacts to performance have been top of mind, and I have done
what I can to minimize any overhead in the datapath - to the point of a
few ‘if  (nh)’ checks wrapped in an unlikely. And with the nexthop code
in place it gives users an alternative to a broken IPv6 multipath API as
one example.

As far as scalability goes, I can already inject a million routes into
the kernel FIB. This allows me to it more efficiently and to manage the
FIBs more efficiently in the face of changes such as a link going down
as we move to higher end systems - such as spectrum2.

As for routes in the kernel, they need to be there for any control plane
processes to properly function. One example is ping and traceroute to
troubleshoot data path problems, and another is for bgp (or any other
service) to connect to a peer through the data plane (do not assume a
peer is on a directly connected route). Further, the routes need to go
through the kernel to get to the switchdev driver. The routes need to be
there for XDP forwarding and routing on the host. Pawel has already
expressed interest in using XDP for fast path forwarding with FRR
managing the route table.

You keep trying to make this about Cumulus. This is about bringing next
level features to Linux and in the process bringing more consistency and
code sharing between IPv4 and IPv6. This is about 1-API for the data
center be it servers, hosts, switches or routers regardless of datapath
(hardware offload, XDP, or kernel forwarding), and maintaining
consistency in configuring, monitoring and troubleshooting across those
systems. That is the common theme of both the netdev talk last summer
and the talk at LPC in November.

Again, I have tried to be very careful with the intrusion of checks into
the datapath with the goal of no measurable impact to performance. I am
invested to seeing that through and will continue looking for ways to
improve it for all use cases.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-25  3:26       ` Alexei Starovoitov
  2019-03-25 15:39         ` Stephen Suryaputra
  2019-03-25 17:02         ` David Ahern
@ 2019-03-26  2:22         ` David Ahern
  2 siblings, 0 replies; 16+ messages in thread
From: David Ahern @ 2019-03-26  2:22 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: David Miller, netdev, edumazet

On 3/24/19 9:26 PM, Alexei Starovoitov wrote:
> The kernel has to be fast as a dataplane but these extra features
> will slow down the routing by making kernel-as-database scale a bit better.

I took Vincent Bernat's fib lookup microbenchmark and added input and
output route lookups to it. Through this commit:

commit 4d5edf447c88e59f5c5716180c1001cde9bf4473
Author: David Ahern <dsahern@gmail.com>
Date:   Mon Mar 11 18:34:48 2019 -0700

    ipv4: Allow ipv6 gateway with ipv4 routes


There is no real measurable difference for IPv4 fib lookups
(fib_lookup), input routes (ip_route_input_rcu) and output routes
(ip_route_output_key_hash_rcu). This is using a bare metal server with a
L5640 Xeon CPU.

The above commit is the target for the first 2 rounds of patches. I will
of course be checking performance each step along the way to nexthop
objects.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-25 17:02         ` David Ahern
@ 2019-03-26  3:05           ` Alexei Starovoitov
  2019-03-26 14:19             ` David Ahern
  0 siblings, 1 reply; 16+ messages in thread
From: Alexei Starovoitov @ 2019-03-26  3:05 UTC (permalink / raw)
  To: David Ahern; +Cc: David Miller, netdev, edumazet, kafai

On Mon, Mar 25, 2019 at 11:02:07AM -0600, David Ahern wrote:
> 
> That is followed by refactoring IPv6 again in a direction that makes
> IPv4 and IPv6 more consistent and enables changes (outside of the
> nexthop sets) that will improve IPv6 for a number of cases by removing
> the need to always generate a dst_entry.

That would be a nice feature to have.

> After that are a few patches exporting functions for use by nexthop code
> and then diving into the refactoring enabling separate nexthop objects.
> Again, impacts to performance have been top of mind, and I have done
> what I can to minimize any overhead in the datapath - to the point of a
> few ‘if  (nh)’ checks wrapped in an unlikely. And with the nexthop code
> in place it gives users an alternative to a broken IPv6 multipath API as
> one example.
...
> Again, I have tried to be very careful with the intrusion of checks into
> the datapath with the goal of no measurable impact to performance. I am
> invested to seeing that through and will continue looking for ways to
> improve it for all use cases.

Great to hear.
Can you split up your set into reviewable chunks?
Just this patch alone is too small to see the road ahead
and 80+ patches are too much to review properly.
I still have reservations regarding nexthop id concept, but sounds like
the first 20 or so patches should clean things up.
Especially if you can get rid of dst alloc/free back and forth in ipv6 case.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-26  3:05           ` Alexei Starovoitov
@ 2019-03-26 14:19             ` David Ahern
  2019-03-26 15:11               ` Alexei Starovoitov
  0 siblings, 1 reply; 16+ messages in thread
From: David Ahern @ 2019-03-26 14:19 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: David Miller, netdev, edumazet, kafai

On 3/25/19 9:05 PM, Alexei Starovoitov wrote:
> Just this patch alone is too small to see the road ahead
> and 80+ patches are too much to review properly.

I will be sending in reviewable chunks. I think I have established a
history of evolving the code in small, singly focused patches moving the
code from where it is to where I want it to be. Any time there is a
'large' patch I try to keep it to something mechanical - like a name
change or a change to a function signature.

The email I sent ("net: Improve route scalability via support for
nexthop objects") outlined the targeted steps and the intermediate prep
work needed. To be able to reach certain clear steps and still maintain
the 20'ish patches in a set requirement, I have sent a few patches which
are standalone and basically noise - like this one - to avoid
distractions on the real change. I pushed the entire set to github so
anyone can see the end goal as well as the individual patches.

I am not sure what else I can do to be open about this change and convey
the what and the why. I discussed the feature at netconf in Seoul in
October 2017. I sent RFC patches last August on the UAPI with a working
implementation as well as more details about what this change is going
to allow. I have given talks on what I believe is the best architecture
and implementation for Linux which is the motivation for improving the
kernel. And the previous email about improving scalability is almost a
novelette in length.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-26 14:19             ` David Ahern
@ 2019-03-26 15:11               ` Alexei Starovoitov
  2019-03-26 15:29                 ` David Ahern
  0 siblings, 1 reply; 16+ messages in thread
From: Alexei Starovoitov @ 2019-03-26 15:11 UTC (permalink / raw)
  To: David Ahern; +Cc: David Miller, netdev, edumazet, kafai

On Tue, Mar 26, 2019 at 08:19:06AM -0600, David Ahern wrote:
> work needed. To be able to reach certain clear steps and still maintain
> the 20'ish patches in a set requirement, I have sent a few patches which
> are standalone and basically noise - like this one - to avoid
> distractions on the real change.

This patch is far from noise. In the thread you said that you'll be
adding 4 more indirect calls and that rings performance alarms.
retpoline is expensive. If you want to add more indirect calls to fib lookup
it's no go. Benchmarks with retpoline=off won't show the difference.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-26 15:11               ` Alexei Starovoitov
@ 2019-03-26 15:29                 ` David Ahern
  0 siblings, 0 replies; 16+ messages in thread
From: David Ahern @ 2019-03-26 15:29 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: David Miller, netdev, edumazet, kafai

On 3/26/19 9:11 AM, Alexei Starovoitov wrote:
> On Tue, Mar 26, 2019 at 08:19:06AM -0600, David Ahern wrote:
>> work needed. To be able to reach certain clear steps and still maintain
>> the 20'ish patches in a set requirement, I have sent a few patches which
>> are standalone and basically noise - like this one - to avoid
>> distractions on the real change.
> 
> This patch is far from noise. In the thread you said that you'll be
> adding 4 more indirect calls and that rings performance alarms.
> retpoline is expensive. If you want to add more indirect calls to fib lookup
> it's no go. Benchmarks with retpoline=off won't show the difference.
> 

As I stated in the email I sent ("net: Improve route scalability via
support for nexthop objects"):

"adding hooks to the ipv6 stubs (bump sernum, send route notifications
and delete routes based on nexthop updates)."

Those plus initialization and release of fib6_nh are all done in the
control path.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file
  2019-03-22 13:06 [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file David Ahern
  2019-03-22 16:14 ` Alexei Starovoitov
  2019-03-24  1:40 ` David Miller
@ 2019-03-29 17:55 ` David Miller
  2 siblings, 0 replies; 16+ messages in thread
From: David Miller @ 2019-03-29 17:55 UTC (permalink / raw)
  To: dsahern; +Cc: netdev, dsahern

From: David Ahern <dsahern@kernel.org>
Date: Fri, 22 Mar 2019 06:06:09 -0700

> From: David Ahern <dsahern@gmail.com>
> 
> The number of stubs is growing and has nothing to do with addrconf.
> Move the definition of the stubs to a separate header file and update
> users. In the move, drop the vxlan specific comment before ipv6_stub.
> 
> Code move only; no functional change intended.
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>

Applied, thanks David.

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2019-03-29 17:55 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-22 13:06 [PATCH net-next] ipv6: Move ipv6 stubs to a separate header file David Ahern
2019-03-22 16:14 ` Alexei Starovoitov
2019-03-22 16:17   ` David Ahern
2019-03-22 17:04     ` Alexei Starovoitov
2019-03-24  1:40 ` David Miller
2019-03-24  3:55   ` Alexei Starovoitov
2019-03-24 12:56     ` David Ahern
2019-03-25  3:26       ` Alexei Starovoitov
2019-03-25 15:39         ` Stephen Suryaputra
2019-03-25 17:02         ` David Ahern
2019-03-26  3:05           ` Alexei Starovoitov
2019-03-26 14:19             ` David Ahern
2019-03-26 15:11               ` Alexei Starovoitov
2019-03-26 15:29                 ` David Ahern
2019-03-26  2:22         ` David Ahern
2019-03-29 17:55 ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.