[PATCH net-next 0/9] net: Kernel side filtering for route dumps

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH net-next 0/9] net: Kernel side filtering for route dumps
@ 2018-10-11 15:06 David Ahern
  2018-10-11 15:06 ` [PATCH net-next 1/9] net: Add struct for fib dump filter David Ahern
                   ` (9 more replies)
  0 siblings, 10 replies; 30+ messages in thread
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of route dumps by protocol (e.g., which
routing daemon installed the route), route type (e.g., unicast), table
id and nexthop device.

iproute2 has been doing this filtering in userspace for years; pushing
the filters to the kernel side reduces the amount of data the kernel
sends and reduces wasted cycles on both sides processing unwanted data.
These initial options provide a huge improvement for efficiently
examining routes on large scale systems.

David Ahern (9):
  net: Add struct for fib dump filter
  net/ipv4: Plumb support for filtering route dumps
  net/ipv6: Plumb support for filtering route dumps
  net/mpls: Plumb support for filtering route dumps
  net: Plumb support for filtering ipv4 and ipv6 multicast route dumps
  net: Enable kernel side filtering of route dumps
  net/mpls: Handle kernel side filtering of route dumps
  net/ipv6: Bail early if user only wants cloned entries
  net/ipv4: Bail early if user only wants prefix entries

 include/linux/mroute_base.h |  5 +--
 include/net/ip6_route.h     |  1 +
 include/net/ip_fib.h        | 14 ++++++--
 net/ipv4/fib_frontend.c     | 64 +++++++++++++++++++++++++++------
 net/ipv4/fib_trie.c         | 37 +++++++++++++------
 net/ipv4/ipmr.c             |  8 +++--
 net/ipv4/ipmr_base.c        | 33 ++++++++++++++++-
 net/ipv6/ip6_fib.c          | 17 +++++++--
 net/ipv6/ip6mr.c            |  7 ++--
 net/ipv6/route.c            | 40 ++++++++++++++++-----
 net/mpls/af_mpls.c          | 86 +++++++++++++++++++++++++++++++++++++++------
 11 files changed, 262 insertions(+), 50 deletions(-)

-- 
2.11.0

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH net-next 1/9] net: Add struct for fib dump filter
  2018-10-11 15:06 [PATCH net-next 0/9] net: Kernel side filtering for route dumps David Ahern
@ 2018-10-11 15:06 ` David Ahern
  2018-10-11 15:06 ` [PATCH net-next 2/9] net/ipv4: Plumb support for filtering route dumps David Ahern
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 30+ messages in thread
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Add struct fib_dump_filter for options on limiting which routes are
returned in a dump request. The current list is table id, protocol,
route type, rtm_flags and nexthop device index. struct net is needed
to look up the net_device from the index.

Plumb the new arguments from dump handlers to ip_valid_fib_dump_req.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_route.h |  1 +
 include/net/ip_fib.h    | 12 +++++++++++-
 net/ipv4/fib_frontend.c |  6 ++++--
 net/ipv4/ipmr.c         |  6 +++++-
 net/ipv6/ip6_fib.c      |  5 +++--
 net/ipv6/ip6mr.c        |  5 ++++-
 net/mpls/af_mpls.c      | 12 ++++++++----
 7 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index cef186dbd2ce..7ab119936e69 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -174,6 +174,7 @@ struct rt6_rtnl_dump_arg {
 	struct sk_buff *skb;
 	struct netlink_callback *cb;
 	struct net *net;
+	struct fib_dump_filter filter;
 };
 
 int rt6_dump_route(struct fib6_info *f6i, void *p_arg);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 9846b79c9ee1..9dde41ad02a1 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -222,6 +222,15 @@ struct fib_table {
 	unsigned long		__data[0];
 };
 
+struct fib_dump_filter {
+	bool			filter_set;
+	unsigned char		protocol;
+	unsigned char		rt_type;
+	u32			table_id;
+	unsigned int		flags;
+	struct net_device	*dev;
+};
+
 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		     struct fib_result *res, int fib_flags);
 int fib_table_insert(struct net *, struct fib_table *, struct fib_config *,
@@ -452,6 +461,7 @@ static inline void fib_proc_exit(struct net *net)
 
 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr);
 
-int ip_valid_fib_dump_req(const struct nlmsghdr *nlh,
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack);
 #endif  /* _NET_FIB_H */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 038f511c73fa..d0fb9b7efa27 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -802,7 +802,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return err;
 }
 
-int ip_valid_fib_dump_req(const struct nlmsghdr *nlh,
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack)
 {
 	struct rtmsg *rtm;
@@ -837,6 +838,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
+	struct fib_dump_filter filter = {};
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
 	struct fib_table *tb;
@@ -844,7 +846,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	int dumped = 0, err;
 
 	if (cb->strict_check) {
-		err = ip_valid_fib_dump_req(nlh, cb->extack);
+		err = ip_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 91b0d5671649..44d777058960 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2527,9 +2527,13 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 
 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct fib_dump_filter filter = {};
+
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(cb->nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
+					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e14d244c551f..6a169794a674 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -566,17 +566,18 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
+	struct rt6_rtnl_dump_arg arg = {};
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
-	struct rt6_rtnl_dump_arg arg;
 	struct fib6_walker *w;
 	struct fib6_table *tb;
 	struct hlist_head *head;
 	int res = 0;
 
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d7563ef76518..dbd5166c5599 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2458,10 +2458,13 @@ static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
+	struct fib_dump_filter filter = {};
 
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
+					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 5fe274c47c41..bfcb4759c9ee 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2032,13 +2032,15 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
 }
 
 #if IS_ENABLED(CONFIG_INET)
-static int mpls_valid_fib_dump_req(const struct nlmsghdr *nlh,
+static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
-	return ip_valid_fib_dump_req(nlh, extack);
+	return ip_valid_fib_dump_req(net, nlh, filter, extack);
 }
 #else
-static int mpls_valid_fib_dump_req(const struct nlmsghdr *nlh,
+static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
 	struct rtmsg *rtm;
@@ -2070,14 +2072,16 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	struct mpls_route __rcu **platform_label;
+	struct fib_dump_filter filter = {};
 	size_t platform_labels;
 	unsigned int index;
 
 	ASSERT_RTNL();
 
 	if (cb->strict_check) {
-		int err = mpls_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = mpls_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH net-next 2/9] net/ipv4: Plumb support for filtering route dumps
  2018-10-11 15:06 [PATCH net-next 0/9] net: Kernel side filtering for route dumps David Ahern
  2018-10-11 15:06 ` [PATCH net-next 1/9] net: Add struct for fib dump filter David Ahern
@ 2018-10-11 15:06 ` David Ahern
  2018-10-11 15:56   ` Andrew Lunn
  2018-10-11 15:06 ` [PATCH net-next 3/9] net/ipv6: " David Ahern
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 30+ messages in thread
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by table id, egress device index,
protocol and route type.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip_fib.h    |  2 +-
 net/ipv4/fib_frontend.c |  5 ++++-
 net/ipv4/fib_trie.c     | 37 ++++++++++++++++++++++++++-----------
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 9dde41ad02a1..68967f4bd024 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -238,7 +238,7 @@ int fib_table_insert(struct net *, struct fib_table *, struct fib_config *,
 int fib_table_delete(struct net *, struct fib_table *, struct fib_config *,
 		     struct netlink_ext_ack *extack);
 int fib_table_dump(struct fib_table *table, struct sk_buff *skb,
-		   struct netlink_callback *cb);
+		   struct netlink_callback *cb, struct fib_dump_filter *filter);
 int fib_table_flush(struct net *net, struct fib_table *table);
 struct fib_table *fib_trie_unmerge(struct fib_table *main_tb);
 void fib_table_flush_external(struct fib_table *table);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d0fb9b7efa27..1528b0919951 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -866,10 +866,13 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
 			if (e < s_e)
 				goto next;
+			if (filter.table_id && filter.table_id != tb->tb_id)
+				goto next;
+
 			if (dumped)
 				memset(&cb->args[2], 0, sizeof(cb->args) -
 						 2 * sizeof(cb->args[0]));
-			err = fib_table_dump(tb, skb, cb);
+			err = fib_table_dump(tb, skb, cb, &filter);
 			if (err < 0) {
 				if (likely(skb->len))
 					goto out;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5bc0c89e81e4..237c9f72b265 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2003,12 +2003,17 @@ void fib_free_table(struct fib_table *tb)
 }
 
 static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
-			     struct sk_buff *skb, struct netlink_callback *cb)
+			     struct sk_buff *skb, struct netlink_callback *cb,
+			     struct fib_dump_filter *filter)
 {
+	unsigned int flags = NLM_F_MULTI;
 	__be32 xkey = htonl(l->key);
 	struct fib_alias *fa;
 	int i, s_i;
 
+	if (filter->filter_set)
+		flags |= NLM_F_DUMP_FILTERED;
+
 	s_i = cb->args[4];
 	i = 0;
 
@@ -2016,25 +2021,35 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
 		int err;
 
-		if (i < s_i) {
-			i++;
-			continue;
-		}
+		if (i < s_i)
+			goto next;
 
-		if (tb->tb_id != fa->tb_id) {
-			i++;
-			continue;
+		if (tb->tb_id != fa->tb_id)
+			goto next;
+
+		if (filter->filter_set) {
+			if (filter->rt_type && fa->fa_type != filter->rt_type)
+				goto next;
+
+			if ((filter->protocol &&
+			     fa->fa_info->fib_protocol != filter->protocol))
+				goto next;
+
+			if (filter->dev &&
+			    !fib_info_nh_uses_dev(fa->fa_info, filter->dev))
+				goto next;
 		}
 
 		err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
 				    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 				    tb->tb_id, fa->fa_type,
 				    xkey, KEYLENGTH - fa->fa_slen,
-				    fa->fa_tos, fa->fa_info, NLM_F_MULTI);
+				    fa->fa_tos, fa->fa_info, flags);
 		if (err < 0) {
 			cb->args[4] = i;
 			return err;
 		}
+next:
 		i++;
 	}
 
@@ -2044,7 +2059,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 
 /* rcu_read_lock needs to be hold by caller from readside */
 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
-		   struct netlink_callback *cb)
+		   struct netlink_callback *cb, struct fib_dump_filter *filter)
 {
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct key_vector *l, *tp = t->kv;
@@ -2057,7 +2072,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
 		int err;
 
-		err = fn_trie_dump_leaf(l, tb, skb, cb);
+		err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
 		if (err < 0) {
 			cb->args[3] = key;
 			cb->args[2] = count;
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH net-next 3/9] net/ipv6: Plumb support for filtering route dumps
  2018-10-11 15:06 [PATCH net-next 0/9] net: Kernel side filtering for route dumps David Ahern
  2018-10-11 15:06 ` [PATCH net-next 1/9] net: Add struct for fib dump filter David Ahern
  2018-10-11 15:06 ` [PATCH net-next 2/9] net/ipv4: Plumb support for filtering route dumps David Ahern
@ 2018-10-11 15:06 ` David Ahern
  2018-10-11 15:06 ` [PATCH net-next 4/9] net/mpls: " David Ahern
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 30+ messages in thread
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by table id, egress device
index, protocol, and route type.

Move the existing route flags check for prefix only routes to the new
filter.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/ip6_fib.c |  9 +++++++++
 net/ipv6/route.c   | 40 ++++++++++++++++++++++++++++++++--------
 2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6a169794a674..dd6a43874192 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -580,6 +580,11 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb->extack);
 		if (err < 0)
 			return err;
+	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+		struct rtmsg *rtm = nlmsg_data(nlh);
+
+		if (rtm->rtm_flags & RTM_F_PREFIX)
+			arg.filter.flags = RTM_F_PREFIX;
 	}
 
 	s_h = cb->args[0];
@@ -616,6 +621,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
 			if (e < s_e)
 				goto next;
+			if (arg.filter.table_id &&
+			    arg.filter.table_id != tb->tb6_id)
+				goto next;
+
 			res = fib6_dump_table(tb, skb, cb);
 			if (res != 0)
 				goto out;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bf4cd647d8b8..8ed2e7462657 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4763,28 +4763,52 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	return -EMSGSIZE;
 }
 
+static bool fib6_info_uses_dev(const struct fib6_info *f6i,
+			       const struct net_device *dev)
+{
+	if (f6i->fib6_nh.nh_dev == dev)
+		return true;
+
+	if (f6i->fib6_nsiblings) {
+		struct fib6_info *sibling, *next_sibling;
+
+		list_for_each_entry_safe(sibling, next_sibling,
+					 &f6i->fib6_siblings, fib6_siblings) {
+			if (sibling->fib6_nh.nh_dev == dev)
+				return true;
+		}
+	}
+
+	return false;
+}
+
 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
 {
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
+	struct fib_dump_filter *filter = &arg->filter;
+	unsigned int flags = NLM_F_MULTI;
 	struct net *net = arg->net;
 
 	if (rt == net->ipv6.fib6_null_entry)
 		return 0;
 
-	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
-		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
-
-		/* user wants prefix routes only */
-		if (rtm->rtm_flags & RTM_F_PREFIX &&
-		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
-			/* success since this is not a prefix route */
+	if ((filter->flags & RTM_F_PREFIX) &&
+	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
+		/* success since this is not a prefix route */
+		return 1;
+	}
+	if (filter->filter_set) {
+		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
+		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
+		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
 			return 1;
 		}
+		flags |= NLM_F_DUMP_FILTERED;
 	}
 
 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
-			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
+			     arg->cb->nlh->nlmsg_seq, flags);
 }
 
 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH net-next 4/9] net/mpls: Plumb support for filtering route dumps
  2018-10-11 15:06 [PATCH net-next 0/9] net: Kernel side filtering for route dumps David Ahern
                   ` (2 preceding siblings ...)
  2018-10-11 15:06 ` [PATCH net-next 3/9] net/ipv6: " David Ahern
@ 2018-10-11 15:06 ` David Ahern
  2018-10-11 15:06 ` [PATCH net-next 5/9] net: Plumb support for filtering ipv4 and ipv6 multicast " David Ahern
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 30+ messages in thread
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by egress device index and
protocol. MPLS uses only a single table and route type.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/mpls/af_mpls.c | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index bfcb4759c9ee..48f4cbd9fb38 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2067,12 +2067,35 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 }
 #endif
 
+static bool mpls_rt_uses_dev(struct mpls_route *rt,
+			     const struct net_device *dev)
+{
+	struct net_device *nh_dev;
+
+	if (rt->rt_nhn == 1) {
+		struct mpls_nh *nh = rt->rt_nh;
+
+		nh_dev = rtnl_dereference(nh->nh_dev);
+		if (dev == nh_dev)
+			return true;
+	} else {
+		for_nexthops(rt) {
+			nh_dev = rtnl_dereference(nh->nh_dev);
+			if (nh_dev == dev)
+				return true;
+		} endfor_nexthops(rt);
+	}
+
+	return false;
+}
+
 static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	struct mpls_route __rcu **platform_label;
 	struct fib_dump_filter filter = {};
+	unsigned int flags = NLM_F_MULTI;
 	size_t platform_labels;
 	unsigned int index;
 
@@ -2084,6 +2107,14 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 		err = mpls_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
+
+		/* for MPLS, there is only 1 table with fixed type and flags.
+		 * If either are set in the filter then return nothing.
+		 */
+		if ((filter.table_id && filter.table_id != RT_TABLE_MAIN) ||
+		    (filter.rt_type && filter.rt_type != RTN_UNICAST) ||
+		     filter.flags)
+			return skb->len;
 	}
 
 	index = cb->args[0];
@@ -2092,15 +2123,24 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
 	platform_labels = net->mpls.platform_labels;
+
+	if (filter.filter_set)
+		flags |= NLM_F_DUMP_FILTERED;
+
 	for (; index < platform_labels; index++) {
 		struct mpls_route *rt;
+
 		rt = rtnl_dereference(platform_label[index]);
 		if (!rt)
 			continue;
 
+		if ((filter.dev && !mpls_rt_uses_dev(rt, filter.dev)) ||
+		    (filter.protocol && rt->rt_protocol != filter.protocol))
+			continue;
+
 		if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid,
 				    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-				    index, rt, NLM_F_MULTI) < 0)
+				    index, rt, flags) < 0)
 			break;
 	}
 	cb->args[0] = index;
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH net-next 5/9] net: Plumb support for filtering ipv4 and ipv6 multicast route dumps
  2018-10-11 15:06 [PATCH net-next 0/9] net: Kernel side filtering for route dumps David Ahern
                   ` (3 preceding siblings ...)
  2018-10-11 15:06 ` [PATCH net-next 4/9] net/mpls: " David Ahern
@ 2018-10-11 15:06 ` David Ahern
  2018-10-11 15:06 ` [PATCH net-next 6/9] net: Enable kernel side filtering of " David Ahern
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 30+ messages in thread
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by egress device index and
table id.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/linux/mroute_base.h |  5 +++--
 net/ipv4/ipmr.c             |  2 +-
 net/ipv4/ipmr_base.c        | 33 ++++++++++++++++++++++++++++++++-
 net/ipv6/ip6mr.c            |  2 +-
 4 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 6675b9f81979..8fc516c47a64 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -7,6 +7,7 @@
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/fib_notifier.h>
+#include <net/ip_fib.h>
 
 /**
  * struct vif_device - interface representor for multicast routing
@@ -290,7 +291,7 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 				 struct sk_buff *skb,
 				 u32 portid, u32 seq, struct mr_mfc *c,
 				 int cmd, int flags),
-		     spinlock_t *lock);
+		     spinlock_t *lock, struct fib_dump_filter *filter);
 
 int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 	    int (*rules_dump)(struct net *net,
@@ -340,7 +341,7 @@ mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 			     struct sk_buff *skb,
 			     u32 portid, u32 seq, struct mr_mfc *c,
 			     int cmd, int flags),
-		 spinlock_t *lock)
+		 spinlock_t *lock, struct fib_dump_filter *filter)
 {
 	return -EINVAL;
 }
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 44d777058960..f6ad4ef1d3c7 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2539,7 +2539,7 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 	return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
-				_ipmr_fill_mroute, &mfc_unres_lock);
+				_ipmr_fill_mroute, &mfc_unres_lock, &filter);
 }
 
 static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 1ad9aa62a97b..647300a55f42 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -268,6 +268,24 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(mr_fill_mroute);
 
+static bool mr_mfc_uses_dev(const struct mr_table *mrt,
+			    const struct mr_mfc *c,
+			    const struct net_device *dev)
+{
+	int ct;
+
+	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+			const struct vif_device *vif;
+
+			vif = &mrt->vif_table[ct];
+			if (vif->dev == dev)
+				return true;
+		}
+	}
+	return false;
+}
+
 int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		     struct mr_table *(*iter)(struct net *net,
 					      struct mr_table *mrt),
@@ -275,17 +293,26 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 				 struct sk_buff *skb,
 				 u32 portid, u32 seq, struct mr_mfc *c,
 				 int cmd, int flags),
-		     spinlock_t *lock)
+		     spinlock_t *lock, struct fib_dump_filter *filter)
 {
 	unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1];
 	struct net *net = sock_net(skb->sk);
 	struct mr_table *mrt;
 	struct mr_mfc *mfc;
 
+	/* multicast does not track protocol or have route type other
+	 * than RTN_MULTICAST
+	 */
+	if (filter->protocol || filter->flags ||
+	    (filter->rt_type && filter->rt_type != RTN_MULTICAST))
+		return 0;
+
 	rcu_read_lock();
 	for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
 		if (t < s_t)
 			goto next_table;
+		if (filter->table_id && filter->table_id != mrt->id)
+			goto next_table;
 		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
 			if (e < s_e)
 				goto next_entry;
@@ -303,6 +330,10 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
 			if (e < s_e)
 				goto next_entry2;
+			if (filter->dev &&
+			    !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+				goto next_entry2;
+
 			if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
 				 cb->nlh->nlmsg_seq, mfc,
 				 RTM_NEWROUTE, NLM_F_MULTI) < 0) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index dbd5166c5599..a7593d1c372c 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2470,5 +2470,5 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 	return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
-				_ip6mr_fill_mroute, &mfc_unres_lock);
+				_ip6mr_fill_mroute, &mfc_unres_lock, &filter);
 }
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH net-next 6/9] net: Enable kernel side filtering of route dumps
  2018-10-11 15:06 [PATCH net-next 0/9] net: Kernel side filtering for route dumps David Ahern
                   ` (4 preceding siblings ...)
  2018-10-11 15:06 ` [PATCH net-next 5/9] net: Plumb support for filtering ipv4 and ipv6 multicast " David Ahern
@ 2018-10-11 15:06 ` David Ahern
  2018-10-11 15:06 ` [PATCH net-next 7/9] net/mpls: Handle " David Ahern
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 30+ messages in thread
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Update parsing of route dump request to enable kernel side filtering.
Allow filtering results by protocol (e.g., which routing daemon installed
the route), route type (e.g., unicast), table id and nexthop device. These
amount to the low hanging fruit, yet a huge improvement, for dumping
routes.

ip_valid_fib_dump_req is called with RTNL held, so __dev_get_by_index can
be used to look up the device by index without taking a reference. From
there filter->dev is only used during dump loops with the lock still held.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv4/fib_frontend.c | 45 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1528b0919951..a99f2c7ba4e6 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -806,7 +806,11 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack)
 {
+	struct nlattr *tb[RTA_MAX + 1];
 	struct rtmsg *rtm;
+	int err, i;
+
+	ASSERT_RTNL();
 
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 		NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
@@ -815,8 +819,7 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 
 	rtm = nlmsg_data(nlh);
 	if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
-	    rtm->rtm_table   || rtm->rtm_protocol || rtm->rtm_scope ||
-	    rtm->rtm_type) {
+	    rtm->rtm_scope) {
 		NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
 		return -EINVAL;
 	}
@@ -825,9 +828,41 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 		return -EINVAL;
 	}
 
-	if (nlmsg_attrlen(nlh, sizeof(*rtm))) {
-		NL_SET_ERR_MSG(extack, "Invalid data after header in FIB dump request");
-		return -EINVAL;
+	filter->flags    = rtm->rtm_flags;
+	filter->protocol = rtm->rtm_protocol;
+	filter->rt_type  = rtm->rtm_type;
+	filter->table_id = rtm->rtm_table;
+
+	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+				 rtm_ipv4_policy, extack);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= RTA_MAX; ++i) {
+		int ifindex;
+
+		if (!tb[i])
+			continue;
+
+		switch (i) {
+		case RTA_TABLE:
+			filter->table_id = nla_get_u32(tb[i]);
+			break;
+		case RTA_OIF:
+			ifindex = nla_get_u32(tb[i]);
+			filter->dev = __dev_get_by_index(net, ifindex);
+			if (!filter->dev)
+				return -ENODEV;
+			break;
+		default:
+			NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+			return -EINVAL;
+		}
+	}
+
+	if (filter->flags || filter->protocol || filter->rt_type ||
+	    filter->table_id || filter->dev) {
+		filter->filter_set = 1;
 	}
 
 	return 0;
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH net-next 7/9] net/mpls: Handle kernel side filtering of route dumps
  2018-10-11 15:06 [PATCH net-next 0/9] net: Kernel side filtering for route dumps David Ahern
                   ` (5 preceding siblings ...)
  2018-10-11 15:06 ` [PATCH net-next 6/9] net: Enable kernel side filtering of " David Ahern
@ 2018-10-11 15:06 ` David Ahern
  2018-10-11 15:06 ` [PATCH net-next 8/9] net/ipv6: Bail early if user only wants cloned entries David Ahern
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 30+ messages in thread
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Update the dump request parsing in MPLS for the non-INET case to
enable kernel side filtering. If INET is disabled the only filters
that make sense for MPLS are protocol and nexthop device.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/mpls/af_mpls.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 48f4cbd9fb38..b256de02251b 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2043,7 +2043,9 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
+	struct nlattr *tb[RTA_MAX + 1];
 	struct rtmsg *rtm;
+	int err, i;
 
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 		NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request");
@@ -2052,15 +2054,35 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 
 	rtm = nlmsg_data(nlh);
 	if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
-	    rtm->rtm_table   || rtm->rtm_protocol || rtm->rtm_scope ||
-	    rtm->rtm_type    || rtm->rtm_flags) {
+	    rtm->rtm_table   || rtm->rtm_scope    || rtm->rtm_type  ||
+	    rtm->rtm_flags) {
 		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for FIB dump request");
 		return -EINVAL;
 	}
 
-	if (nlmsg_attrlen(nlh, sizeof(*rtm))) {
-		NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in FIB dump request");
-		return -EINVAL;
+	if (rtm->rtm_protocol) {
+		filter->protocol = rtm->rtm_protocol;
+		filter->filter_set = 1;
+	}
+
+	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+				 rtm_mpls_policy, extack);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= RTA_MAX; ++i) {
+		int ifindex;
+
+		if (i == RTA_OIF) {
+			ifindex = nla_get_u32(tb[i]);
+			filter->dev = __dev_get_by_index(net, ifindex);
+			if (!filter->dev)
+				return -ENODEV;
+			filter->filter_set = 1;
+		} else if (tb[i]) {
+			NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+			return -EINVAL;
+		}
 	}
 
 	return 0;
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH net-next 8/9] net/ipv6: Bail early if user only wants cloned entries
  2018-10-11 15:06 [PATCH net-next 0/9] net: Kernel side filtering for route dumps David Ahern
                   ` (6 preceding siblings ...)
  2018-10-11 15:06 ` [PATCH net-next 7/9] net/mpls: Handle " David Ahern
@ 2018-10-11 15:06 ` David Ahern
  2018-10-11 15:06 ` [PATCH net-next 9/9] net/ipv4: Bail early if user only wants prefix entries David Ahern
  2018-10-11 15:26 ` [PATCH net-next 0/9] net: Kernel side filtering for route dumps Stephen Hemminger
  9 siblings, 0 replies; 30+ messages in thread
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Similar to IPv4, IPv6 fib no longer contains cloned routes. If a user
requests a route dump for only cloned entries, no sense walking the FIB
and returning everything.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/ip6_fib.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index dd6a43874192..0399fafc5136 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -583,10 +583,13 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
 		struct rtmsg *rtm = nlmsg_data(nlh);
 
-		if (rtm->rtm_flags & RTM_F_PREFIX)
-			arg.filter.flags = RTM_F_PREFIX;
+		arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
 	}
 
+	/* fib entries are never clones */
+	if (arg.filter.flags & RTM_F_CLONED)
+		return skb->len;
+
 	s_h = cb->args[0];
 	s_e = cb->args[1];
 
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH net-next 9/9] net/ipv4: Bail early if user only wants prefix entries
  2018-10-11 15:06 [PATCH net-next 0/9] net: Kernel side filtering for route dumps David Ahern
                   ` (7 preceding siblings ...)
  2018-10-11 15:06 ` [PATCH net-next 8/9] net/ipv6: Bail early if user only wants cloned entries David Ahern
@ 2018-10-11 15:06 ` David Ahern
  2018-10-11 15:26 ` [PATCH net-next 0/9] net: Kernel side filtering for route dumps Stephen Hemminger
  9 siblings, 0 replies; 30+ messages in thread
From: David Ahern @ 2018-10-11 15:06 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Unlike IPv6, IPv4 does not have routes marked with RTF_PREFIX_RT. If the
flag is set in the dump request, just return.

In the process of this change, move the CLONE check to use the new
filter flags.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv4/fib_frontend.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index a99f2c7ba4e6..3e5e3c380dbb 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -884,10 +884,14 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		err = ip_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
+	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+		struct rtmsg *rtm = nlmsg_data(nlh);
+
+		filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
 	}
 
-	if (nlmsg_len(nlh) >= sizeof(struct rtmsg) &&
-	    ((struct rtmsg *)nlmsg_data(nlh))->rtm_flags & RTM_F_CLONED)
+	/* fib entries are never clones and ipv4 does not use prefix flag */
+	if (filter.flags & (RTM_F_PREFIX | RTM_F_CLONED))
 		return skb->len;
 
 	s_h = cb->args[0];
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 15:06 [PATCH net-next 0/9] net: Kernel side filtering for route dumps David Ahern
                   ` (8 preceding siblings ...)
  2018-10-11 15:06 ` [PATCH net-next 9/9] net/ipv4: Bail early if user only wants prefix entries David Ahern
@ 2018-10-11 15:26 ` Stephen Hemminger
  2018-10-11 15:32   ` David Ahern
  2018-10-11 15:46   ` Sowmini Varadhan
  9 siblings, 2 replies; 30+ messages in thread
From: Stephen Hemminger @ 2018-10-11 15:26 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, davem, David Ahern

On Thu, 11 Oct 2018 08:06:18 -0700
David Ahern <dsahern@kernel.org> wrote:

> From: David Ahern <dsahern@gmail.com>
> 
> Implement kernel side filtering of route dumps by protocol (e.g., which
> routing daemon installed the route), route type (e.g., unicast), table
> id and nexthop device.
> 
> iproute2 has been doing this filtering in userspace for years; pushing
> the filters to the kernel side reduces the amount of data the kernel
> sends and reduces wasted cycles on both sides processing unwanted data.
> These initial options provide a huge improvement for efficiently
> examining routes on large scale systems.
> 
> David Ahern (9):
>   net: Add struct for fib dump filter
>   net/ipv4: Plumb support for filtering route dumps
>   net/ipv6: Plumb support for filtering route dumps
>   net/mpls: Plumb support for filtering route dumps
>   net: Plumb support for filtering ipv4 and ipv6 multicast route dumps
>   net: Enable kernel side filtering of route dumps
>   net/mpls: Handle kernel side filtering of route dumps
>   net/ipv6: Bail early if user only wants cloned entries
>   net/ipv4: Bail early if user only wants prefix entries
> 
>  include/linux/mroute_base.h |  5 +--
>  include/net/ip6_route.h     |  1 +
>  include/net/ip_fib.h        | 14 ++++++--
>  net/ipv4/fib_frontend.c     | 64 +++++++++++++++++++++++++++------
>  net/ipv4/fib_trie.c         | 37 +++++++++++++------
>  net/ipv4/ipmr.c             |  8 +++--
>  net/ipv4/ipmr_base.c        | 33 ++++++++++++++++-
>  net/ipv6/ip6_fib.c          | 17 +++++++--
>  net/ipv6/ip6mr.c            |  7 ++--
>  net/ipv6/route.c            | 40 ++++++++++++++++-----
>  net/mpls/af_mpls.c          | 86 +++++++++++++++++++++++++++++++++++++++------
>  11 files changed, 262 insertions(+), 50 deletions(-)
> 

You can do something like this already with BPF socket filters.
But writing BPF for multi-part messages is hard.

Maybe a generic eBPF filter mechanism would be more flexible?

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 15:26 ` [PATCH net-next 0/9] net: Kernel side filtering for route dumps Stephen Hemminger
@ 2018-10-11 15:32   ` David Ahern
  2018-10-11 16:10     ` Sowmini Varadhan
  2018-10-11 15:46   ` Sowmini Varadhan
  1 sibling, 1 reply; 30+ messages in thread
From: David Ahern @ 2018-10-11 15:32 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern; +Cc: netdev, davem

On 10/11/18 9:26 AM, Stephen Hemminger wrote:
>>
> 
> You can do the something like this already with BPF socket filters.
> But writing BPF for multi-part messages is hard.
> 
> Maybe a generic eBPF filter mechanism would be more flexible?
> 

That exists today and does not cover what is needed here:
1. The filters apply *after* the skb has been filled in.

2. an skb will have many routes within it and the user filter could
apply to any of those messages within the skb. It is not efficient to
generate the skb and then re-create it with a bpf filter.

The point here is to not even fill in the skb for something userspace
does not care about.

Route dumps are done for the entire FIB for each address family. As we
approach internet routing tables (700k+ routes for IPv4, currently
around 55k for IPv6) with many VRFs dumping the entire table is grossly
inefficient when for example only a single VRF table is wanted.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 15:26 ` [PATCH net-next 0/9] net: Kernel side filtering for route dumps Stephen Hemminger
  2018-10-11 15:32   ` David Ahern
@ 2018-10-11 15:46   ` Sowmini Varadhan
  2018-10-11 16:07     ` Jamal Hadi Salim
  1 sibling, 1 reply; 30+ messages in thread
From: Sowmini Varadhan @ 2018-10-11 15:46 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Ahern, netdev, davem, David Ahern

On (10/11/18 08:26), Stephen Hemminger wrote:
> You can do the something like this already with BPF socket filters.
> But writing BPF for multi-part messages is hard.

Indeed. And I was just experimenting with this for ARP just last week.
So to handle the case of "ip neigh show a.b.c.d" without walking through
the entire arp table and filtering in userspace, you could add a sk_filter()
hook like this:

@@ -2258,6 +2260,24 @@ static int neigh_fill_info(struct sk_buff *skb, struct ne
                goto nla_put_failure;
 
        nlmsg_end(skb, nlh);
+
+       /* XXX skb->sk can be null in the neigh_timer_handler->__neigh_notify 
+        * path. Revisit..
+        */
+       if (!skb->sk)
+               return 0;
+
+       /* pull/push skb->data pointers so that sk_filter only sees the
+        * most recent nlh that was just added.
+        */
+       len = skb->len - nlh->nlmsg_len;
+       skb_pull(skb, len);
+       ret = sk_filter(skb->sk, skb);
+       skb_push(skb, len);
+       if (ret)
+               nlmsg_cancel(skb, nlh);
        return 0;
 
Writing the cBPF filter is not horrible, due to the nla extension. e.g.,
to pass a filter that matches on if_index and ipv4 address, the bpf_asm
src below will do the job. The benefit of using cBPF is that we can
use this on older kernels as well.

        /*
         * Generated from the bpf_asm src
         * ldb [20]     ; len(nlmsghdr) + offsetof(ndm_ifindex)
         * jne sll->sll_ifindex, skip
         * ld #28       ; A <- len(nlmsghdr) + len(ndmsg), payload offset
         * ldx #1       ; X <-  NDA_DST
         * ld #nla      ; A <- offset(NDA_DST)
         * jeq #0, skip
         * tax
         * ld [x + 4]   ; A <- value(NDA_DST)
         * jneq htonl(addr), skip
         * ret #-1
         * skip: ret #0
         */
        struct sock_filter bpf_filter[] = {
                { 0x30,  0,  0, 0x00000014 },
                { 0x15,  0,  1, sll->sll_ifindex },
                { 0000,  0,  0, 0x0000001c },
                { 0x01,  0,  0, 0x00000001 },
                { 0x20,  0,  0, 0xfffff00c },
                { 0x15,  4,  0, 0000000000 },
                { 0x07,  0,  0, 0000000000 },
                { 0x40,  0,  0, 0x00000004 },
                { 0x15,  0,  1, htonl(addr) },
                { 0x06,  0,  0, 0xffffffff },
                { 0x06,  0,  0, 0000000000 },
                { 0x06,  0,  0, 0xffffffff },
                { 0x06,  0,  0, 0000000000 },
        };

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 2/9] net/ipv4: Plumb support for filtering route dumps
  2018-10-11 15:06 ` [PATCH net-next 2/9] net/ipv4: Plumb support for filtering route dumps David Ahern
@ 2018-10-11 15:56   ` Andrew Lunn
  2018-10-11 16:44     ` David Ahern
  0 siblings, 1 reply; 30+ messages in thread
From: Andrew Lunn @ 2018-10-11 15:56 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, davem, David Ahern

> @@ -866,10 +866,13 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
>  		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
>  			if (e < s_e)
>  				goto next;
> +			if (filter.table_id && filter.table_id != tb->tb_id)
> +				goto next;
> +

Hi David

Should there be a test here that filter->filter_set is set, before
looking at filter.table_id?

>  			if (dumped)
>  				memset(&cb->args[2], 0, sizeof(cb->args) -
>  						 2 * sizeof(cb->args[0]));
> -			err = fib_table_dump(tb, skb, cb);
> +			err = fib_table_dump(tb, skb, cb, &filter);
>  			if (err < 0) {
>  				if (likely(skb->len))
>  					goto out;
> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
> index 5bc0c89e81e4..237c9f72b265 100644
> --- a/net/ipv4/fib_trie.c
> +++ b/net/ipv4/fib_trie.c
> @@ -2003,12 +2003,17 @@ void fib_free_table(struct fib_table *tb)
>  }
>  
>  static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
> -			     struct sk_buff *skb, struct netlink_callback *cb)
> +			     struct sk_buff *skb, struct netlink_callback *cb,
> +			     struct fib_dump_filter *filter)
>  {
> +	unsigned int flags = NLM_F_MULTI;
>  	__be32 xkey = htonl(l->key);
>  	struct fib_alias *fa;
>  	int i, s_i;
>  
> +	if (filter->filter_set)
> +		flags |= NLM_F_DUMP_FILTERED;

With the above code, it seems like table_id could be filtered without
setting this flag to indicate some filters have been applied?

	Andrew

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 15:46   ` Sowmini Varadhan
@ 2018-10-11 16:07     ` Jamal Hadi Salim
  2018-10-11 16:16       ` David Ahern
  0 siblings, 1 reply; 30+ messages in thread
From: Jamal Hadi Salim @ 2018-10-11 16:07 UTC (permalink / raw)
  To: Sowmini Varadhan, Stephen Hemminger
  Cc: David Ahern, netdev, davem, David Ahern

On 2018-10-11 11:46 a.m., Sowmini Varadhan wrote:
> On (10/11/18 08:26), Stephen Hemminger wrote:
>> You can do the something like this already with BPF socket filters.
>> But writing BPF for multi-part messages is hard.
> 
> Indeed. And I was just experimenting with this for ARP just last week.
> So to handle the caes of "ip neigh show a.b.c.d" without walking through
> the entire arp table and filtering in userspace, you could add a sk_filter()
> hook like this:
> 

If this could be done a lot earlier aka at xxx_fill_info() bpf would
be a very good answer.
skb->sk (hence attached filter) should be available at that point.
Classical bpf per Sowmini's example maybe trickier.
Better - why don't we have an ebpf hook at this stage and then
we don't have to make changes to the kernel when someone adds
one more field to the filter?

BTW: useful for events as well - not just dumps (as the name
fib_dump_filter suggests)

cheers,
jamal

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 15:32   ` David Ahern
@ 2018-10-11 16:10     ` Sowmini Varadhan
  2018-10-11 16:13       ` David Ahern
  0 siblings, 1 reply; 30+ messages in thread
From: Sowmini Varadhan @ 2018-10-11 16:10 UTC (permalink / raw)
  To: David Ahern; +Cc: Stephen Hemminger, David Ahern, netdev, davem

On (10/11/18 09:32), David Ahern wrote:
> 
> Route dumps are done for the entire FIB for each address family. As we
> approach internet routing tables (700k+ routes for IPv4, currently
> around 55k for IPv6) with many VRFs dumping the entire table is grossly
> inefficient when for example only a single VRF table is wanted.

I think someone mentioned a long time ago that a VRF is not an 
interface/driver/net_device but rather a separate routing table with a
dedicated set of interfaces, iirc :-) :-)

In the latter model, if you wanted to dump a VRF table, you'd only
lock that table, and walk it, instead of holding up other VRFS 

sorry, could not resist my i-told-you-so moment :-P

--Sowmini

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 16:10     ` Sowmini Varadhan
@ 2018-10-11 16:13       ` David Ahern
  0 siblings, 0 replies; 30+ messages in thread
From: David Ahern @ 2018-10-11 16:13 UTC (permalink / raw)
  To: Sowmini Varadhan; +Cc: Stephen Hemminger, David Ahern, netdev, davem

On 10/11/18 10:10 AM, Sowmini Varadhan wrote:
> On (10/11/18 09:32), David Ahern wrote:
>>
>> Route dumps are done for the entire FIB for each address family. As we
>> approach internet routing tables (700k+ routes for IPv4, currently
>> around 55k for IPv6) with many VRFs dumping the entire table is grossly
>> inefficient when for example only a single VRF table is wanted.
> 
> I think someone mentioned a long time ago that a VRF is not an 
> interface/driver/net_device but rather a separate routing table with a
> dedicated set of interfaces, iirc :-) :-)
> 
> In the latter model, if you wanted to dump a VRF table, you'd only
> lock that table, and walk it, instead of holding up other VRFS 
> 
> sorry, could not resist my i-told-you-so moment :-P

huh?

"VRF is a device" has absolutely no relevance to this patch set.

There is no existing mechanism for dumping a single table. That's one of
the points of this set.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 16:07     ` Jamal Hadi Salim
@ 2018-10-11 16:16       ` David Ahern
  2018-10-11 16:33         ` Roopa Prabhu
  2018-10-11 16:46         ` Jamal Hadi Salim
  0 siblings, 2 replies; 30+ messages in thread
From: David Ahern @ 2018-10-11 16:16 UTC (permalink / raw)
  To: Jamal Hadi Salim, Sowmini Varadhan, Stephen Hemminger
  Cc: David Ahern, netdev, davem

On 10/11/18 10:07 AM, Jamal Hadi Salim wrote:
> On 2018-10-11 11:46 a.m., Sowmini Varadhan wrote:
>> On (10/11/18 08:26), Stephen Hemminger wrote:
>>> You can do the something like this already with BPF socket filters.
>>> But writing BPF for multi-part messages is hard.
>>
>> Indeed. And I was just experimenting with this for ARP just last week.
>> So to handle the caes of "ip neigh show a.b.c.d" without walking through
>> the entire arp table and filtering in userspace, you could add a
>> sk_filter()
>> hook like this:
>>
> 
> If this could be done a lot earlier aka at xxx_fill_info() bpf would
> be a very good answer.

IMO, bpf at the fill_info stage is not appropriate.


> skb->sk (hence attached filter) should be available at that point.
> Classical bpf per Sowmini's example maybe trickier.
> Better - why dont we have an ebpf hook at this stage and then
> we dont have to make changes to the kernel when someone adds
> one more field to the filter?
> 
> BTW: useful for events as well - not just dumps (as the name
> fib_dump_filter suggests)

you mean kernel notifications on internal events?
1. there is no user socket when notifications are created and the
*_fill_info is invoked

2. notifications are global going to potentially many sockets. For these
cases the existing sk_filter is appropriate.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 16:16       ` David Ahern
@ 2018-10-11 16:33         ` Roopa Prabhu
  2018-10-11 16:37           ` Sowmini Varadhan
  2018-10-11 16:46         ` Jamal Hadi Salim
  1 sibling, 1 reply; 30+ messages in thread
From: Roopa Prabhu @ 2018-10-11 16:33 UTC (permalink / raw)
  To: David Ahern
  Cc: Jamal Hadi Salim, Sowmini Varadhan, Stephen Hemminger, dsahern,
	netdev, David Miller

On Thu, Oct 11, 2018 at 9:16 AM David Ahern <dsahern@gmail.com> wrote:
>
> On 10/11/18 10:07 AM, Jamal Hadi Salim wrote:
> > On 2018-10-11 11:46 a.m., Sowmini Varadhan wrote:
> >> On (10/11/18 08:26), Stephen Hemminger wrote:
> >>> You can do the something like this already with BPF socket filters.
> >>> But writing BPF for multi-part messages is hard.
> >>
> >> Indeed. And I was just experimenting with this for ARP just last week.
> >> So to handle the caes of "ip neigh show a.b.c.d" without walking through
> >> the entire arp table and filtering in userspace, you could add a
> >> sk_filter()
> >> hook like this:
> >>
> >
> > If this could be done a lot earlier aka at xxx_fill_info() bpf would
> > be a very good answer.
>
> IMO, bpf at the fill_info stage is not appropriate.
>
>
> > skb->sk (hence attached filter) should be available at that point.
> > Classical bpf per Sowmini's example maybe trickier.
> > Better - why dont we have an ebpf hook at this stage and then
> > we dont have to make changes to the kernel when someone adds
> > one more field to the filter?
> >
> > BTW: useful for events as well - not just dumps (as the name
> > fib_dump_filter suggests)
>
> you mean kernel notifications on internal events?
> 1. there is no user socket when notifications are created and the
> *_fill_info is invoked
>
> 2. notifications are global going to potentially many sockets. For these
> cases the existing sk_filter is appropriate.

3. All networking subsystems already have this type of netlink
attribute filtering that apps rely on. This series
just makes it consistent for route dumps. Apps use such mechanism
already when requesting dumps.
Like everywhere else, BPF hook can be an alternate parallel mechanism.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 16:33         ` Roopa Prabhu
@ 2018-10-11 16:37           ` Sowmini Varadhan
  0 siblings, 0 replies; 30+ messages in thread
From: Sowmini Varadhan @ 2018-10-11 16:37 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: David Ahern, Jamal Hadi Salim, Stephen Hemminger, dsahern,
	netdev, David Miller

On (10/11/18 09:33), Roopa Prabhu wrote:
> 3. All networking subsystems already have this type of netlink
> attribute filtering that apps rely on. This series
> just makes it consistent for route dumps. Apps use such mechanism
> already when requesting dumps.
> Like everywhere else, BPF hook can be an alternate parallel mechanism.

Sure, and that makes sense, though I hope we will explore those
alternate mechanisms too.

--Sowmini

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 2/9] net/ipv4: Plumb support for filtering route dumps
  2018-10-11 15:56   ` Andrew Lunn
@ 2018-10-11 16:44     ` David Ahern
  2018-10-11 18:30       ` Andrew Lunn
  0 siblings, 1 reply; 30+ messages in thread
From: David Ahern @ 2018-10-11 16:44 UTC (permalink / raw)
  To: Andrew Lunn, David Ahern; +Cc: netdev, davem

On 10/11/18 9:56 AM, Andrew Lunn wrote:
>> @@ -866,10 +866,13 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
>>  		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
>>  			if (e < s_e)
>>  				goto next;
>> +			if (filter.table_id && filter.table_id != tb->tb_id)
>> +				goto next;
>> +
> 
> Hi David
> 
> Should there be a test here that filter->filter_set is set, before
> looking at filter.table_id.

filter_set is meant for places that would need to look at multiple flags.

But now that you point this out, if a table id is passed in I should do
a get on the table and not walk the hash list.


> 
>>  			if (dumped)
>>  				memset(&cb->args[2], 0, sizeof(cb->args) -
>>  						 2 * sizeof(cb->args[0]));
>> -			err = fib_table_dump(tb, skb, cb);
>> +			err = fib_table_dump(tb, skb, cb, &filter);
>>  			if (err < 0) {
>>  				if (likely(skb->len))
>>  					goto out;
>> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
>> index 5bc0c89e81e4..237c9f72b265 100644
>> --- a/net/ipv4/fib_trie.c
>> +++ b/net/ipv4/fib_trie.c
>> @@ -2003,12 +2003,17 @@ void fib_free_table(struct fib_table *tb)
>>  }
>>  
>>  static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
>> -			     struct sk_buff *skb, struct netlink_callback *cb)
>> +			     struct sk_buff *skb, struct netlink_callback *cb,
>> +			     struct fib_dump_filter *filter)
>>  {
>> +	unsigned int flags = NLM_F_MULTI;
>>  	__be32 xkey = htonl(l->key);
>>  	struct fib_alias *fa;
>>  	int i, s_i;
>>  
>> +	if (filter->filter_set)
>> +		flags |= NLM_F_DUMP_FILTERED;
> 
> With the above code, it seems like table_id could be filtered without
> setting this flag to indicate some filters have been applied?

Right, I should handle that.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 16:16       ` David Ahern
  2018-10-11 16:33         ` Roopa Prabhu
@ 2018-10-11 16:46         ` Jamal Hadi Salim
  2018-10-11 17:04           ` David Ahern
  1 sibling, 1 reply; 30+ messages in thread
From: Jamal Hadi Salim @ 2018-10-11 16:46 UTC (permalink / raw)
  To: David Ahern, Sowmini Varadhan, Stephen Hemminger
  Cc: David Ahern, netdev, davem

On 2018-10-11 12:16 p.m., David Ahern wrote:

> IMO, bpf at the fill_info stage is not appropriate.
> 

Somewhere before the skb is formed (and nlmsg is built).
If you go as far as constructing it, then cBPF per what
Sowmini showed should work; but there will be constructs which
are trickier.

> 
>> skb->sk (hence attached filter) should be available at that point.
>> Classical bpf per Sowmini's example maybe trickier.
>> Better - why dont we have an ebpf hook at this stage and then
>> we dont have to make changes to the kernel when someone adds
>> one more field to the filter?
>>
>> BTW: useful for events as well - not just dumps (as the name
>> fib_dump_filter suggests)
> 
> you mean kernel notifications on internal events?
> 1. there is no user socket when notifications are created and the
> *_fill_info is invoked
> 
> 2. notifications are global going to potentially many sockets. For these
> cases the existing sk_filter is appropriate.

#2 - netlink being a broad/multicast bus with attached listeners.

Yes, you can do it with cBPF but some complexity may occur. Example:
if I were interested in netdevice events of "kind = vxlan &&
admin flag is down" then that is non-trivial to do with classical BPF but
would be reasonably comfortable to do with ebpf.
Note:
That filter will work fine for dumps as well since the semantics
are the same.
the win is:  in the future when you just wanna add that one new
filter attribute you dont need a kernel patch in and roll in a
new production kernel.

cheers,
jamal

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 16:46         ` Jamal Hadi Salim
@ 2018-10-11 17:04           ` David Ahern
  2018-10-11 18:05             ` Jamal Hadi Salim
  0 siblings, 1 reply; 30+ messages in thread
From: David Ahern @ 2018-10-11 17:04 UTC (permalink / raw)
  To: Jamal Hadi Salim, Sowmini Varadhan, Stephen Hemminger
  Cc: David Ahern, netdev, davem

On 10/11/18 10:46 AM, Jamal Hadi Salim wrote:
> On 2018-10-11 12:16 p.m., David Ahern wrote:
> 
> Yes, you can do it with cBPF but some complexity may occur. Example:
> if i was interested to netdevice events of "kind = vxlan &&
> admin flag is down" then that is non trivial to do with classical but
> would be reasonably comfortable to do with ebpf.
> Note:
> That filter will work fine for dumps as well since the semantics
> are the same.

You can already filter link dumps by kind. How? By passing in the KIND
attribute on a dump request. This type of filtering exists for link
dumps, neighbor dumps, fdb dumps. Why is there a push to make route
dumps different? Why can't they be consistent and use existing semantics?

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 17:04           ` David Ahern
@ 2018-10-11 18:05             ` Jamal Hadi Salim
  2018-10-11 18:44               ` David Ahern
  0 siblings, 1 reply; 30+ messages in thread
From: Jamal Hadi Salim @ 2018-10-11 18:05 UTC (permalink / raw)
  To: David Ahern, Sowmini Varadhan, Stephen Hemminger
  Cc: David Ahern, netdev, davem

On 2018-10-11 1:04 p.m., David Ahern wrote:

> You can already filter link dumps by kind. How? By passing in the KIND
> attribute on a dump request. This type of filtering exists for link
> dumps, neighbor dumps, fdb dumps. Why is there a push to make route
> dumps different? Why can't they be consistent and use existing semantics?

I think you meant filtering by ifindex in neighbor.
note: I would argue that there are already "ad hoc" ways of filtering
in place (mostly use-case driven). Otherwise Sowmini wouldn't have to
craft that bpf filter. There are netlink users who have none or some
weird filtering involved. There is no argument that your approach
works for rtm. But the rest of the users missing filters will require
similar kernel changes. Could this be made generic enough to benefit
other netlink users?
The problem is there's always one new attribute that would make sense
for some use case which requires a kernel change ("send me an event only
if you get link down" or "dump all ports with link down").

cheers,
jamal

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 2/9] net/ipv4: Plumb support for filtering route dumps
  2018-10-11 16:44     ` David Ahern
@ 2018-10-11 18:30       ` Andrew Lunn
  0 siblings, 0 replies; 30+ messages in thread
From: Andrew Lunn @ 2018-10-11 18:30 UTC (permalink / raw)
  To: David Ahern; +Cc: David Ahern, netdev, davem

On Thu, Oct 11, 2018 at 10:44:04AM -0600, David Ahern wrote:
> On 10/11/18 9:56 AM, Andrew Lunn wrote:
> >> @@ -866,10 +866,13 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
> >>  		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
> >>  			if (e < s_e)
> >>  				goto next;
> >> +			if (filter.table_id && filter.table_id != tb->tb_id)
> >> +				goto next;
> >> +
> > 
> > Hi David
> > 
> > Should there be a test here that filter->filter_set is set, before
> > looking at filter.table_id.
> 
> filter_set is meant for places that would need to look at multiple flags.

Hi David

It would be good to add some comments to struct fib_dump_filter. Maybe
also move table_id before filter_set, to try to indicate it is not
relevant for the table_id.

	 Andrew

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 18:05             ` Jamal Hadi Salim
@ 2018-10-11 18:44               ` David Ahern
  2018-10-11 19:28                 ` David Miller
  2018-10-11 19:54                 ` Jamal Hadi Salim
  0 siblings, 2 replies; 30+ messages in thread
From: David Ahern @ 2018-10-11 18:44 UTC (permalink / raw)
  To: Jamal Hadi Salim, Sowmini Varadhan, Stephen Hemminger
  Cc: David Ahern, netdev, davem

On 10/11/18 12:05 PM, Jamal Hadi Salim wrote:
> On 2018-10-11 1:04 p.m., David Ahern wrote:
> 
>> You can already filter link dumps by kind. How? By passing in the KIND
>> attribute on a dump request. This type of filtering exists for link
>> dumps, neighbor dumps, fdb dumps. Why is there a push to make route
>> dumps different? Why can't they be consistent and use existing semantics?
> 
> I think you meant filtering by ifindex in neighbor.

I meant the general API of users passing filter arguments as attributes
to the dump (or values in the header) -- KIND, MASTER, device index,
etc. This is an existing API and existing capability.

> note: I would argue that there are already "ad hoc" ways of filtering
> in place (mostly use-case driven). Otherwise Sowmini wouldn't have to
> craft that bpf filter. There are netlink users who have none or some
> weird filtering involved. There is no argument that your approach
> works for rtm. But the rest of the users missing filters will require
> similar kernel changes. Could this be made generic enough to benefit
> other netlink users?
> The problem is there's always one new attribute that would make sense
> for some use case which requires a kernel change ("send me an event only
> if you get link down" or "dump all ports with link down").
> 

I disagree with your overall premise of bpf as the end-all hammer. It is a
tool but not the only tool. For starters, you are proposing building the
message, run the filter on it, and potentially back the message up to
drop the recently added piece because the filter does not want it
included. That is still wasting a lot of cpu cycles to build and drop. I
am thinking about scale to 1 million routes -- I do not need the dump
loop building a message for 1 million entries only to drop 99% of them.
That is crazy.

The way the kernel manages route tables says I should pass in the table
id as it is a major differentiator on what is returned. From there
lookup the specific table (I need to fix this part per my response to
Andrew), and then only walk it. The existing semantics and capabilities
that exist for other dump commands are the most efficient for some of
these high-level, big-hammer filters.

What you want gets into the tiniest of details and yes the imagination
can go wild with combinations of filter options. So maybe this scanning
of post-built messages is reasonable *after* the high level sorting is done.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 18:44               ` David Ahern
@ 2018-10-11 19:28                 ` David Miller
  2018-10-11 19:32                   ` Sowmini Varadhan
  2018-10-11 19:54                 ` Jamal Hadi Salim
  1 sibling, 1 reply; 30+ messages in thread
From: David Miller @ 2018-10-11 19:28 UTC (permalink / raw)
  To: dsahern; +Cc: jhs, sowmini.varadhan, stephen, dsahern, netdev

From: David Ahern <dsahern@gmail.com>
Date: Thu, 11 Oct 2018 12:44:49 -0600

> I disagree with your overall premise of bpf as the end-all hammer. It is a
> tool but not the only tool. For starters, you are proposing building the
> message, run the filter on it, and potentially back the message up to
> drop the recently added piece because the filter does not want it
> included. That is still wasting a lot of cpu cycles to build and drop. I
> am thinking about scale to 1 million routes -- I do not need the dump
> loop building a message for 1 million entries only to drop 99% of them.
> That is crazy.

I completely agree.

Once you've composed the message, the whole point of filtering is lost.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 19:28                 ` David Miller
@ 2018-10-11 19:32                   ` Sowmini Varadhan
  2018-10-11 19:43                     ` David Miller
  0 siblings, 1 reply; 30+ messages in thread
From: Sowmini Varadhan @ 2018-10-11 19:32 UTC (permalink / raw)
  To: David Miller; +Cc: dsahern, jhs, stephen, dsahern, netdev

Without getting into Ahern's patchset, which he obviously feels 
quite passionately about..

On (10/11/18 12:28), David Miller wrote:
> 
> Once you've composed the message, the whole point of filtering is lost.

it would be nice to apply the filter *before* constructing the skb, 
but afaict most things in BPF today only operate on sk_buffs. How should
we use *BPF on something other than an sk_buff?

--Sowmini

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 19:32                   ` Sowmini Varadhan
@ 2018-10-11 19:43                     ` David Miller
  0 siblings, 0 replies; 30+ messages in thread
From: David Miller @ 2018-10-11 19:43 UTC (permalink / raw)
  To: sowmini.varadhan; +Cc: dsahern, jhs, stephen, dsahern, netdev

From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 11 Oct 2018 15:32:48 -0400

> Without getting into Ahern's patchset, which he obviously feels 
> quite passionately about..
> 
> On (10/11/18 12:28), David Miller wrote:
>> 
>> Once you've composed the message, the whole point of filtering is lost.
> 
> it would be nice to apply the filter *before* constructing the skb, 
> but afaict most things in BPF today only operate on sk_buffs. How should
> we use *BPF on something other than an sk_buff?

Personally I'm not going to spend cycles on that.

What's important to me in the short term is that David's patch set is
an appropriate way to add filtering, using existing facilities and
mechanisms that already exist for that purpose.

If people want to explore a possible eBPF mechanism for the future,
with an emphasis on "future", feel free to explore to your heart's
content.

But that doesn't exist in any form whatsoever, so that's not what we
should be talking about here.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH net-next 0/9] net: Kernel side filtering for route dumps
  2018-10-11 18:44               ` David Ahern
  2018-10-11 19:28                 ` David Miller
@ 2018-10-11 19:54                 ` Jamal Hadi Salim
  1 sibling, 0 replies; 30+ messages in thread
From: Jamal Hadi Salim @ 2018-10-11 19:54 UTC (permalink / raw)
  To: David Ahern, Sowmini Varadhan, Stephen Hemminger
  Cc: David Ahern, netdev, davem

On 2018-10-11 2:44 p.m., David Ahern wrote:
> On 10/11/18 12:05 PM, Jamal Hadi Salim wrote:
>> On 2018-10-11 1:04 p.m., David Ahern wrote:

> 
> I meant the general API of users passing filter arguments as attributes
> to the dump (or values in the header) -- KIND, MASTER, device index,
> etc. This is an existing API and existing capability.
> 

which I referred to as use-case driven. It is not unreasonable
to optimize for the most common - but every time somebody
comes up with something new you need to patch the kernel.


> I disagree with your overall premise of bpf as the end-all hammer. It is a
> tool but not the only tool. For starters, you are proposing building the
> message, run the filter on it, and potentially back the message up to
> drop the recently added piece because the filter does not want it
> included. That is still wasting a lot of cpu cycles to build and drop. I
> am thinking about scale to 1 million routes -- I do not need the dump
> loop building a message for 1 million entries only to drop 99% of them.
> That is crazy.
> 

My earlier suggestion was for somewhere before the skb is formed,
in the vicinity of xxx_fill_info().
The "create skb and drop" kind works already today with some
acrobatics needed for some cases with cbpf.

Is it unfeasible to add an ebpf hook at that point and ask a user
supplied code "is this ok to send?" - this is no different
than doing a "get by key" operation where key/filter is any arbitrary
construction of fields rtm understands (including the ones you
provided like table index) that are passed in the user program.
Classical "select" mechanism in database tables

> The way the kernel manages route tables says I should pass in the table
> id as it is a major differentiator on what is returned. From there
> lookup the specific table (I need to fix this part per my response to
> Andrew), and then only walk it. The existing semantics and capabilities
> that exist for other dump commands are the most efficient for some of
> these high-level, big-hammer filters.
> 

Sure.

> What you want gets into the tiniest of details and yes the imagination
> can go wild with combinations of filter options. So maybe this scanning
> of post-built messages is reasonable *after* the high level sorting is done.
> 

That doesn't require any change.

cheers,
jamal

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2018-10-12  3:23 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-11 15:06 [PATCH net-next 0/9] net: Kernel side filtering for route dumps David Ahern
2018-10-11 15:06 ` [PATCH net-next 1/9] net: Add struct for fib dump filter David Ahern
2018-10-11 15:06 ` [PATCH net-next 2/9] net/ipv4: Plumb support for filtering route dumps David Ahern
2018-10-11 15:56   ` Andrew Lunn
2018-10-11 16:44     ` David Ahern
2018-10-11 18:30       ` Andrew Lunn
2018-10-11 15:06 ` [PATCH net-next 3/9] net/ipv6: " David Ahern
2018-10-11 15:06 ` [PATCH net-next 4/9] net/mpls: " David Ahern
2018-10-11 15:06 ` [PATCH net-next 5/9] net: Plumb support for filtering ipv4 and ipv6 multicast " David Ahern
2018-10-11 15:06 ` [PATCH net-next 6/9] net: Enable kernel side filtering of " David Ahern
2018-10-11 15:06 ` [PATCH net-next 7/9] net/mpls: Handle " David Ahern
2018-10-11 15:06 ` [PATCH net-next 8/9] net/ipv6: Bail early if user only wants cloned entries David Ahern
2018-10-11 15:06 ` [PATCH net-next 9/9] net/ipv4: Bail early if user only wants prefix entries David Ahern
2018-10-11 15:26 ` [PATCH net-next 0/9] net: Kernel side filtering for route dumps Stephen Hemminger
2018-10-11 15:32   ` David Ahern
2018-10-11 16:10     ` Sowmini Varadhan
2018-10-11 16:13       ` David Ahern
2018-10-11 15:46   ` Sowmini Varadhan
2018-10-11 16:07     ` Jamal Hadi Salim
2018-10-11 16:16       ` David Ahern
2018-10-11 16:33         ` Roopa Prabhu
2018-10-11 16:37           ` Sowmini Varadhan
2018-10-11 16:46         ` Jamal Hadi Salim
2018-10-11 17:04           ` David Ahern
2018-10-11 18:05             ` Jamal Hadi Salim
2018-10-11 18:44               ` David Ahern
2018-10-11 19:28                 ` David Miller
2018-10-11 19:32                   ` Sowmini Varadhan
2018-10-11 19:43                     ` David Miller
2018-10-11 19:54                 ` Jamal Hadi Salim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).