netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
@ 2012-02-12 19:13 Greg Rose
  2012-02-13 17:28 ` Stephen Hemminger
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Greg Rose @ 2012-02-12 19:13 UTC (permalink / raw)
  To: netdev; +Cc: davem

Implement a new nlattr type IFLA_EXT_MASK.  The mask is a 32 bit
value that can be used to indicate to the kernel that certain
extended ifinfo values are requested by the user application.
At this time the only mask value created is RTEXT_FILTER_VF to
indicate that the user wants the ifinfo dump to send information
about the VFs belonging to the interface.

I have kept the NLM_F_EXT nlmsg_flags bit to indicate to the kernel
that the extended ifinfo dump filter mask is present.  It does not
act as the filter itself which has changed since the first submission
of this RFC.  Older versions of user applications won't set this
flag which should fix the problem of some applications not allocating
a large enough buffer for all the extended interface information.

Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
---

 include/linux/if_link.h   |    1 +
 include/linux/netlink.h   |    2 +
 include/linux/rtnetlink.h |    9 ++++++
 include/net/rtnetlink.h   |    2 +
 net/core/rtnetlink.c      |   73 ++++++++++++++++++++++++++++++++-------------
 5 files changed, 65 insertions(+), 22 deletions(-)

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index c52d4b5..4b24ff4 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -137,6 +137,7 @@ enum {
 	IFLA_AF_SPEC,
 	IFLA_GROUP,		/* Group the device belongs to */
 	IFLA_NET_NS_FD,
+	IFLA_EXT_MASK,		/* Extended info mask, VFs, etc */
 	__IFLA_MAX
 };
 
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index a390e9d..fe9e938 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -59,6 +59,7 @@ struct nlmsghdr {
 #define NLM_F_MATCH	0x200	/* return all matching	*/
 #define NLM_F_ATOMIC	0x400	/* atomic GET		*/
 #define NLM_F_DUMP	(NLM_F_ROOT|NLM_F_MATCH)
+#define NLM_F_EXT	0x800	/* Get extended interface info such as VFs */
 
 /* Modifiers to NEW request */
 #define NLM_F_REPLACE	0x100	/* Override existing		*/
@@ -215,6 +216,7 @@ int netlink_sendskb(struct sock *sk, struct sk_buff *skb);
 #else
 #define NLMSG_GOODSIZE	SKB_WITH_OVERHEAD(8192UL)
 #endif
+#define NLMSG_EXT_GOODSIZE SKB_WITH_OVERHEAD(32768UL)
 
 #define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN)
 
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 8e872ea..89244b8 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -602,6 +602,15 @@ struct tcamsg {
 #define TCA_ACT_TAB 1 /* attr type must be >=1 */	
 #define TCAA_MAX 1
 
+struct rtnl_req_extended {
+	struct nlmsghdr nlh;
+	struct rtgenmsg g;
+	char ext[RTA_SPACE(sizeof(__u32))];
+};
+
+/* New extended info filters for IFLA_EXT_MASK */
+#define RTEXT_FILTER_VF		(1 << 0)
+
 /* End of information exported to user level */
 
 #ifdef __KERNEL__
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 678f1ff..3702939 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -6,7 +6,7 @@
 
 typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, void *);
 typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *);
-typedef u16 (*rtnl_calcit_func)(struct sk_buff *);
+typedef u16 (*rtnl_calcit_func)(struct sk_buff *, struct nlmsghdr *);
 
 extern int	__rtnl_register(int protocol, int msgtype,
 				rtnl_doit_func, rtnl_dumpit_func,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 65aebd4..67f8c95 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -60,7 +60,6 @@ struct rtnl_link {
 };
 
 static DEFINE_MUTEX(rtnl_mutex);
-static u16 min_ifinfo_dump_size;
 
 void rtnl_lock(void)
 {
@@ -724,10 +723,11 @@ static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
 }
 
 /* All VF info */
-static inline int rtnl_vfinfo_size(const struct net_device *dev)
+static inline int rtnl_vfinfo_size(const struct net_device *dev,
+				   u32 ext_filter_mask)
 {
-	if (dev->dev.parent && dev_is_pci(dev->dev.parent)) {
-
+	if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
+	    (ext_filter_mask & RTEXT_FILTER_VF)) {
 		int num_vfs = dev_num_vf(dev->dev.parent);
 		size_t size = nla_total_size(sizeof(struct nlattr));
 		size += nla_total_size(num_vfs * sizeof(struct nlattr));
@@ -766,7 +766,8 @@ static size_t rtnl_port_size(const struct net_device *dev)
 		return port_self_size;
 }
 
-static noinline size_t if_nlmsg_size(const struct net_device *dev)
+static noinline size_t if_nlmsg_size(const struct net_device *dev,
+				     u32 ext_filter_mask)
 {
 	return NLMSG_ALIGN(sizeof(struct ifinfomsg))
 	       + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
@@ -785,7 +786,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev)
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
 	       + nla_total_size(1) /* IFLA_LINKMODE */
 	       + nla_total_size(4) /* IFLA_NUM_VF */
-	       + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */
+	       + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
 	       + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
 	       + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
 	       + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */
@@ -868,7 +869,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)
 
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
-			    unsigned int flags)
+			    unsigned int flags, u32 ext_filter_mask)
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
@@ -941,10 +942,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 		goto nla_put_failure;
 	copy_rtnl_link_stats64(nla_data(attr), stats);
 
-	if (dev->dev.parent)
+	if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF))
 		NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent));
 
-	if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) {
+	if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent
+	    && (ext_filter_mask & RTEXT_FILTER_VF)) {
 		int i;
 
 		struct nlattr *vfinfo, *vf;
@@ -1048,6 +1050,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 	struct net_device *dev;
 	struct hlist_head *head;
 	struct hlist_node *node;
+	struct rtnl_req_extended *req;
+	u32 ext_filter_mask = 0;
 
 	s_h = cb->args[0];
 	s_idx = cb->args[1];
@@ -1055,6 +1059,17 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 	rcu_read_lock();
 	cb->seq = net->dev_base_seq;
 
+	if (cb->nlh->nlmsg_flags && NLM_F_EXT) {
+		struct rtattr *ext_req;
+		u32 *ext_req_data;
+		req = (struct rtnl_req_extended *)cb->nlh;
+		ext_req = (struct rtattr *)&req->ext;
+		if (ext_req->rta_type == IFLA_EXT_MASK) {
+			ext_req_data = RTA_DATA(ext_req);
+			ext_filter_mask = *ext_req_data;
+		}
+	}
+
 	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
 		idx = 0;
 		head = &net->dev_index_head[h];
@@ -1064,7 +1079,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 			if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
 					     NETLINK_CB(cb->skb).pid,
 					     cb->nlh->nlmsg_seq, 0,
-					     NLM_F_MULTI) <= 0)
+					     NLM_F_MULTI,
+					     ext_filter_mask) <= 0)
 				goto out;
 
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -1100,6 +1116,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_VF_PORTS]		= { .type = NLA_NESTED },
 	[IFLA_PORT_SELF]	= { .type = NLA_NESTED },
 	[IFLA_AF_SPEC]		= { .type = NLA_NESTED },
+	[IFLA_EXT_MASK]		= { .type = NLA_U32 },
 };
 EXPORT_SYMBOL(ifla_policy);
 
@@ -1509,8 +1526,6 @@ errout:
 
 	if (send_addr_notify)
 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
-	min_ifinfo_dump_size = max_t(u16, if_nlmsg_size(dev),
-				     min_ifinfo_dump_size);
 
 	return err;
 }
@@ -1861,12 +1876,12 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	if (dev == NULL)
 		return -ENODEV;
 
-	nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
+	nskb = nlmsg_new(if_nlmsg_size(dev, 0), GFP_KERNEL);
 	if (nskb == NULL)
 		return -ENOBUFS;
 
 	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid,
-			       nlh->nlmsg_seq, 0, 0);
+			       nlh->nlmsg_seq, 0, 0, 0);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size */
 		WARN_ON(err == -EMSGSIZE);
@@ -1877,9 +1892,26 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	return err;
 }
 
-static u16 rtnl_calcit(struct sk_buff *skb)
+static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
-	return min_ifinfo_dump_size;
+	struct rtnl_req_extended *req;
+	u32 ext_filter_mask = 0;
+
+	if (nlh->nlmsg_flags && NLM_F_EXT) {
+		struct rtattr *ext_req;
+		u32 *ext_req_data;
+		req = (struct rtnl_req_extended *)&nlh;
+		ext_req = (struct rtattr *)&req->ext;
+		if (ext_req->rta_type == IFLA_EXT_MASK) {
+			ext_req_data = RTA_DATA(ext_req);
+			ext_filter_mask = *ext_req_data;
+		}
+	}
+
+	if (ext_filter_mask & RTEXT_FILTER_VF)
+		return NLMSG_EXT_GOODSIZE;
+	else
+		return NLMSG_GOODSIZE;
 }
 
 static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
@@ -1913,13 +1945,11 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
 	int err = -ENOBUFS;
 	size_t if_info_size;
 
-	skb = nlmsg_new((if_info_size = if_nlmsg_size(dev)), GFP_KERNEL);
+	skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL);
 	if (skb == NULL)
 		goto errout;
 
-	min_ifinfo_dump_size = max_t(u16, if_info_size, min_ifinfo_dump_size);
-
-	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0);
+	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
@@ -1949,6 +1979,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	int type;
 	int err;
 
+
 	type = nlh->nlmsg_type;
 	if (type > RTM_MAX)
 		return -EOPNOTSUPP;
@@ -1977,7 +2008,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return -EOPNOTSUPP;
 		calcit = rtnl_get_calcit(family, type);
 		if (calcit)
-			min_dump_alloc = calcit(skb);
+			min_dump_alloc = calcit(skb, nlh);
 
 		__rtnl_unlock();
 		rtnl = net->rtnl;

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
  2012-02-12 19:13 [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation Greg Rose
@ 2012-02-13 17:28 ` Stephen Hemminger
  2012-02-13 18:24   ` Rose, Gregory V
  2012-02-14 21:13 ` Ben Hutchings
  2012-02-14 21:22 ` David Miller
  2 siblings, 1 reply; 13+ messages in thread
From: Stephen Hemminger @ 2012-02-13 17:28 UTC (permalink / raw)
  To: Greg Rose; +Cc: netdev, davem

On Sun, 12 Feb 2012 11:13:42 -0800
Greg Rose <gregory.v.rose@intel.com> wrote:

> Implement a new nlattr type IFLA_EXT_MASK.  The mask is a 32 bit
> value that can be used to indicate to the kernel that certain
> extended ifinfo values are requested by the user application.
> At this time the only mask value created is RTEXT_FILTER_VF to
> indicate that the user wants the ifinfo dump to send information
> about the VFs belonging to the interface.
> 
> I have kept the NLM_F_EXT nlmsg_flags bit to indicate to the kernel
> that the extended ifinfo dump filter mask is present.  It does not
> act as the filter itself which has changed since the first submission
> of this RFC.  Older versions of user applications won't set this
> flag which should fix the problem of some applications not allocating
> a large enough buffer for all the extended interface information.
> 
> Signed-off-by: Greg Rose <gregory.v.rose@intel.com>

Is u32 going to be enough in next 5 years?
Since it is TLV value it should work with bigger value;
could you try sending a oversize attribute and code kernel to ignore
bits it doesn't understand?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
  2012-02-13 17:28 ` Stephen Hemminger
@ 2012-02-13 18:24   ` Rose, Gregory V
  0 siblings, 0 replies; 13+ messages in thread
From: Rose, Gregory V @ 2012-02-13 18:24 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, davem

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On Behalf Of Stephen Hemminger
> Sent: Monday, February 13, 2012 9:29 AM
> To: Rose, Gregory V
> Cc: netdev@vger.kernel.org; davem@davemloft.net
> Subject: Re: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
> 
> On Sun, 12 Feb 2012 11:13:42 -0800
> Greg Rose <gregory.v.rose@intel.com> wrote:
> 
> > Implement a new nlattr type IFLA_EXT_MASK.  The mask is a 32 bit
> > value that can be used to indicate to the kernel that certain
> > extended ifinfo values are requested by the user application.
> > At this time the only mask value created is RTEXT_FILTER_VF to
> > indicate that the user wants the ifinfo dump to send information
> > about the VFs belonging to the interface.
> >
> > I have kept the NLM_F_EXT nlmsg_flags bit to indicate to the kernel
> > that the extended ifinfo dump filter mask is present.  It does not
> > act as the filter itself which has changed since the first submission
> > of this RFC.  Older versions of user applications won't set this
> > flag which should fix the problem of some applications not allocating
> > a large enough buffer for all the extended interface information.
> >
> > Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
> 
> Is u32 going to be enough in next 5 years?

I'm not very good at predicting the future.

;^)

> Since it is TLV value it should work with bigger value;
> could you try sending a oversize attribute and code kernel to ignore
> bits it doesn't understand?

That sounds like a good idea.  While I'm carving out some new space for the dump request message I might as well allocate some extra for future requirements.  I'll increase the size of the ext space array to something like 16 or 32 u32 sized words and increase the attribute length accordingly.  That should do for a long time I suppose.

First we'll see if the associated kernel patch is going to fly or not.

Thanks,

- Greg

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
  2012-02-12 19:13 [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation Greg Rose
  2012-02-13 17:28 ` Stephen Hemminger
@ 2012-02-14 21:13 ` Ben Hutchings
  2012-02-14 21:27   ` Rose, Gregory V
  2012-02-15 14:08   ` Thomas Graf
  2012-02-14 21:22 ` David Miller
  2 siblings, 2 replies; 13+ messages in thread
From: Ben Hutchings @ 2012-02-14 21:13 UTC (permalink / raw)
  To: Greg Rose; +Cc: netdev, davem

Thanks for persisting, Greg.

On Sun, 2012-02-12 at 11:13 -0800, Greg Rose wrote:
> Implement a new nlattr type IFLA_EXT_MASK.  The mask is a 32 bit
> value that can be used to indicate to the kernel that certain
> extended ifinfo values are requested by the user application.
> At this time the only mask value created is RTEXT_FILTER_VF to
> indicate that the user wants the ifinfo dump to send information
> about the VFs belonging to the interface.
> 
> I have kept the NLM_F_EXT nlmsg_flags bit to indicate to the kernel
> that the extended ifinfo dump filter mask is present.  It does not
> act as the filter itself which has changed since the first submission
> of this RFC.  Older versions of user applications won't set this
> flag which should fix the problem of some applications not allocating
> a large enough buffer for all the extended interface information.
[...]
> --- a/include/linux/netlink.h
> +++ b/include/linux/netlink.h
> @@ -59,6 +59,7 @@ struct nlmsghdr {
>  #define NLM_F_MATCH	0x200	/* return all matching	*/
>  #define NLM_F_ATOMIC	0x400	/* atomic GET		*/
>  #define NLM_F_DUMP	(NLM_F_ROOT|NLM_F_MATCH)
> +#define NLM_F_EXT	0x800	/* Get extended interface info such as VFs */
>  
>  /* Modifiers to NEW request */
>  #define NLM_F_REPLACE	0x100	/* Override existing		*/
> @@ -215,6 +216,7 @@ int netlink_sendskb(struct sock *sk, struct sk_buff *skb);
>  #else
>  #define NLMSG_GOODSIZE	SKB_WITH_OVERHEAD(8192UL)
>  #endif
> +#define NLMSG_EXT_GOODSIZE SKB_WITH_OVERHEAD(32768UL)

This will probably do for now, but it would be preferable to really
calculate the maximum size.

I fear this is going to be too small in some cases while resulting in
allocation failures in others (that's an order-4 page allocation!).
Maybe reduce the message size slightly so that the skb data allocation
is within 32K?

[...]
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
[...]
> @@ -1055,6 +1059,17 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
>  	rcu_read_lock();
>  	cb->seq = net->dev_base_seq;
>  
> +	if (cb->nlh->nlmsg_flags && NLM_F_EXT) {

& not &&

> +		struct rtattr *ext_req;
> +		u32 *ext_req_data;
> +		req = (struct rtnl_req_extended *)cb->nlh;
> +		ext_req = (struct rtattr *)&req->ext;
> +		if (ext_req->rta_type == IFLA_EXT_MASK) {
> +			ext_req_data = RTA_DATA(ext_req);
> +			ext_filter_mask = *ext_req_data;
> +		}
> +	}

We cannot trust a flag to tell us what the length of the message is.  We
have to check the value of nlmsg_len (which netlink has already
validated as being within the skb length and >= our declared request
header length).  I think that makes the flag redundant.

In fact, I think we should really use nlmsg_parse() here.  That might be
overkill when there's only a single valid attribute; I don't know.

[...]
> @@ -1861,12 +1876,12 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
>  	if (dev == NULL)
>  		return -ENODEV;
>  
> -	nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
> +	nskb = nlmsg_new(if_nlmsg_size(dev, 0), GFP_KERNEL);
>  	if (nskb == NULL)
>  		return -ENOBUFS;
>  
>  	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid,
> -			       nlh->nlmsg_seq, 0, 0);
> +			       nlh->nlmsg_seq, 0, 0, 0);
>  	if (err < 0) {
>  		/* -EMSGSIZE implies BUG in if_nlmsg_size */
>  		WARN_ON(err == -EMSGSIZE);

It seems like this ought to support IFLA_EXT_MASK as well, though it's
maybe less important ('ip link' doesn't need it).

> @@ -1877,9 +1892,26 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
>  	return err;
>  }
>  
> -static u16 rtnl_calcit(struct sk_buff *skb)
> +static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
>  {
> -	return min_ifinfo_dump_size;
> +	struct rtnl_req_extended *req;
> +	u32 ext_filter_mask = 0;
> +
> +	if (nlh->nlmsg_flags && NLM_F_EXT) {
> +		struct rtattr *ext_req;
> +		u32 *ext_req_data;
> +		req = (struct rtnl_req_extended *)&nlh;
> +		ext_req = (struct rtattr *)&req->ext;
> +		if (ext_req->rta_type == IFLA_EXT_MASK) {
> +			ext_req_data = RTA_DATA(ext_req);
> +			ext_filter_mask = *ext_req_data;
> +		}
> +	}
[...]

This has the same parsing problem as rtnl_dump_ifinfo().

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
  2012-02-12 19:13 [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation Greg Rose
  2012-02-13 17:28 ` Stephen Hemminger
  2012-02-14 21:13 ` Ben Hutchings
@ 2012-02-14 21:22 ` David Miller
  2012-02-14 21:34   ` Rose, Gregory V
  2 siblings, 1 reply; 13+ messages in thread
From: David Miller @ 2012-02-14 21:22 UTC (permalink / raw)
  To: gregory.v.rose; +Cc: netdev

From: Greg Rose <gregory.v.rose@intel.com>
Date: Sun, 12 Feb 2012 11:13:42 -0800

> I have kept the NLM_F_EXT nlmsg_flags bit to indicate to the kernel
> that the extended ifinfo dump filter mask is present.

No extra indications other than presence of the attribute itself is
necessary.  Please remove this flag.

> @@ -215,6 +216,7 @@ int netlink_sendskb(struct sock *sk, struct sk_buff *skb);
>  #else
>  #define NLMSG_GOODSIZE	SKB_WITH_OVERHEAD(8192UL)
>  #endif
> +#define NLMSG_EXT_GOODSIZE SKB_WITH_OVERHEAD(32768UL)

I indicated in my suggestions that you'll need to calculate this at
run-time based upon the extensions enabled and the size of the resulting
message plus attributes.

You absolutely cannot just pick some large number and run with this,
it's highly wasteful and will be potentially causing allocation
failures.

The goodsize value is specifically choosen such that we don't exceed
an order-1 allocation with page sizes of 4096 and larger.

> +struct rtnl_req_extended {
> +	struct nlmsghdr nlh;
> +	struct rtgenmsg g;
> +	char ext[RTA_SPACE(sizeof(__u32))];
> +};
> +
> +/* New extended info filters for IFLA_EXT_MASK */
> +#define RTEXT_FILTER_VF		(1 << 0)
> +

This seems completely unnecessary.

Just define IFLA_EXT_MASK as a variable length array of u32's, but to
be honest, for now, you can just make it a normal u32 attribute.

If we extend it in the future, we can make the kernel handle any
length, u32 or otherwise.  The size field already present in all
netlink attributes will allow us to do this transparently.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
  2012-02-14 21:13 ` Ben Hutchings
@ 2012-02-14 21:27   ` Rose, Gregory V
  2012-02-14 21:31     ` David Miller
  2012-02-15 14:08   ` Thomas Graf
  1 sibling, 1 reply; 13+ messages in thread
From: Rose, Gregory V @ 2012-02-14 21:27 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, davem

> -----Original Message-----
> From: Ben Hutchings [mailto:bhutchings@solarflare.com]
> Sent: Tuesday, February 14, 2012 1:13 PM
> To: Rose, Gregory V
> Cc: netdev@vger.kernel.org; davem@davemloft.net
> Subject: Re: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
> 
> Thanks for persisting, Greg.

We'll slay this dragon for good someday.

> 
> On Sun, 2012-02-12 at 11:13 -0800, Greg Rose wrote:
> > Implement a new nlattr type IFLA_EXT_MASK.  The mask is a 32 bit
> > value that can be used to indicate to the kernel that certain
> > extended ifinfo values are requested by the user application.
> > At this time the only mask value created is RTEXT_FILTER_VF to
> > indicate that the user wants the ifinfo dump to send information
> > about the VFs belonging to the interface.
> >
> > I have kept the NLM_F_EXT nlmsg_flags bit to indicate to the kernel
> > that the extended ifinfo dump filter mask is present.  It does not
> > act as the filter itself which has changed since the first submission
> > of this RFC.  Older versions of user applications won't set this
> > flag which should fix the problem of some applications not allocating
> > a large enough buffer for all the extended interface information.
> [...]
> > --- a/include/linux/netlink.h
> > +++ b/include/linux/netlink.h
> > @@ -59,6 +59,7 @@ struct nlmsghdr {
> >  #define NLM_F_MATCH	0x200	/* return all matching	*/
> >  #define NLM_F_ATOMIC	0x400	/* atomic GET		*/
> >  #define NLM_F_DUMP	(NLM_F_ROOT|NLM_F_MATCH)
> > +#define NLM_F_EXT	0x800	/* Get extended interface info such as VFs
> */
> >
> >  /* Modifiers to NEW request */
> >  #define NLM_F_REPLACE	0x100	/* Override existing		*/
> > @@ -215,6 +216,7 @@ int netlink_sendskb(struct sock *sk, struct sk_buff
> *skb);
> >  #else
> >  #define NLMSG_GOODSIZE	SKB_WITH_OVERHEAD(8192UL)
> >  #endif
> > +#define NLMSG_EXT_GOODSIZE SKB_WITH_OVERHEAD(32768UL)
> 
> This will probably do for now, but it would be preferable to really
> calculate the maximum size.

That's a real sticky problem.  When the dump request comes in we don't know how big the required buffer might actually get.  That's why we used to have the min_ifinfo_dump_size.  However there were problems with that also so Dave asked me to get rid of it.  The only thing I could come up with just allocating a really big buffer.  If someone could point out to me how to get a maximum ifinfo dump size before rtnl_dump_ifinfo starts filtering through the matching interfaces then that would really help.  But at the point the dump request comes in we don't know beforehand how to calculate the info dump size because if_nlmsg_size() requires a pointer to a netdevice.

> 
> I fear this is going to be too small in some cases while resulting in
> allocation failures in others (that's an order-4 page allocation!).
> Maybe reduce the message size slightly so that the skb data allocation
> is within 32K?

Yeah, I could certainly do that.

> 
> [...]
> > --- a/net/core/rtnetlink.c
> > +++ b/net/core/rtnetlink.c
> [...]
> > @@ -1055,6 +1059,17 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb,
> struct netlink_callback *cb)
> >  	rcu_read_lock();
> >  	cb->seq = net->dev_base_seq;
> >
> > +	if (cb->nlh->nlmsg_flags && NLM_F_EXT) {
> 
> & not &&

Oops.  OK.

> 
> > +		struct rtattr *ext_req;
> > +		u32 *ext_req_data;
> > +		req = (struct rtnl_req_extended *)cb->nlh;
> > +		ext_req = (struct rtattr *)&req->ext;
> > +		if (ext_req->rta_type == IFLA_EXT_MASK) {
> > +			ext_req_data = RTA_DATA(ext_req);
> > +			ext_filter_mask = *ext_req_data;
> > +		}
> > +	}
> 
> We cannot trust a flag to tell us what the length of the message is.  We
> have to check the value of nlmsg_len (which netlink has already
> validated as being within the skb length and >= our declared request
> header length).  I think that makes the flag redundant.
> 
> In fact, I think we should really use nlmsg_parse() here.  That might be
> overkill when there's only a single valid attribute; I don't know.

Hmm, but we do want to plan for future expansion and more attributes.  I think that is probably a good idea.

> 
> [...]
> > @@ -1861,12 +1876,12 @@ static int rtnl_getlink(struct sk_buff *skb,
> struct nlmsghdr* nlh, void *arg)
> >  	if (dev == NULL)
> >  		return -ENODEV;
> >
> > -	nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
> > +	nskb = nlmsg_new(if_nlmsg_size(dev, 0), GFP_KERNEL);
> >  	if (nskb == NULL)
> >  		return -ENOBUFS;
> >
> >  	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid,
> > -			       nlh->nlmsg_seq, 0, 0);
> > +			       nlh->nlmsg_seq, 0, 0, 0);
> >  	if (err < 0) {
> >  		/* -EMSGSIZE implies BUG in if_nlmsg_size */
> >  		WARN_ON(err == -EMSGSIZE);
> 
> It seems like this ought to support IFLA_EXT_MASK as well, though it's
> maybe less important ('ip link' doesn't need it).

That's all I've been testing with but if there are other callers I'm unaware of then I can add the parsing here too.

> 
> > @@ -1877,9 +1892,26 @@ static int rtnl_getlink(struct sk_buff *skb,
> struct nlmsghdr* nlh, void *arg)
> >  	return err;
> >  }
> >
> > -static u16 rtnl_calcit(struct sk_buff *skb)
> > +static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
> >  {
> > -	return min_ifinfo_dump_size;
> > +	struct rtnl_req_extended *req;
> > +	u32 ext_filter_mask = 0;
> > +
> > +	if (nlh->nlmsg_flags && NLM_F_EXT) {
> > +		struct rtattr *ext_req;
> > +		u32 *ext_req_data;
> > +		req = (struct rtnl_req_extended *)&nlh;
> > +		ext_req = (struct rtattr *)&req->ext;
> > +		if (ext_req->rta_type == IFLA_EXT_MASK) {
> > +			ext_req_data = RTA_DATA(ext_req);
> > +			ext_filter_mask = *ext_req_data;
> > +		}
> > +	}
> [...]
> 
> This has the same parsing problem as rtnl_dump_ifinfo().

OK.

Thanks for the review.  I'll work in your suggested changes and resubmit the RFC.

- Greg


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
  2012-02-14 21:27   ` Rose, Gregory V
@ 2012-02-14 21:31     ` David Miller
  2012-02-14 21:41       ` Rose, Gregory V
  0 siblings, 1 reply; 13+ messages in thread
From: David Miller @ 2012-02-14 21:31 UTC (permalink / raw)
  To: gregory.v.rose; +Cc: bhutchings, netdev

From: "Rose, Gregory V" <gregory.v.rose@intel.com>
Date: Tue, 14 Feb 2012 21:27:59 +0000

> That's a real sticky problem.  When the dump request comes in we
> don't know how big the required buffer might actually get.  That's
> why we used to have the min_ifinfo_dump_size.  However there were
> problems with that also so Dave asked me to get rid of it.  The only
> thing I could come up with just allocating a really big buffer.  If
> someone could point out to me how to get a maximum ifinfo dump size
> before rtnl_dump_ifinfo starts filtering through the matching
> interfaces then that would really help.  But at the point the dump
> request comes in we don't know beforehand how to calculate the info
> dump size because if_nlmsg_size() requires a pointer to a netdevice.

Two quick ideas:

1) Have a first pass which iterates over the devices and calculates
   the maximum of all if_nlmsg_size() calculations over those devices
   then uses that for the dump buffer allocation size.

2) Replace min_ifinfo_dump_size() with an array of sizes, one for
   each combination of extended features.  The only concern with this
   one is that whilst it's practical to do this now with only 1 or
   a few feature bits, in the future it may be a lot less practical.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
  2012-02-14 21:22 ` David Miller
@ 2012-02-14 21:34   ` Rose, Gregory V
  0 siblings, 0 replies; 13+ messages in thread
From: Rose, Gregory V @ 2012-02-14 21:34 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Tuesday, February 14, 2012 1:23 PM
> To: Rose, Gregory V
> Cc: netdev@vger.kernel.org
> Subject: Re: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
> 
> From: Greg Rose <gregory.v.rose@intel.com>
> Date: Sun, 12 Feb 2012 11:13:42 -0800
> 
> > I have kept the NLM_F_EXT nlmsg_flags bit to indicate to the kernel
> > that the extended ifinfo dump filter mask is present.
> 
> No extra indications other than presence of the attribute itself is
> necessary.  Please remove this flag.
> 
> > @@ -215,6 +216,7 @@ int netlink_sendskb(struct sock *sk, struct sk_buff
> *skb);
> >  #else
> >  #define NLMSG_GOODSIZE	SKB_WITH_OVERHEAD(8192UL)
> >  #endif
> > +#define NLMSG_EXT_GOODSIZE SKB_WITH_OVERHEAD(32768UL)
> 
> I indicated in my suggestions that you'll need to calculate this at
> run-time based upon the extensions enabled and the size of the resulting
> message plus attributes.
> 
> You absolutely cannot just pick some large number and run with this,
> it's highly wasteful and will be potentially causing allocation
> failures.
> 
> The goodsize value is specifically choosen such that we don't exceed
> an order-1 allocation with page sizes of 4096 and larger.

If I do the buffer size calculation based upon the extensions enabled then I'll have to assume the maximum possible size because at the time the buffer size is calculated we don't' have a handle to the net devices yet.

So if the VF flag is set then I'll assume that up to 255 VFs might be present and calculate based upon that.

> 
> > +struct rtnl_req_extended {
> > +	struct nlmsghdr nlh;
> > +	struct rtgenmsg g;
> > +	char ext[RTA_SPACE(sizeof(__u32))];
> > +};
> > +
> > +/* New extended info filters for IFLA_EXT_MASK */
> > +#define RTEXT_FILTER_VF		(1 << 0)
> > +
> 
> This seems completely unnecessary.
> 
> Just define IFLA_EXT_MASK as a variable length array of u32's, but to
> be honest, for now, you can just make it a normal u32 attribute.
> 
> If we extend it in the future, we can make the kernel handle any
> length, u32 or otherwise.  The size field already present in all
> netlink attributes will allow us to do this transparently.

OK, I think I see where you and Ben are coming from on this.

Thanks,

- Greg

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
  2012-02-14 21:31     ` David Miller
@ 2012-02-14 21:41       ` Rose, Gregory V
  2012-02-14 21:48         ` David Miller
  0 siblings, 1 reply; 13+ messages in thread
From: Rose, Gregory V @ 2012-02-14 21:41 UTC (permalink / raw)
  To: David Miller; +Cc: bhutchings, netdev

> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Tuesday, February 14, 2012 1:32 PM
> To: Rose, Gregory V
> Cc: bhutchings@solarflare.com; netdev@vger.kernel.org
> Subject: Re: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
> 
> From: "Rose, Gregory V" <gregory.v.rose@intel.com>
> Date: Tue, 14 Feb 2012 21:27:59 +0000
> 
> > That's a real sticky problem.  When the dump request comes in we
> > don't know how big the required buffer might actually get.  That's
> > why we used to have the min_ifinfo_dump_size.  However there were
> > problems with that also so Dave asked me to get rid of it.  The only
> > thing I could come up with just allocating a really big buffer.  If
> > someone could point out to me how to get a maximum ifinfo dump size
> > before rtnl_dump_ifinfo starts filtering through the matching
> > interfaces then that would really help.  But at the point the dump
> > request comes in we don't know beforehand how to calculate the info
> > dump size because if_nlmsg_size() requires a pointer to a netdevice.
> 
> Two quick ideas:
> 
> 1) Have a first pass which iterates over the devices and calculates
>    the maximum of all if_nlmsg_size() calculations over those devices
>    then uses that for the dump buffer allocation size.
> 
> 2) Replace min_ifinfo_dump_size() with an array of sizes, one for
>    each combination of extended features.  The only concern with this
>    one is that whilst it's practical to do this now with only 1 or
>    a few feature bits, in the future it may be a lot less practical.

The second item sort of matches what I said in the last reply.  Base the buffer allocation size on the maximum possible for the given extension which as I said, is up to 255 VFs for the case under immediate consideration.

The first one seems like a good idea but I wonder what the effect would be on a system with a large number of interfaces.

- Greg

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
  2012-02-14 21:41       ` Rose, Gregory V
@ 2012-02-14 21:48         ` David Miller
  2012-02-14 21:51           ` Rose, Gregory V
  0 siblings, 1 reply; 13+ messages in thread
From: David Miller @ 2012-02-14 21:48 UTC (permalink / raw)
  To: gregory.v.rose; +Cc: bhutchings, netdev

From: "Rose, Gregory V" <gregory.v.rose@intel.com>
Date: Tue, 14 Feb 2012 21:41:57 +0000

> The second item sort of matches what I said in the last reply.  Base
> the buffer allocation size on the maximum possible for the given
> extension which as I said, is up to 255 VFs for the case under
> immediate consideration.

That's what we're trying to avoid, because that will result in multi-order
allocations (which are failure prone) when most of the time such a large
buffer is entirely unnecessary.

> The first one seems like a good idea but I wonder what the effect
> would be on a system with a large number of interfaces.

It's already expensive to dump a large number of devices, and in
effect you'll be optimizing the buffer allocation which could in fact
end up helping performance.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
  2012-02-14 21:48         ` David Miller
@ 2012-02-14 21:51           ` Rose, Gregory V
  0 siblings, 0 replies; 13+ messages in thread
From: Rose, Gregory V @ 2012-02-14 21:51 UTC (permalink / raw)
  To: David Miller; +Cc: bhutchings, netdev

> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Tuesday, February 14, 2012 1:48 PM
> To: Rose, Gregory V
> Cc: bhutchings@solarflare.com; netdev@vger.kernel.org
> Subject: Re: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
> 
> From: "Rose, Gregory V" <gregory.v.rose@intel.com>
> Date: Tue, 14 Feb 2012 21:41:57 +0000
> 
> > The second item sort of matches what I said in the last reply.  Base
> > the buffer allocation size on the maximum possible for the given
> > extension which as I said, is up to 255 VFs for the case under
> > immediate consideration.
> 
> That's what we're trying to avoid, because that will result in multi-order
> allocations (which are failure prone) when most of the time such a large
> buffer is entirely unnecessary.
> 
> > The first one seems like a good idea but I wonder what the effect
> > would be on a system with a large number of interfaces.
> 
> It's already expensive to dump a large number of devices, and in
> effect you'll be optimizing the buffer allocation which could in fact
> end up helping performance.

OK, I'll see what I can do.

- Greg

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
  2012-02-14 21:13 ` Ben Hutchings
  2012-02-14 21:27   ` Rose, Gregory V
@ 2012-02-15 14:08   ` Thomas Graf
  2012-02-15 16:32     ` Rose, Gregory V
  1 sibling, 1 reply; 13+ messages in thread
From: Thomas Graf @ 2012-02-15 14:08 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: Greg Rose, netdev, davem

On Tue, Feb 14, 2012 at 09:13:13PM +0000, Ben Hutchings wrote:
> > +		struct rtattr *ext_req;
> > +		u32 *ext_req_data;
> > +		req = (struct rtnl_req_extended *)cb->nlh;
> > +		ext_req = (struct rtattr *)&req->ext;
> > +		if (ext_req->rta_type == IFLA_EXT_MASK) {
> > +			ext_req_data = RTA_DATA(ext_req);
> > +			ext_filter_mask = *ext_req_data;
> > +		}
> > +	}
> 
> We cannot trust a flag to tell us what the length of the message is.  We
> have to check the value of nlmsg_len (which netlink has already
> validated as being within the skb length and >= our declared request
> header length).  I think that makes the flag redundant.
> 
> In fact, I think we should really use nlmsg_parse() here.  That might be
> overkill when there's only a single valid attribute; I don't know.

I think it's worth the effort. You get all the validation for free. And
please use the netlink interface in <net/netlink.h>, the rtattr based
interface has been deprecated a while ago.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
  2012-02-15 14:08   ` Thomas Graf
@ 2012-02-15 16:32     ` Rose, Gregory V
  0 siblings, 0 replies; 13+ messages in thread
From: Rose, Gregory V @ 2012-02-15 16:32 UTC (permalink / raw)
  To: Thomas Graf, Ben Hutchings; +Cc: netdev, davem

> -----Original Message-----
> From: Thomas Graf [mailto:tgr@infradead.org] On Behalf Of Thomas Graf
> Sent: Wednesday, February 15, 2012 6:08 AM
> To: Ben Hutchings
> Cc: Rose, Gregory V; netdev@vger.kernel.org; davem@davemloft.net
> Subject: Re: [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation
> 
> On Tue, Feb 14, 2012 at 09:13:13PM +0000, Ben Hutchings wrote:
> > > +		struct rtattr *ext_req;
> > > +		u32 *ext_req_data;
> > > +		req = (struct rtnl_req_extended *)cb->nlh;
> > > +		ext_req = (struct rtattr *)&req->ext;
> > > +		if (ext_req->rta_type == IFLA_EXT_MASK) {
> > > +			ext_req_data = RTA_DATA(ext_req);
> > > +			ext_filter_mask = *ext_req_data;
> > > +		}
> > > +	}
> >
> > We cannot trust a flag to tell us what the length of the message is.  We
> > have to check the value of nlmsg_len (which netlink has already
> > validated as being within the skb length and >= our declared request
> > header length).  I think that makes the flag redundant.
> >
> > In fact, I think we should really use nlmsg_parse() here.  That might be
> > overkill when there's only a single valid attribute; I don't know.
> 
> I think it's worth the effort. You get all the validation for free. And
> please use the netlink interface in <net/netlink.h>, the rtattr based
> interface has been deprecated a while ago.

Yes, I'm changing it to use nlmsg_parse.  Didn't know about the deprecation of rtattr though, thanks for the tip.

- Greg

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2012-02-15 16:31 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-02-12 19:13 [RFC V2 PATCH] rtnetlink: Fix problem with buffer allocation Greg Rose
2012-02-13 17:28 ` Stephen Hemminger
2012-02-13 18:24   ` Rose, Gregory V
2012-02-14 21:13 ` Ben Hutchings
2012-02-14 21:27   ` Rose, Gregory V
2012-02-14 21:31     ` David Miller
2012-02-14 21:41       ` Rose, Gregory V
2012-02-14 21:48         ` David Miller
2012-02-14 21:51           ` Rose, Gregory V
2012-02-15 14:08   ` Thomas Graf
2012-02-15 16:32     ` Rose, Gregory V
2012-02-14 21:22 ` David Miller
2012-02-14 21:34   ` Rose, Gregory V

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).