netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next 0/2] net: use net->dev_by_index in two places
@ 2024-02-09 14:56 Eric Dumazet
  2024-02-09 14:56 ` [PATCH net-next 1/2] vlan: use xarray iterator to implement /proc/net/vlan/config Eric Dumazet
  2024-02-09 14:56 ` [PATCH net-next 2/2] rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo() Eric Dumazet
  0 siblings, 2 replies; 8+ messages in thread
From: Eric Dumazet @ 2024-02-09 14:56 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, eric.dumazet, Eric Dumazet

Bring "ip link" ordering to /proc/net/dev one (by ifindexes).

Do the same for /proc/net/vlan/config

Eric Dumazet (2):
  vlan: use xarray iterator to implement /proc/net/vlan/config
  rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo()

 net/8021q/vlanproc.c | 46 ++++++++++++----------------------
 net/core/rtnetlink.c | 59 ++++++++++++++------------------------------
 2 files changed, 35 insertions(+), 70 deletions(-)

-- 
2.43.0.687.g38aa6559b0-goog


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH net-next 1/2] vlan: use xarray iterator to implement /proc/net/vlan/config
  2024-02-09 14:56 [PATCH net-next 0/2] net: use net->dev_by_index in two places Eric Dumazet
@ 2024-02-09 14:56 ` Eric Dumazet
  2024-02-09 22:25   ` Jakub Kicinski
  2024-02-09 14:56 ` [PATCH net-next 2/2] rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo() Eric Dumazet
  1 sibling, 1 reply; 8+ messages in thread
From: Eric Dumazet @ 2024-02-09 14:56 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, eric.dumazet, Eric Dumazet

Adopt net->dev_by_index as I did in commit 0e0939c0adf9
("net-procfs: use xarray iterator to implement /proc/net/dev")

Not only this removes quadratic behavior, it also makes sure
an existing vlan device is always visible in the dump,
regardless of concurrent net->dev_base_head changes.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/8021q/vlanproc.c | 46 +++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 30 deletions(-)

diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index 7825c129742a0f334bd8404bc3dc26547eb69083..87b959da00cd3844de1b56b45045e88e6a0293ba 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -163,48 +163,34 @@ void vlan_proc_rem_dev(struct net_device *vlandev)
  * The following few functions build the content of /proc/net/vlan/config
  */
 
-/* start read of /proc/net/vlan/config */
-static void *vlan_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(rcu)
+static void *vlan_seq_from_index(struct seq_file *seq, loff_t *pos)
 {
+	unsigned long ifindex = *pos;
 	struct net_device *dev;
-	struct net *net = seq_file_net(seq);
-	loff_t i = 1;
-
-	rcu_read_lock();
-	if (*pos == 0)
-		return SEQ_START_TOKEN;
 
-	for_each_netdev_rcu(net, dev) {
+	for_each_netdev_dump(seq_file_net(seq), dev, ifindex) {
 		if (!is_vlan_dev(dev))
 			continue;
-
-		if (i++ == *pos)
-			return dev;
+		*pos = dev->ifindex;
+		return dev;
 	}
+	return NULL;
+}
+
+static void *vlan_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu)
+{
+	rcu_read_lock();
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
 
-	return  NULL;
+	return vlan_seq_from_index(seq, pos);
 }
 
 static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct net_device *dev;
-	struct net *net = seq_file_net(seq);
-
 	++*pos;
-
-	dev = v;
-	if (v == SEQ_START_TOKEN)
-		dev = net_device_entry(&net->dev_base_head);
-
-	for_each_netdev_continue_rcu(net, dev) {
-		if (!is_vlan_dev(dev))
-			continue;
-
-		return dev;
-	}
-
-	return NULL;
+	return vlan_seq_from_index(seq, pos);
 }
 
 static void vlan_seq_stop(struct seq_file *seq, void *v)
-- 
2.43.0.687.g38aa6559b0-goog


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH net-next 2/2] rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo()
  2024-02-09 14:56 [PATCH net-next 0/2] net: use net->dev_by_index in two places Eric Dumazet
  2024-02-09 14:56 ` [PATCH net-next 1/2] vlan: use xarray iterator to implement /proc/net/vlan/config Eric Dumazet
@ 2024-02-09 14:56 ` Eric Dumazet
  2024-02-09 22:24   ` Jakub Kicinski
  1 sibling, 1 reply; 8+ messages in thread
From: Eric Dumazet @ 2024-02-09 14:56 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: netdev, eric.dumazet, Eric Dumazet

Adopt net->dev_by_index as I did in commit 0e0939c0adf9
("net-procfs: use xarray iterator to implement /proc/net/dev")

This makes sure an existing device is always visible in the dump,
regardless of concurrent insertions/deletions.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/core/rtnetlink.c | 59 ++++++++++++++------------------------------
 1 file changed, 19 insertions(+), 40 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 31f433950c8dc953bcb65cc0469f7df962314b87..68a224bbf1dd6118526329782362a4bfc192d6b1 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2188,25 +2188,20 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
 
 static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	const struct rtnl_link_ops *kind_ops = NULL;
 	struct netlink_ext_ack *extack = cb->extack;
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
-	struct net *tgt_net = net;
-	int h, s_h;
-	int idx = 0, s_idx;
-	struct net_device *dev;
-	struct hlist_head *head;
+	unsigned long ifindex = cb->args[0];
+	unsigned int flags = NLM_F_MULTI;
 	struct nlattr *tb[IFLA_MAX+1];
+	struct net *tgt_net = net;
 	u32 ext_filter_mask = 0;
-	const struct rtnl_link_ops *kind_ops = NULL;
-	unsigned int flags = NLM_F_MULTI;
+	struct net_device *dev;
 	int master_idx = 0;
 	int netnsid = -1;
 	int err, i;
 
-	s_h = cb->args[0];
-	s_idx = cb->args[1];
-
 	err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
 	if (err < 0) {
 		if (cb->strict_check)
@@ -2250,42 +2245,26 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 		flags |= NLM_F_DUMP_FILTERED;
 
 walk_entries:
-	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
-		idx = 0;
-		head = &tgt_net->dev_index_head[h];
-		hlist_for_each_entry(dev, head, index_hlist) {
-			if (link_dump_filtered(dev, master_idx, kind_ops))
-				goto cont;
-			if (idx < s_idx)
-				goto cont;
-			err = rtnl_fill_ifinfo(skb, dev, net,
-					       RTM_NEWLINK,
-					       NETLINK_CB(cb->skb).portid,
-					       nlh->nlmsg_seq, 0, flags,
-					       ext_filter_mask, 0, NULL, 0,
-					       netnsid, GFP_KERNEL);
-
-			if (err < 0) {
-				if (likely(skb->len))
-					goto out;
-
-				goto out_err;
-			}
-cont:
-			idx++;
-		}
-	}
-out:
 	err = skb->len;
-out_err:
-	cb->args[1] = idx;
-	cb->args[0] = h;
+	for_each_netdev_dump(tgt_net, dev, ifindex) {
+		if (link_dump_filtered(dev, master_idx, kind_ops))
+			continue;
+		err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
+				       NETLINK_CB(cb->skb).portid,
+				       nlh->nlmsg_seq, 0, flags,
+				       ext_filter_mask, 0, NULL, 0,
+				       netnsid, GFP_KERNEL);
+
+		if (err < 0)
+			break;
+		cb->args[0] = ifindex + 1;
+	}
 	cb->seq = tgt_net->dev_base_seq;
 	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 	if (netnsid >= 0)
 		put_net(tgt_net);
 
-	return err;
+	return skb->len ?: err;
 }
 
 int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer,
-- 
2.43.0.687.g38aa6559b0-goog


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH net-next 2/2] rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo()
  2024-02-09 14:56 ` [PATCH net-next 2/2] rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo() Eric Dumazet
@ 2024-02-09 22:24   ` Jakub Kicinski
  2024-02-10 11:23     ` Eric Dumazet
  0 siblings, 1 reply; 8+ messages in thread
From: Jakub Kicinski @ 2024-02-09 22:24 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, Paolo Abeni, netdev, eric.dumazet

On Fri,  9 Feb 2024 14:56:15 +0000 Eric Dumazet wrote:
> +	unsigned long ifindex = cb->args[0];

[snip]

> +	for_each_netdev_dump(tgt_net, dev, ifindex) {
> +		if (link_dump_filtered(dev, master_idx, kind_ops))
> +			continue;
> +		err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
> +				       NETLINK_CB(cb->skb).portid,
> +				       nlh->nlmsg_seq, 0, flags,
> +				       ext_filter_mask, 0, NULL, 0,
> +				       netnsid, GFP_KERNEL);
> +
> +		if (err < 0)
> +			break;
> +		cb->args[0] = ifindex + 1;

Perhaps we can cast the context buffer onto something typed and use 
it directly? I think it's a tiny bit less error prone:

	struct {
		unsigned long ifindex;
	} *ctx = (void *)cb->ctx;

Then we can:

	for_each_netdev_dump(tgt_net, dev, ctx->ifindex)
					   ^^^^^^^^^^^^

and not need to worry about saving the ifindex back to cb before
exiting.

Up to you.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net-next 1/2] vlan: use xarray iterator to implement /proc/net/vlan/config
  2024-02-09 14:56 ` [PATCH net-next 1/2] vlan: use xarray iterator to implement /proc/net/vlan/config Eric Dumazet
@ 2024-02-09 22:25   ` Jakub Kicinski
  0 siblings, 0 replies; 8+ messages in thread
From: Jakub Kicinski @ 2024-02-09 22:25 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, Paolo Abeni, netdev, eric.dumazet

On Fri,  9 Feb 2024 14:56:14 +0000 Eric Dumazet wrote:
> Adopt net->dev_by_index as I did in commit 0e0939c0adf9
> ("net-procfs: use xarray iterator to implement /proc/net/dev")
> 
> Not only this removes quadratic behavior, it also makes sure
> an existing vlan device is always visible in the dump,
> regardless of concurrent net->dev_base_head changes.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Reviewed-by: Jakub Kicinski <kuba@kernel.org>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net-next 2/2] rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo()
  2024-02-09 22:24   ` Jakub Kicinski
@ 2024-02-10 11:23     ` Eric Dumazet
  2024-02-11 18:29       ` Ido Schimmel
  0 siblings, 1 reply; 8+ messages in thread
From: Eric Dumazet @ 2024-02-10 11:23 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: David S . Miller, Paolo Abeni, netdev, eric.dumazet

On Fri, Feb 9, 2024 at 11:24 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Fri,  9 Feb 2024 14:56:15 +0000 Eric Dumazet wrote:
> > +     unsigned long ifindex = cb->args[0];
>
> [snip]
>
> > +     for_each_netdev_dump(tgt_net, dev, ifindex) {
> > +             if (link_dump_filtered(dev, master_idx, kind_ops))
> > +                     continue;
> > +             err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
> > +                                    NETLINK_CB(cb->skb).portid,
> > +                                    nlh->nlmsg_seq, 0, flags,
> > +                                    ext_filter_mask, 0, NULL, 0,
> > +                                    netnsid, GFP_KERNEL);
> > +
> > +             if (err < 0)
> > +                     break;
> > +             cb->args[0] = ifindex + 1;
>
> Perhaps we can cast the context buffer onto something typed and use
> it directly? I think it's a tiny bit less error prone:
>
>         struct {
>                 unsigned long ifindex;
>         } *ctx = (void *)cb->ctx;
>
> Then we can:
>
>         for_each_netdev_dump(tgt_net, dev, ctx->ifindex)
>                                            ^^^^^^^^^^^^
>
> and not need to worry about saving the ifindex back to cb before
> exiting.

Hi Jakub

I tried something like that (adding a common structure for future
conversions), but this was not working properly.

Unfortunately we only can save the ifindex back to cb->XXXX only after
 rtnl_fill_ifinfo() was a success.

For instance, after applying the following diff to my patch, we have a
bug, because iip link loops on the last device.

We need to set cb->args[0] to  last_dev->ifindex + 1 to end the dump.

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 68a224bbf1dd6118526329782362a4bfc192d6b1..7f562d6e40ebf5329e9c0b1c7add81c461683907
100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2192,9 +2192,11 @@ static int rtnl_dump_ifinfo(struct sk_buff
*skb, struct netlink_callback *cb)
        struct netlink_ext_ack *extack = cb->extack;
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
-       unsigned long ifindex = cb->args[0];
        unsigned int flags = NLM_F_MULTI;
        struct nlattr *tb[IFLA_MAX+1];
+       struct {
+               unsigned long ifindex;
+       } *ctx = (void *)cb->ctx;
        struct net *tgt_net = net;
        u32 ext_filter_mask = 0;
        struct net_device *dev;
@@ -2246,7 +2248,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb,
struct netlink_callback *cb)

 walk_entries:
        err = skb->len;
-       for_each_netdev_dump(tgt_net, dev, ifindex) {
+       for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
                if (link_dump_filtered(dev, master_idx, kind_ops))
                        continue;
                err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
@@ -2257,7 +2259,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb,
struct netlink_callback *cb)

                if (err < 0)
                        break;
-               cb->args[0] = ifindex + 1;
        }
        cb->seq = tgt_net->dev_base_seq;
        nl_dump_check_consistent(cb, nlmsg_hdr(skb));

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net-next 2/2] rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo()
  2024-02-10 11:23     ` Eric Dumazet
@ 2024-02-11 18:29       ` Ido Schimmel
  2024-02-11 21:35         ` Eric Dumazet
  0 siblings, 1 reply; 8+ messages in thread
From: Ido Schimmel @ 2024-02-11 18:29 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jakub Kicinski, David S . Miller, Paolo Abeni, netdev, eric.dumazet

On Sat, Feb 10, 2024 at 12:23:30PM +0100, Eric Dumazet wrote:
> On Fri, Feb 9, 2024 at 11:24 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Fri,  9 Feb 2024 14:56:15 +0000 Eric Dumazet wrote:
> > > +     unsigned long ifindex = cb->args[0];
> >
> > [snip]
> >
> > > +     for_each_netdev_dump(tgt_net, dev, ifindex) {
> > > +             if (link_dump_filtered(dev, master_idx, kind_ops))
> > > +                     continue;
> > > +             err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
> > > +                                    NETLINK_CB(cb->skb).portid,
> > > +                                    nlh->nlmsg_seq, 0, flags,
> > > +                                    ext_filter_mask, 0, NULL, 0,
> > > +                                    netnsid, GFP_KERNEL);
> > > +
> > > +             if (err < 0)
> > > +                     break;
> > > +             cb->args[0] = ifindex + 1;
> >
> > Perhaps we can cast the context buffer onto something typed and use
> > it directly? I think it's a tiny bit less error prone:
> >
> >         struct {
> >                 unsigned long ifindex;
> >         } *ctx = (void *)cb->ctx;
> >
> > Then we can:
> >
> >         for_each_netdev_dump(tgt_net, dev, ctx->ifindex)
> >                                            ^^^^^^^^^^^^
> >
> > and not need to worry about saving the ifindex back to cb before
> > exiting.
> 
> Hi Jakub
> 
> I tried something like that (adding a common structure for future
> conversions), but this was not working properly.
> 
> Unfortunately we only can save the ifindex back to cb->XXXX only after
>  rtnl_fill_ifinfo() was a success.
> 
> For instance, after applying the following diff to my patch, we have a
> bug, because iip link loops on the last device.
> 
> We need to set cb->args[0] to  last_dev->ifindex + 1 to end the dump.

I was looking into that in the past because of another rtnetlink dump
handler. See merge commit f8d3e0dc4b3a ("Merge branch
'nexthop-nexthop-dump-fixes'") and commit 913f60cacda7 ("nexthop: Fix
infinite nexthop dump when using maximum nexthop ID").

Basically, rtnetlink dump handlers always return a positive value if
some entries were filled in the skb to signal that more information
needs to be dumped, even if the dump is complete. I suspect this was
done to ensure that appending the NLMSG_DONE message will not fail, but
Jason fixed it in 2017 in commit 0642840b8bb0 ("af_netlink: ensure that
NLMSG_DONE never fails in dumps").

You can see that the dump is split across two buffers with only a single
netdev configured:

# strace -e sendto,recvmsg ip l
sendto(3, [{nlmsg_len=40, nlmsg_type=0x12 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|0x300, nlmsg_seq=1707673609, nlmsg_pid=0}, "\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x1d\x00\x01\x00\x00\x00"], 40, 0, NULL, 0) = 40
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 764
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=764, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707673609, nlmsg_pid=565}, "\x00\x00\x04\x03\x01\x00\x00\x00\x49\x00\x01\x00\x00\x00\x00\x00\x07\x00\x03\x00\x6c\x6f\x00\x00\x08\x00\x0d\x00\xe8\x03\x00\x00"...], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 764
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 20
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707673609, nlmsg_pid=565}, 0], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
+++ exited with 0 +++

The fake sentinel ('last_dev->ifindex + 1') is needed so that in the
second invocation of the dump handler it will not fill anything and then
return zero to signal that the dump is complete.

The following diff avoids this inefficiency and returns zero when the
dump is complete:

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 31f433950c8d..4efd571a6a3f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2267,17 +2267,15 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 
                        if (err < 0) {
                                if (likely(skb->len))
-                                       goto out;
-
-                               goto out_err;
+                                       err = skb->len;
+                               goto out;
                        }
 cont:
                        idx++;
                }
        }
+
 out:
-       err = skb->len;
-out_err:
        cb->args[1] = idx;
        cb->args[0] = h;
        cb->seq = tgt_net->dev_base_seq;

You can see that both messages (RTM_NEWLINK and NLMSG_DONE) are dumped
in a single buffer with this patch:

# strace -e sendto,recvmsg ip l
sendto(3, [{nlmsg_len=40, nlmsg_type=0x12 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|0x300, nlmsg_seq=1707674313, nlmsg_pid=0}, "\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x1d\x00\x01\x00\x00\x00"], 40, 0, NULL, 0) = 40
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 784
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=764, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707674313, nlmsg_pid=570}, "\x00\x00\x04\x03\x01\x00\x00\x00\x49\x00\x01\x00\x00\x00\x00\x00\x07\x00\x03\x00\x6c\x6f\x00\x00\x08\x00\x0d\x00\xe8\x03\x00\x00"...], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707674313, nlmsg_pid=570}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 784
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
+++ exited with 0 +++

And then it's possible to apply your patch with Jakub's suggestion:

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4efd571a6a3f..dba13b31c58b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2188,25 +2188,22 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
 
 static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 {
+       const struct rtnl_link_ops *kind_ops = NULL;
        struct netlink_ext_ack *extack = cb->extack;
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
-       struct net *tgt_net = net;
-       int h, s_h;
-       int idx = 0, s_idx;
-       struct net_device *dev;
-       struct hlist_head *head;
+       unsigned int flags = NLM_F_MULTI;
        struct nlattr *tb[IFLA_MAX+1];
+       struct {
+               unsigned long ifindex;
+       } *ctx = (void *)cb->ctx;
+       struct net *tgt_net = net;
        u32 ext_filter_mask = 0;
-       const struct rtnl_link_ops *kind_ops = NULL;
-       unsigned int flags = NLM_F_MULTI;
+       struct net_device *dev;
        int master_idx = 0;
        int netnsid = -1;
        int err, i;
 
-       s_h = cb->args[0];
-       s_idx = cb->args[1];
-
        err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
        if (err < 0) {
                if (cb->strict_check)
@@ -2250,34 +2247,22 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
                flags |= NLM_F_DUMP_FILTERED;
 
 walk_entries:
-       for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
-               idx = 0;
-               head = &tgt_net->dev_index_head[h];
-               hlist_for_each_entry(dev, head, index_hlist) {
-                       if (link_dump_filtered(dev, master_idx, kind_ops))
-                               goto cont;
-                       if (idx < s_idx)
-                               goto cont;
-                       err = rtnl_fill_ifinfo(skb, dev, net,
-                                              RTM_NEWLINK,
-                                              NETLINK_CB(cb->skb).portid,
-                                              nlh->nlmsg_seq, 0, flags,
-                                              ext_filter_mask, 0, NULL, 0,
-                                              netnsid, GFP_KERNEL);
-
-                       if (err < 0) {
-                               if (likely(skb->len))
-                                       err = skb->len;
-                               goto out;
-                       }
-cont:
-                       idx++;
+       err = 0;
+       for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
+               if (link_dump_filtered(dev, master_idx, kind_ops))
+                       continue;
+               err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
+                                      NETLINK_CB(cb->skb).portid,
+                                      nlh->nlmsg_seq, 0, flags,
+                                      ext_filter_mask, 0, NULL, 0,
+                                      netnsid, GFP_KERNEL);
+
+               if (err < 0) {
+                       if (likely(skb->len))
+                               err = skb->len;
+                       break;
                }
        }
-
-out:
-       cb->args[1] = idx;
-       cb->args[0] = h;
        cb->seq = tgt_net->dev_base_seq;
        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
        if (netnsid >= 0)

And it will not hang:

# strace -e sendto,recvmsg ip l
sendto(3, [{nlmsg_len=40, nlmsg_type=0x12 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|0x300, nlmsg_seq=1707675119, nlmsg_pid=0}, "\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x1d\x00\x01\x00\x00\x00"], 40, 0, NULL, 0) = 40
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 784
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=764, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707675119, nlmsg_pid=564}, "\x00\x00\x04\x03\x01\x00\x00\x00\x49\x00\x01\x00\x00\x00\x00\x00\x07\x00\x03\x00\x6c\x6f\x00\x00\x08\x00\x0d\x00\xe8\x03\x00\x00"...], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707675119, nlmsg_pid=564}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 784
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
+++ exited with 0 +++

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH net-next 2/2] rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo()
  2024-02-11 18:29       ` Ido Schimmel
@ 2024-02-11 21:35         ` Eric Dumazet
  0 siblings, 0 replies; 8+ messages in thread
From: Eric Dumazet @ 2024-02-11 21:35 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: Jakub Kicinski, David S . Miller, Paolo Abeni, netdev, eric.dumazet

On Sun, Feb 11, 2024 at 7:29 PM Ido Schimmel <idosch@idosch.org> wrote:
>

>
> I was looking into that in the past because of another rtnetlink dump
> handler. See merge commit f8d3e0dc4b3a ("Merge branch
> 'nexthop-nexthop-dump-fixes'") and commit 913f60cacda7 ("nexthop: Fix
> infinite nexthop dump when using maximum nexthop ID").
>
> Basically, rtnetlink dump handlers always return a positive value if
> some entries were filled in the skb to signal that more information
> needs to be dumped, even if the dump is complete. I suspect this was
> done to ensure that appending the NLMSG_DONE message will not fail, but
> Jason fixed it in 2017 in commit 0642840b8bb0 ("af_netlink: ensure that
> NLMSG_DONE never fails in dumps").
>
> You can see that the dump is split across two buffers with only a single
> netdev configured:
>
> # strace -e sendto,recvmsg ip l
> sendto(3, [{nlmsg_len=40, nlmsg_type=0x12 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|0x300, nlmsg_seq=1707673609, nlmsg_pid=0}, "\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x1d\x00\x01\x00\x00\x00"], 40, 0, NULL, 0) = 40
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 764
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=764, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707673609, nlmsg_pid=565}, "\x00\x00\x04\x03\x01\x00\x00\x00\x49\x00\x01\x00\x00\x00\x00\x00\x07\x00\x03\x00\x6c\x6f\x00\x00\x08\x00\x0d\x00\xe8\x03\x00\x00"...], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 764
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 20
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707673609, nlmsg_pid=565}, 0], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20
> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
>     link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
> +++ exited with 0 +++
>
> The fake sentinel ('last_dev->ifindex + 1') is needed so that in the
> second invocation of the dump handler it will not fill anything and then
> return zero to signal that the dump is complete.
>
> The following diff avoids this inefficiency and returns zero when the
> dump is complete:
>
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index 31f433950c8d..4efd571a6a3f 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -2267,17 +2267,15 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
>
>                         if (err < 0) {
>                                 if (likely(skb->len))
> -                                       goto out;
> -
> -                               goto out_err;
> +                                       err = skb->len;
> +                               goto out;
>                         }
>  cont:
>                         idx++;
>                 }
>         }
> +
>  out:
> -       err = skb->len;
> -out_err:
>         cb->args[1] = idx;
>         cb->args[0] = h;
>         cb->seq = tgt_net->dev_base_seq;
>
> You can see that both messages (RTM_NEWLINK and NLMSG_DONE) are dumped
> in a single buffer with this patch:
>
> # strace -e sendto,recvmsg ip l
> sendto(3, [{nlmsg_len=40, nlmsg_type=0x12 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|0x300, nlmsg_seq=1707674313, nlmsg_pid=0}, "\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x1d\x00\x01\x00\x00\x00"], 40, 0, NULL, 0) = 40
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 784
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=764, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707674313, nlmsg_pid=570}, "\x00\x00\x04\x03\x01\x00\x00\x00\x49\x00\x01\x00\x00\x00\x00\x00\x07\x00\x03\x00\x6c\x6f\x00\x00\x08\x00\x0d\x00\xe8\x03\x00\x00"...], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707674313, nlmsg_pid=570}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 784
> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
>     link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
> +++ exited with 0 +++
>
> And then it's possible to apply your patch with Jakub's suggestion:
>
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index 4efd571a6a3f..dba13b31c58b 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -2188,25 +2188,22 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
>
>  static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
>  {
> +       const struct rtnl_link_ops *kind_ops = NULL;
>         struct netlink_ext_ack *extack = cb->extack;
>         const struct nlmsghdr *nlh = cb->nlh;
>         struct net *net = sock_net(skb->sk);
> -       struct net *tgt_net = net;
> -       int h, s_h;
> -       int idx = 0, s_idx;
> -       struct net_device *dev;
> -       struct hlist_head *head;
> +       unsigned int flags = NLM_F_MULTI;
>         struct nlattr *tb[IFLA_MAX+1];
> +       struct {
> +               unsigned long ifindex;
> +       } *ctx = (void *)cb->ctx;
> +       struct net *tgt_net = net;
>         u32 ext_filter_mask = 0;
> -       const struct rtnl_link_ops *kind_ops = NULL;
> -       unsigned int flags = NLM_F_MULTI;
> +       struct net_device *dev;
>         int master_idx = 0;
>         int netnsid = -1;
>         int err, i;
>
> -       s_h = cb->args[0];
> -       s_idx = cb->args[1];
> -
>         err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
>         if (err < 0) {
>                 if (cb->strict_check)
> @@ -2250,34 +2247,22 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
>                 flags |= NLM_F_DUMP_FILTERED;
>
>  walk_entries:
> -       for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
> -               idx = 0;
> -               head = &tgt_net->dev_index_head[h];
> -               hlist_for_each_entry(dev, head, index_hlist) {
> -                       if (link_dump_filtered(dev, master_idx, kind_ops))
> -                               goto cont;
> -                       if (idx < s_idx)
> -                               goto cont;
> -                       err = rtnl_fill_ifinfo(skb, dev, net,
> -                                              RTM_NEWLINK,
> -                                              NETLINK_CB(cb->skb).portid,
> -                                              nlh->nlmsg_seq, 0, flags,
> -                                              ext_filter_mask, 0, NULL, 0,
> -                                              netnsid, GFP_KERNEL);
> -
> -                       if (err < 0) {
> -                               if (likely(skb->len))
> -                                       err = skb->len;
> -                               goto out;
> -                       }
> -cont:
> -                       idx++;
> +       err = 0;
> +       for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
> +               if (link_dump_filtered(dev, master_idx, kind_ops))
> +                       continue;
> +               err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
> +                                      NETLINK_CB(cb->skb).portid,
> +                                      nlh->nlmsg_seq, 0, flags,
> +                                      ext_filter_mask, 0, NULL, 0,
> +                                      netnsid, GFP_KERNEL);
> +
> +               if (err < 0) {
> +                       if (likely(skb->len))
> +                               err = skb->len;
> +                       break;
>                 }
>         }
> -
> -out:
> -       cb->args[1] = idx;
> -       cb->args[0] = h;
>         cb->seq = tgt_net->dev_base_seq;
>         nl_dump_check_consistent(cb, nlmsg_hdr(skb));
>         if (netnsid >= 0)
>
> And it will not hang:
>
> # strace -e sendto,recvmsg ip l
> sendto(3, [{nlmsg_len=40, nlmsg_type=0x12 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|0x300, nlmsg_seq=1707675119, nlmsg_pid=0}, "\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x1d\x00\x01\x00\x00\x00"], 40, 0, NULL, 0) = 40
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 784
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=764, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707675119, nlmsg_pid=564}, "\x00\x00\x04\x03\x01\x00\x00\x00\x49\x00\x01\x00\x00\x00\x00\x00\x07\x00\x03\x00\x6c\x6f\x00\x00\x08\x00\x0d\x00\xe8\x03\x00\x00"...], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1707675119, nlmsg_pid=564}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 784
> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
>     link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
> +++ exited with 0 +++

Excellent, thanks for helping me on this ;)

I am sending a V2 right away.

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2024-02-11 21:35 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-09 14:56 [PATCH net-next 0/2] net: use net->dev_by_index in two places Eric Dumazet
2024-02-09 14:56 ` [PATCH net-next 1/2] vlan: use xarray iterator to implement /proc/net/vlan/config Eric Dumazet
2024-02-09 22:25   ` Jakub Kicinski
2024-02-09 14:56 ` [PATCH net-next 2/2] rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo() Eric Dumazet
2024-02-09 22:24   ` Jakub Kicinski
2024-02-10 11:23     ` Eric Dumazet
2024-02-11 18:29       ` Ido Schimmel
2024-02-11 21:35         ` Eric Dumazet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).