* [PATCH net-next v4] rtnetlink: Support fine-grained netdevice bulk deletion
From: Lahav Schlesinger @ 2021-12-02 17:45 UTC (permalink / raw)
  To: netdev; +Cc: kuba, dsahern

At large scale, some routers are required to support tens of thousands
of devices at once, both physical and virtual (e.g. loopbacks, tunnels,
VRFs, etc.).
At times such routers are required to delete a massive number of devices
at once, such as when a factory reset is performed on the router (causing
a deletion of all devices), when a configuration is restored after an
upgrade, or upon a request from an operator.

Currently there are two ways of deleting devices using netlink:
1. Deleting a single device (either by ifindex using ifinfomsg::ifi_index,
or by name using IFLA_IFNAME)
2. Deleting all devices that belong to a group (using IFLA_GROUP)

Deleting devices one-by-one has poor performance on a large number of
devices compared to "group deletion":
after all devices are handled, netdev_run_todo() is called, which
calls rcu_barrier() to finish any outstanding RCU callbacks that were
registered during the deletion of the devices, then waits until the
refcount of all the devices drops to 0, and then performs the final
cleanups.

However, rcu_barrier() is a very costly operation, with each call
taking on the order of tens of milliseconds.

When deleting a large number of devices one-by-one, rcu_barrier()
will be called once for each device being deleted.
As an example, the following benchmark deletes 10K loopback devices,
all of which are UP and have only an IPv6 LLA configured:

1. Deleting one-by-one using 1 thread  : 243 seconds
2. Deleting one-by-one using 10 threads: 70 seconds
3. Deleting one-by-one using 50 threads: 54 seconds
4. Deleting all using "group deletion" : 30 seconds

Note that even though the deletion logic takes place under the rtnl
lock, the call to rcu_barrier() happens outside the lock, which is why
using multiple threads yields some improvement.

However, while "group deletion" is the fastest, it is not suited for
deleting a large number of arbitrary devices that are not known ahead
of time. Furthermore, moving a large number of devices into a group is
also a costly operation.

This patch adds support for passing an arbitrary list of ifindexes of
devices to delete, using a new IFLA_IFINDEX attribute (a single message
may contain multiple instances of this attribute).
This gives more fine-grained control over which devices to delete,
while still resulting in rcu_barrier() being called only once.
Indeed, the timings of using this new API to delete 10K devices are
the same as those of the existing "group" deletion.

Signed-off-by: Lahav Schlesinger <lschlesinger@drivenets.com>
---
v3 -> v4
 - Change single IFLA_IFINDEX_LIST into multiple IFLA_IFINDEX
 - Fail if passing both IFLA_GROUP and at least one IFLA_IFINDEX

v2 -> v3
 - Rename 'ifindex_list' to 'ifindices', and pass it as int*
 - Clamp 'ops' variable in second loop.

v1 -> v2
 - Unset 'len' of IFLA_IFINDEX_LIST in policy.
 - Use __dev_get_by_index() instead of n^2 loop.
 - Return -ENODEV if any ifindex is not present.
 - Saved devices in an array.
 - Fix formatting.

 include/uapi/linux/if_link.h |  1 +
 net/core/rtnetlink.c         | 68 ++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index eebd3894fe89..68fcde9c0c5e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -348,6 +348,7 @@ enum {
 	IFLA_PARENT_DEV_NAME,
 	IFLA_PARENT_DEV_BUS_NAME,
 
+	IFLA_IFINDEX,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index fd030e02f16d..9d804866fe72 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1880,6 +1880,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED },
 	[IFLA_NEW_IFINDEX]	= NLA_POLICY_MIN(NLA_S32, 1),
 	[IFLA_PARENT_DEV_NAME]	= { .type = NLA_NUL_STRING },
+	[IFLA_IFINDEX]		= { .type = NLA_S32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -3050,6 +3051,65 @@ static int rtnl_group_dellink(const struct net *net, int group)
 	return 0;
 }
 
+static int rtnl_ifindex_dellink(struct net *net, struct nlattr *head, int len,
+				struct netlink_ext_ack *extack)
+{
+	int i = 0, num_devices = 0, rem;
+	struct net_device **dev_list;
+	const struct nlattr *nla;
+	LIST_HEAD(list_kill);
+	int ret;
+
+	nla_for_each_attr(nla, head, len, rem) {
+		if (nla_type(nla) == IFLA_IFINDEX)
+			num_devices++;
+	}
+
+	dev_list = kmalloc_array(num_devices, sizeof(*dev_list), GFP_KERNEL);
+	if (!dev_list)
+		return -ENOMEM;
+
+	nla_for_each_attr(nla, head, len, rem) {
+		const struct rtnl_link_ops *ops;
+		struct net_device *dev;
+		int ifindex;
+
+		if (nla_type(nla) != IFLA_IFINDEX)
+			continue;
+
+		ifindex = nla_get_s32(nla);
+		ret = -ENODEV;
+		dev = __dev_get_by_index(net, ifindex);
+		if (!dev) {
+			NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown ifindex");
+			goto out_free;
+		}
+
+		ret = -EOPNOTSUPP;
+		ops = dev->rtnl_link_ops;
+		if (!ops || !ops->dellink) {
+			NL_SET_ERR_MSG_ATTR(extack, nla, "Device cannot be deleted");
+			goto out_free;
+		}
+
+		dev_list[i++] = dev;
+	}
+
+	for (i = 0; i < num_devices; i++) {
+		struct net_device *dev = dev_list[i];
+
+		dev->rtnl_link_ops->dellink(dev, &list_kill);
+	}
+
+	unregister_netdevice_many(&list_kill);
+
+	ret = 0;
+
+out_free:
+	kfree(dev_list);
+	return ret;
+}
+
 int rtnl_delete_link(struct net_device *dev)
 {
 	const struct rtnl_link_ops *ops;
@@ -3093,6 +3153,11 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 			return PTR_ERR(tgt_net);
 	}
 
+	if (tb[IFLA_GROUP] && tb[IFLA_IFINDEX]) {
+		NL_SET_ERR_MSG(extack, "Can't pass both IFLA_GROUP and IFLA_IFINDEX");
+		return -EOPNOTSUPP;
+	}
+
 	err = -EINVAL;
 	ifm = nlmsg_data(nlh);
 	if (ifm->ifi_index > 0)
@@ -3102,6 +3167,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 				   tb[IFLA_ALT_IFNAME], NULL);
 	else if (tb[IFLA_GROUP])
 		err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));
+	else if (tb[IFLA_IFINDEX])
+		err = rtnl_ifindex_dellink(tgt_net, nlmsg_attrdata(nlh, sizeof(*ifm)),
+					   nlmsg_attrlen(nlh, sizeof(*ifm)), extack);
 	else
 		goto out;
 
-- 
2.25.1



* Re: [PATCH net-next v4] rtnetlink: Support fine-grained netdevice bulk deletion
From: Jakub Kicinski @ 2021-12-04  1:06 UTC (permalink / raw)
  To: dsahern, Nicolas Dichtel; +Cc: Lahav Schlesinger, netdev

On Thu,  2 Dec 2021 19:45:02 +0200 Lahav Schlesinger wrote:
> At large scale, some routers are required to support tens of thousands
> of devices at once, both physical and virtual (e.g. loopbacks, tunnels,
> VRFs, etc.).
> At times such routers are required to delete a massive number of devices
> at once, such as when a factory reset is performed on the router (causing
> a deletion of all devices), when a configuration is restored after an
> upgrade, or upon a request from an operator.

LGTM, David, Nicolas, ack?

https://lore.kernel.org/all/20211202174502.28903-1-lschlesinger@drivenets.com/


* Re: [PATCH net-next v4] rtnetlink: Support fine-grained netdevice bulk deletion
From: Nikolay Aleksandrov @ 2021-12-04 10:15 UTC (permalink / raw)
  To: Lahav Schlesinger, netdev; +Cc: kuba, dsahern

On 02/12/2021 19:45, Lahav Schlesinger wrote:
> [...]

I like the idea, but what happens if the same device is present twice or
more times? I mean, are you sure it is safe to call the dellink method of
all device types multiple times with the same device?

Cheers,
 Nik



* Re: [PATCH net-next v4] rtnetlink: Support fine-grained netdevice bulk deletion
From: David Ahern @ 2021-12-04 16:18 UTC (permalink / raw)
  To: Nikolay Aleksandrov, Lahav Schlesinger, netdev; +Cc: kuba

On 12/4/21 3:15 AM, Nikolay Aleksandrov wrote:
> 
> I like the idea, but what happens if the same device is present twice or more times?
> I mean are you sure it is safe to call dellink method of all device types multiple
> times with the same device?
> 

Very good point. Can't rely on dellink callbacks being able to handle
multiple calls.


* Re: [PATCH net-next v4] rtnetlink: Support fine-grained netdevice bulk deletion
From: Lahav Schlesinger @ 2021-12-05  8:28 UTC (permalink / raw)
  To: Nikolay Aleksandrov; +Cc: netdev, kuba, dsahern

On Sat, Dec 04, 2021 at 12:15:19PM +0200, Nikolay Aleksandrov wrote:
> I like the idea, but what happens if the same device is present twice or more times?
> I mean are you sure it is safe to call dellink method of all device types multiple
> times with the same device?
>
> Cheers,
>  Nik
>

Thanks for catching this. I initially went over a few dellink()
functions and they all seemed to be re-entrant, but evidently I missed
some...


* Re: [PATCH net-next v4] rtnetlink: Support fine-grained netdevice bulk deletion
From: Ido Schimmel @ 2021-12-05  9:53 UTC (permalink / raw)
  To: Lahav Schlesinger; +Cc: netdev, kuba, dsahern

On Thu, Dec 02, 2021 at 07:45:02PM +0200, Lahav Schlesinger wrote:
> [...]
>
> When deleting a large number of devices one-by-one, rcu_barrier()
> will be called once for each device being deleted.
> As an example, the following benchmark deletes 10K loopback devices,
> all of which are UP and have only an IPv6 LLA configured:
>
> 1. Deleting one-by-one using 1 thread  : 243 seconds
> 2. Deleting one-by-one using 10 threads: 70 seconds
> 3. Deleting one-by-one using 50 threads: 54 seconds
> 4. Deleting all using "group deletion" : 30 seconds
>
> [...]
>
> However, while "group deletion" is the fastest, it is not suited for
> deleting a large number of arbitrary devices that are not known ahead
> of time. Furthermore, moving a large number of devices into a group is
> also a costly operation.

These are the numbers I get in a VM running on my laptop.

Moving 16k dummy netdevs to a group:

# time -p ip -b group.batch 
real 1.91
user 0.04
sys 0.27

Deleting the group:

# time -p ip link del group 10
real 6.15
user 0.00
sys 3.02

IMO, these numbers do not justify a new API. Also, your user space can
be taught to create all the netdevs in the same group to begin with:

# ip link add name dummy1 group 10 type dummy
# ip link show dev dummy1
10: dummy1: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN mode DEFAULT group 10 qlen 1000
    link/ether 12:b6:7d:ff:48:99 brd ff:ff:ff:ff:ff:ff

Moreover, unlike the list API that is specific to deletion, the group
API also lets you batch set operations:

# ip link set group 10 mtu 2000
# ip link show dev dummy1
10: dummy1: <BROADCAST,NOARP> mtu 2000 qdisc noop state DOWN mode
DEFAULT group 10 qlen 1000
    link/ether 12:b6:7d:ff:48:99 brd ff:ff:ff:ff:ff:ff

If you are using namespaces, then during "factory reset" you can delete
the namespace which should trigger batch deletion of the netdevs inside
it.



* Re: [PATCH net-next v4] rtnetlink: Support fine-grained netdevice bulk deletion
From: Lahav Schlesinger @ 2021-12-05 12:11 UTC (permalink / raw)
  To: Ido Schimmel; +Cc: netdev, kuba, dsahern

On Sun, Dec 05, 2021 at 11:53:03AM +0200, Ido Schimmel wrote:
> On Thu, Dec 02, 2021 at 07:45:02PM +0200, Lahav Schlesinger wrote:
> > [...]
>
> These are the numbers I get in a VM running on my laptop.
>
> Moving 16k dummy netdevs to a group:
>
> # time -p ip -b group.batch
> real 1.91
> user 0.04
> sys 0.27
>
> Deleting the group:
>
> # time -p ip link del group 10
> real 6.15
> user 0.00
> sys 3.02
>

Hi Ido, in which state were the dummy devices in your tests before you
deleted them/changed their group?
When they are DOWN I get similar numbers to yours (16k devices):

# time ip -b group_16000_batch
real	0m0.640s
user	0m0.152s
sys	0m0.478s

# time ip link delete group 100
real	0m5.324s
user	0m0.017s
sys	0m4.991s

But when the devices are in state UP, I get:

# time ip -b group_16000_batch
real	0m48.605s
user	0m0.218s
sys	0m48.244s

# time ip link delete group 100
real	1m13.219s
user	0m0.010s
sys	1m9.117s

And for completeness, setting the devices to DOWN prior to deleting them
takes as long overall as deleting them while they're UP in the first
place.

Also, while this is probably a minuscule issue, changing the group of
tens of thousands of interfaces will result in a storm of netlink events
that will make any userspace program listening for link events spend
time handling them.  This results in twice as many events compared to
directly deleting the devices.

> IMO, these numbers do not justify a new API. Also, your user space can
> be taught to create all the netdevs in the same group to begin with:
>
> # ip link add name dummy1 group 10 type dummy
> # ip link show dev dummy1
> 10: dummy1: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN mode DEFAULT group 10 qlen 1000
>     link/ether 12:b6:7d:ff:48:99 brd ff:ff:ff:ff:ff:ff
>
> Moreover, unlike the list API that is specific to deletion, the group
> API also lets you batch set operations:
>
> # ip link set group 10 mtu 2000
> # ip link show dev dummy1
> 10: dummy1: <BROADCAST,NOARP> mtu 2000 qdisc noop state DOWN mode
> DEFAULT group 10 qlen 1000
>     link/ether 12:b6:7d:ff:48:99 brd ff:ff:ff:ff:ff:ff

The list API can be extended to support other operations as well
(similar to group set operations, we can call do_setlink() for each
device specified in an IFLA_IFINDEX).
I didn't implement it in this patch because we don't have a use for it
currently.

>
> If you are using namespaces, then during "factory reset" you can delete
> the namespace which should trigger batch deletion of the netdevs inside
> it.
>

In some scenarios we are required to delete only a subset of devices
(e.g. when a physical link goes DOWN, we need to delete all the VLANs
and any tunnels configured on that device).  Furthermore, a user is
allowed to load a new configuration in which they delete only some of
the devices (e.g. delete all of the loopbacks in the system), while not
touching the other devices.

> >
> > This patch adds support for passing an arbitrary list of ifindex of
> > devices to delete with a new IFLA_IFINDEX attribute. A single message
> > may contain multiple instances of this attribute).
> > This gives a more fine-grained control over which devices to delete,
> > while still resulting in rcu_barrier() being called only once.
> > Indeed, the timings of using this new API to delete 10K devices is
> > the same as using the existing "group" deletion.
> >
> > Signed-off-by: Lahav Schlesinger <lschlesinger@drivenets.com>


* Re: [PATCH net-next v4] rtnetlink: Support fine-grained netdevice bulk deletion
From: Ido Schimmel @ 2021-12-05 13:49 UTC (permalink / raw)
  To: Lahav Schlesinger; +Cc: netdev, kuba, dsahern

On Sun, Dec 05, 2021 at 02:11:00PM +0200, Lahav Schlesinger wrote:
> [...]
>
> Hi Ido, in which state were the dummy devices in your tests before you
> deleted them/changed their group?
> When they are DOWN I get similar numbers to yours (16k devices):
> 
> # time ip -b group_16000_batch
> real	0m0.640s
> user	0m0.152s
> sys	0m0.478s
> 
> # time ip link delete group 100
> real	0m5.324s
> user	0m0.017s
> sys	0m4.991s
> 
> But when the devices are in state UP, I get:
> 
> # time ip -b group_16000_batch
> real	0m48.605s
> user	0m0.218s
> sys	0m48.244s
> 
> # time ip link delete group 100
> real	1m13.219s
> user	0m0.010s
> sys	1m9.117s
> 
> And for completeness, setting the devices to DOWN prior to deleting them
> is as fast as deleting them in the first place while they're UP.
> 
> Also, while this is probably a minuscule issue, changing the group of
> tens of thousands of interfaces will result in a storm of netlink events
> that will make any userspace program listening for link events spend
> time handling them.  This results in twice as many events compared to
> directly deleting the devices.

Yes, in my setup the netdevs were down. Looking at the code, I think the
reason for the 75x increase in latency is the fact that netlink
notifications are not generated when the netdev is down. See
netdev_state_change().

In your specific case, it is quite useless for the kernel to generate
16k notifications when moving the netdevs to a group since the entire
reason they are moved to a group is so that they could be deleted in a
batch.

I assume that there are other use cases where having the kernel suppress
notifications can be useful. Did you consider adding such a flag to the
request? I think such a mechanism is more generic/useful than an ad-hoc
API to delete a list of netdevs and should allow you to utilize the
existing group deletion mechanism.

> 
> > IMO, these numbers do not justify a new API. Also, your user space can
> > be taught to create all the netdevs in the same group to begin with:
> >
> > # ip link add name dummy1 group 10 type dummy
> > # ip link show dev dummy1
> > 10: dummy1: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN mode DEFAULT group 10 qlen 1000
> >     link/ether 12:b6:7d:ff:48:99 brd ff:ff:ff:ff:ff:ff
> >
> > Moreover, unlike the list API that is specific to deletion, the group
> > API also lets you batch set operations:
> >
> > # ip link set group 10 mtu 2000
> > # ip link show dev dummy1
> > 10: dummy1: <BROADCAST,NOARP> mtu 2000 qdisc noop state DOWN mode
> > DEFAULT group 10 qlen 1000
> >     link/ether 12:b6:7d:ff:48:99 brd ff:ff:ff:ff:ff:ff
> 
> The list API can be extended to support other operations as well
> (similar to group set operations, we can call do_setlink() for each
> device specified in an IFLA_IFINDEX).
> I didn't implement it in this patch because we don't have a use for it
> currently.
> 
> >
> > If you are using namespaces, then during "factory reset" you can delete
> > the namespace which should trigger batch deletion of the netdevs inside
> > it.
> >
> 
> In some scenarios we are required to delete only a subset of devices
> (e.g. when a physical link becomes DOWN, we need to delete all the VLANs
> and any tunnels configured on that device).  Furthermore, a user is
> allowed to load a new configuration in which they delete only some of the
> devices (e.g. delete all of the loopbacks in the system), while not
> touching the other devices.
> 
> > >
> > > This patch adds support for passing an arbitrary list of ifindices of
> > > devices to delete with a new IFLA_IFINDEX attribute. A single message
> > > may contain multiple instances of this attribute.
> > > This gives more fine-grained control over which devices to delete,
> > > while still resulting in rcu_barrier() being called only once.
> > > Indeed, the timings of using this new API to delete 10K devices are
> > > the same as using the existing "group" deletion.
> > >
> > > Signed-off-by: Lahav Schlesinger <lschlesinger@drivenets.com>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next v4] rtnetlink: Support fine-grained netdevice bulk deletion
  2021-12-05 13:49     ` Ido Schimmel
@ 2021-12-05 15:05       ` Lahav Schlesinger
  2021-12-06  8:07         ` Nicolas Dichtel
  2021-12-07  3:56       ` David Ahern
  1 sibling, 1 reply; 11+ messages in thread
From: Lahav Schlesinger @ 2021-12-05 15:05 UTC (permalink / raw)
  To: Ido Schimmel; +Cc: netdev, kuba, dsahern

On Sun, Dec 05, 2021 at 03:49:59PM +0200, Ido Schimmel wrote:
> On Sun, Dec 05, 2021 at 02:11:00PM +0200, Lahav Schlesinger wrote:
> > On Sun, Dec 05, 2021 at 11:53:03AM +0200, Ido Schimmel wrote:
> > > On Thu, Dec 02, 2021 at 07:45:02PM +0200, Lahav Schlesinger wrote:
> > > > Under large scale, some routers are required to support tens of thousands
> > > > of devices at once, both physical and virtual (e.g. loopbacks, tunnels,
> > > > vrfs, etc).
> > > > At times such routers are required to delete massive amounts of devices
> > > > at once, such as when a factory reset is performed on the router (causing
> > > > a deletion of all devices), or when a configuration is restored after an
> > > > upgrade, or as a request from an operator.
> > > >
> > > > Currently there are 2 means of deleting devices using Netlink:
> > > > 1. Deleting a single device (either by ifindex using ifinfomsg::ifi_index,
> > > > or by name using IFLA_IFNAME)
> > > > 2. Deleting all devices that belong to a group (using IFLA_GROUP)
> > > >
> > > > Deletion of devices one-by-one has poor performance at a large scale
> > > > of devices compared to "group deletion":
> > > > After all devices are handled, netdev_run_todo() is called, which
> > > > calls rcu_barrier() to finish any outstanding RCU callbacks that were
> > > > registered during the deletion of the devices, then waits until the
> > > > refcount of all the devices is 0, then performs final cleanups.
> > > >
> > > > However, calling rcu_barrier() is a very costly operation, with each
> > > > call taking on the order of tens of milliseconds.
> > > >
> > > > When deleting a large number of devices one-by-one, rcu_barrier()
> > > > will be called for each device being deleted.
> > > > As an example, the following benchmark deletes 10K loopback devices,
> > > > all of which are UP and with only an IPv6 LLA configured:
> > > >
> > > > 1. Deleting one-by-one using 1 thread  : 243 seconds
> > > > 2. Deleting one-by-one using 10 threads: 70 seconds
> > > > 3. Deleting one-by-one using 50 threads: 54 seconds
> > > > 4. Deleting all using "group deletion": 30 seconds
> > > >
> > > > Note that even though the deletion logic takes place under the rtnl
> > > > lock, since the call to rcu_barrier() is outside the lock we gain
> > > > some improvements.
> > > >
> > > > But, while "group deletion" is the fastest, it is not suited for
> > > > deleting a large number of arbitrary devices which are unknown ahead
> > > > of time. Furthermore, moving a large number of devices to a group is
> > > > also a costly operation.
> > >
> > > These are the numbers I get in a VM running on my laptop.
> > >
> > > Moving 16k dummy netdevs to a group:
> > >
> > > # time -p ip -b group.batch
> > > real 1.91
> > > user 0.04
> > > sys 0.27
> > >
> > > Deleting the group:
> > >
> > > # time -p ip link del group 10
> > > real 6.15
> > > user 0.00
> > > sys 3.02
> > >
> >
> > Hi Ido, in your tests, in which state were the dummy devices before
> > deleting/changing the group?
> > When they are DOWN I get similar numbers to yours (16k devices):
> >
> > # time ip -b group_16000_batch
> > real  0m0.640s
> > user  0m0.152s
> > sys   0m0.478s
> >
> > # time ip link delete group 100
> > real  0m5.324s
> > user  0m0.017s
> > sys   0m4.991s
> >
> > But when the devices are in state UP, I get:
> >
> > # time ip -b group_16000_batch
> > real  0m48.605s
> > user  0m0.218s
> > sys   0m48.244s
> >
> > # time ip link delete group 100
> > real  1m13.219s
> > user  0m0.010s
> > sys   1m9.117s
> >
> > And for completeness, setting the devices DOWN prior to deleting them
> > takes as long overall as deleting them directly while they're UP.
> >
> > Also, while this is probably a minuscule issue, changing the group of
> > tens of thousands of interfaces will result in a storm of netlink
> > events, forcing any userspace program listening for link events to
> > spend time handling them.  This will result in twice as many events
> > compared to deleting the devices directly.
>
> Yes, in my setup the netdevs were down. Looking at the code, I think the
> reason for the 75x increase in latency is the fact that netlink
> notifications are not generated when the netdev is down. See
> netdev_state_change().
>
> In your specific case, it is quite useless for the kernel to generate
> 16k notifications when moving the netdevs to a group since the entire
> reason they are moved to a group is so that they could be deleted in a
> batch.
>
> I assume that there are other use cases where having the kernel suppress
> notifications can be useful. Did you consider adding such a flag to the
> request? I think such a mechanism is more generic/useful than an ad-hoc
> API to delete a list of netdevs and should allow you to utilize the
> existing group deletion mechanism.

I think having an API to suppress kernel notifications will be abused by
userspace and introduce hard-to-debug bugs, e.g. some program will
incorrectly set this flag when it shouldn't (on the premise that this
flag will "make things faster") and inadvertently break other programs
that depend on the notifications to function.

Furthermore, such ability can be used by malicious programs to hide
their activity, effectively masking themselves from Intrusion Detection
Systems that monitor these notifications.
Allowing this flag _only_ for group change can still be used maliciously,
e.g. a malicious program can covertly change the group of a device to a
"management group" (something that will usually be detected by an IDS),
and depend on a trusted program that performs bulk group operations on
said group to change some configuration of that device.

The last point might be a bit of a stretch, but I imagine there are more
creative scenarios where such a flag could be used maliciously.

>
> >
> > > IMO, these numbers do not justify a new API. Also, your user space can
> > > be taught to create all the netdevs in the same group to begin with:
> > >
> > > # ip link add name dummy1 group 10 type dummy
> > > # ip link show dev dummy1
> > > 10: dummy1: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN mode DEFAULT group 10 qlen 1000
> > >     link/ether 12:b6:7d:ff:48:99 brd ff:ff:ff:ff:ff:ff
> > >
> > > Moreover, unlike the list API that is specific to deletion, the group
> > > API also lets you batch set operations:
> > >
> > > # ip link set group 10 mtu 2000
> > > # ip link show dev dummy1
> > > 10: dummy1: <BROADCAST,NOARP> mtu 2000 qdisc noop state DOWN mode
> > > DEFAULT group 10 qlen 1000
> > >     link/ether 12:b6:7d:ff:48:99 brd ff:ff:ff:ff:ff:ff
> >
> > The list API can be extended to support other operations as well
> > (similar to group set operations, we can call do_setlink() for each
> > device specified in an IFLA_IFINDEX).
> > I didn't implement it in this patch because we don't have a use for it
> > currently.
> >
> > >
> > > If you are using namespaces, then during "factory reset" you can delete
> > > the namespace which should trigger batch deletion of the netdevs inside
> > > it.
> > >
> >
> > In some scenarios we are required to delete only a subset of devices
> > (e.g. when a physical link becomes DOWN, we need to delete all the VLANs
> > and any tunnels configured on that device).  Furthermore, a user is
> > allowed to load a new configuration in which they delete only some of the
> > devices (e.g. delete all of the loopbacks in the system), while not
> > touching the other devices.
> >
> > > >
> > > > This patch adds support for passing an arbitrary list of ifindices of
> > > > devices to delete with a new IFLA_IFINDEX attribute. A single message
> > > > may contain multiple instances of this attribute.
> > > > This gives more fine-grained control over which devices to delete,
> > > > while still resulting in rcu_barrier() being called only once.
> > > > Indeed, the timings of using this new API to delete 10K devices are
> > > > the same as using the existing "group" deletion.
> > > >
> > > > Signed-off-by: Lahav Schlesinger <lschlesinger@drivenets.com>
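
[As a rough illustration of the proposed request format, the sketch below
builds an RTM_DELLINK message carrying one IFLA_IFINDEX attribute per
device. This is a hedged sketch: the IFLA_IFINDEX attribute comes from
this unmerged patch, and its numeric value (57) is an assumption, not a
mainline constant; the nlmsghdr/ifinfomsg/rtattr layouts follow the
standard netlink headers.]

```python
import struct

# Constants from <linux/netlink.h> / <linux/rtnetlink.h>
RTM_DELLINK = 17
NLM_F_REQUEST = 0x1
NLM_F_ACK = 0x4
IFLA_IFINDEX = 57  # hypothetical: the attribute was only proposed in this patch

def rtattr(attr_type, payload):
    """Encode a struct rtattr header plus payload, padded to 4 bytes."""
    length = 4 + len(payload)
    pad = (4 - length % 4) % 4
    return struct.pack("=HH", length, attr_type) + payload + b"\x00" * pad

def build_bulk_dellink(ifindexes, seq=1):
    """Sketch of a single RTM_DELLINK request listing several devices,
    one IFLA_IFINDEX attribute per ifindex, as the patch proposes."""
    ifi = struct.pack("=BBHiII", 0, 0, 0, 0, 0, 0)  # struct ifinfomsg, zeroed
    attrs = b"".join(rtattr(IFLA_IFINDEX, struct.pack("=i", idx))
                     for idx in ifindexes)
    payload = ifi + attrs
    # struct nlmsghdr: len, type, flags, seq, pid
    nlh = struct.pack("=IHHII", 16 + len(payload), RTM_DELLINK,
                      NLM_F_REQUEST | NLM_F_ACK, seq, 0)
    return nlh + payload

msg = build_bulk_dellink([10, 11, 12])
# 16-byte nlmsghdr + 16-byte ifinfomsg + 3 * 8-byte attributes
print(len(msg))  # → 56
```

The kernel would then process all listed devices under one rtnl hold and
pay the rcu_barrier() cost once, which is the whole point of the patch.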

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next v4] rtnetlink: Support fine-grained netdevice bulk deletion
  2021-12-05 15:05       ` Lahav Schlesinger
@ 2021-12-06  8:07         ` Nicolas Dichtel
  0 siblings, 0 replies; 11+ messages in thread
From: Nicolas Dichtel @ 2021-12-06  8:07 UTC (permalink / raw)
  To: Lahav Schlesinger, Ido Schimmel; +Cc: netdev, kuba, dsahern

On 05/12/2021 at 16:05, Lahav Schlesinger wrote:
[snip]
>> In your specific case, it is quite useless for the kernel to generate
>> 16k notifications when moving the netdevs to a group since the entire
>> reason they are moved to a group is so that they could be deleted in a
>> batch.
>>
>> I assume that there are other use cases where having the kernel suppress
>> notifications can be useful. Did you consider adding such a flag to the
>> request? I think such a mechanism is more generic/useful than an ad-hoc
>> API to delete a list of netdevs and should allow you to utilize the
>> existing group deletion mechanism.
> 
> I think having an API to suppress kernel notifications will be abused by
> userspace and introduce hard-to-debug bugs, e.g. some program will
> incorrectly set this flag when it shouldn't (on the premise that this
> flag will "make things faster") and inadvertently break other programs
> that depend on the notifications to function.
+1

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next v4] rtnetlink: Support fine-grained netdevice bulk deletion
  2021-12-05 13:49     ` Ido Schimmel
  2021-12-05 15:05       ` Lahav Schlesinger
@ 2021-12-07  3:56       ` David Ahern
  1 sibling, 0 replies; 11+ messages in thread
From: David Ahern @ 2021-12-07  3:56 UTC (permalink / raw)
  To: Ido Schimmel, Lahav Schlesinger; +Cc: netdev, kuba

On 12/5/21 6:49 AM, Ido Schimmel wrote:
> In your specific case, it is quite useless for the kernel to generate
> 16k notifications when moving the netdevs to a group since the entire
> reason they are moved to a group is so that they could be deleted in a
> batch.
> 
> I assume that there are other use cases where having the kernel suppress
> notifications can be useful. Did you consider adding such a flag to the
> request? I think such a mechanism is more generic/useful than an ad-hoc
> API to delete a list of netdevs and should allow you to utilize the
> existing group deletion mechanism.

I do agree that we do not want to allow netdev changes without userspace
notifications.

It would be nice to re-use the group delete API, but really there is no
guaranteed, reserved group value that can be used for a temporary
assignment followed by the group delete call.

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2021-12-07  3:56 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-02 17:45 [PATCH net-next v4] rtnetlink: Support fine-grained netdevice bulk deletion Lahav Schlesinger
2021-12-04  1:06 ` Jakub Kicinski
2021-12-04 10:15 ` Nikolay Aleksandrov
2021-12-04 16:18   ` David Ahern
2021-12-05  8:28   ` Lahav Schlesinger
2021-12-05  9:53 ` Ido Schimmel
2021-12-05 12:11   ` Lahav Schlesinger
2021-12-05 13:49     ` Ido Schimmel
2021-12-05 15:05       ` Lahav Schlesinger
2021-12-06  8:07         ` Nicolas Dichtel
2021-12-07  3:56       ` David Ahern
