All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC net-next] openvswitch: Introduce per-cpu upcall dispatch
@ 2021-04-30 15:33 Mark Gray
  2021-05-19 21:47 ` Pravin Shelar
  2021-05-28 19:49 ` [ovs-dev] " Flavio Leitner
  0 siblings, 2 replies; 8+ messages in thread
From: Mark Gray @ 2021-04-30 15:33 UTC (permalink / raw)
  To: netdev, dev; +Cc: Mark Gray

The Open vSwitch kernel module uses the upcall mechanism to send
packets from kernel space to user space when it misses in the kernel
space flow table. The upcall sends packets via a Netlink socket.
Currently, a Netlink socket is created for every vport. In this way,
there is a 1:1 mapping between a vport and a Netlink socket.
When a packet is received by a vport, if it needs to be sent to
user space, it is sent via the corresponding Netlink socket.

This mechanism, with various iterations of the corresponding user
space code, has seen some limitations and issues:

* On systems with a large number of vports, there is a correspondingly
large number of Netlink sockets which can limit scaling.
(https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
* Packet reordering on upcalls.
(https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
* A thundering herd issue.
(https://bugzilla.redhat.com/show_bug.cgi?id=1834444)

This patch introduces an alternative, feature-negotiated, upcall
mode using a per-cpu dispatch rather than a per-vport dispatch.

In this mode, the Netlink socket to be used for the upcall is
selected based on the CPU of the thread that is executing the upcall.
In this way, it resolves the issues above as:

a) The number of Netlink sockets scales with the number of CPUs
rather than the number of vports.
b) Per-flow ordering is maintained: packets are distributed to CPUs
by mechanisms such as RSS, and each CPU's upcalls are delivered to a
single user space thread.
c) Packets from a flow can only wake up one user space thread.

The corresponding user space code can be found at:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-April/382618.html

Bugzilla: https://bugzilla.redhat.com/1844576
Signed-off-by: Mark Gray <mark.d.gray@redhat.com>
---
 include/uapi/linux/openvswitch.h |  8 ++++
 net/openvswitch/datapath.c       | 70 +++++++++++++++++++++++++++++++-
 net/openvswitch/datapath.h       | 18 ++++++++
 net/openvswitch/flow_netlink.c   |  4 --
 4 files changed, 94 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 8d16744edc31..6571b57b2268 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -70,6 +70,8 @@ enum ovs_datapath_cmd {
  * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
  * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
  * not be sent.
+ * @OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when the
+ * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
  * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
  * datapath.  Always present in notifications.
  * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the
@@ -87,6 +89,9 @@ enum ovs_datapath_attr {
 	OVS_DP_ATTR_USER_FEATURES,	/* OVS_DP_F_*  */
 	OVS_DP_ATTR_PAD,
 	OVS_DP_ATTR_MASKS_CACHE_SIZE,
+	OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls in per-cpu
+				     * dispatch mode
+				     */
 	__OVS_DP_ATTR_MAX
 };
 
@@ -127,6 +132,9 @@ struct ovs_vport_stats {
 /* Allow tc offload recirc sharing */
 #define OVS_DP_F_TC_RECIRC_SHARING	(1 << 2)
 
+/* Allow per-cpu dispatch of upcalls */
+#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU	(1 << 3)
+
 /* Fixed logical ports. */
 #define OVSP_LOCAL      ((__u32)0)
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 9d6ef6cb9b26..98d54f41fdaa 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -121,6 +121,8 @@ int lockdep_ovsl_is_held(void)
 #endif
 
 static struct vport *new_vport(const struct vport_parms *);
+static u32 ovs_dp_get_upcall_portid(const struct datapath *, uint32_t);
+static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *);
 static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
 			     const struct sw_flow_key *,
 			     const struct dp_upcall_info *,
@@ -238,7 +240,12 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
 
 		memset(&upcall, 0, sizeof(upcall));
 		upcall.cmd = OVS_PACKET_CMD_MISS;
-		upcall.portid = ovs_vport_find_upcall_portid(p, skb);
+
+		if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
+			upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id());
+		else
+			upcall.portid = ovs_vport_find_upcall_portid(p, skb);
+
 		upcall.mru = OVS_CB(skb)->mru;
 		error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
 		if (unlikely(error))
@@ -1590,16 +1597,67 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb,
 
 DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support);
 
+/* Install the u32 portid array carried in @ids as dp's per-CPU upcall table.
+ * Returns 0 on success, -EINVAL on a bad payload length, -ENOMEM on OOM.
+ */
+static int ovs_dp_set_upcall_portids(struct datapath *dp,
+				     const struct nlattr *ids)
+{
+	struct dp_portids *old, *dp_portids;
+
+	if (!nla_len(ids) || nla_len(ids) % sizeof(u32))
+		return -EINVAL;
+
+	dp_portids = kmalloc(sizeof(*dp_portids) + nla_len(ids), GFP_KERNEL);
+	if (!dp_portids)	/* check the new allocation, not 'dp' */
+		return -ENOMEM;
+
+	dp_portids->n_ids = nla_len(ids) / sizeof(u32);
+	nla_memcpy(dp_portids->ids, ids, nla_len(ids));
+	/* Publish the new table; free the old one after a grace period. */
+	old = ovsl_dereference(dp->upcall_portids);
+	rcu_assign_pointer(dp->upcall_portids, dp_portids);
+	if (old)
+		kfree_rcu(old, rcu);
+	return 0;
+}
+
+/* Pick the upcall Netlink portid for @cpu_id. Returns 0 when per-CPU
+ * dispatch is not enabled or no portids have been configured.
+ */
+static u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id)
+{
+	struct dp_portids *dp_portids;
+
+	if (!(dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU))
+		return 0;
+
+	dp_portids = rcu_dereference_ovsl(dp->upcall_portids);
+	if (!dp_portids || !dp_portids->n_ids)
+		return 0;
+
+	if (cpu_id < dp_portids->n_ids)
+		return dp_portids->ids[cpu_id];
+
+	/* Fewer portids than CPUs seen by the kernel: log it and fall back to
+	 * socket 0 rather than dropping the packet.
+	 */
+	pr_info_ratelimited("cpu_id mismatch with handler threads\n");
+	return dp_portids->ids[0];
+}
+
 static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
 {
 	u32 user_features = 0;
+	int err;
 
 	if (a[OVS_DP_ATTR_USER_FEATURES]) {
 		user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
 
 		if (user_features & ~(OVS_DP_F_VPORT_PIDS |
 				      OVS_DP_F_UNALIGNED |
-				      OVS_DP_F_TC_RECIRC_SHARING))
+				      OVS_DP_F_TC_RECIRC_SHARING |
+				      OVS_DP_F_DISPATCH_UPCALL_PER_CPU))
 			return -EOPNOTSUPP;
 
 #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
@@ -1620,6 +1678,14 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
 
 	dp->user_features = user_features;
 
+	if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU &&
+	    a[OVS_DP_ATTR_PER_CPU_PIDS]) {
+		/* Upcall Netlink Port IDs have been updated */
+		err = ovs_dp_set_upcall_portids(dp, a[OVS_DP_ATTR_PER_CPU_PIDS]);
+		if (err)
+			return err;
+	}
+
 	if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
 		static_branch_enable(&tc_recirc_sharing_support);
 	else
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 38f7d3e66ca6..6003eba81958 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -50,6 +50,21 @@ struct dp_stats_percpu {
 	struct u64_stats_sync syncp;
 };
 
+/**
+ * struct dp_portids - array of Netlink portids for a datapath.
+ *                     This is used when OVS_DP_F_DISPATCH_UPCALL_PER_CPU
+ *                     is enabled and must be protected by RCU.
+ * @rcu: RCU callback head for deferred destruction.
+ * @n_ids: Number of entries in the @ids array.
+ * @ids: Array storing the Netlink socket PIDs indexed by CPU ID for packets
+ *       that miss the flow table.
+ */
+struct dp_portids {
+	struct rcu_head rcu;
+	u32 n_ids;
+	u32 ids[];
+};
+
 /**
  * struct datapath - datapath for flow-based packet switching
  * @rcu: RCU callback head for deferred destruction.
@@ -61,6 +76,7 @@ struct dp_stats_percpu {
  * @net: Reference to net namespace.
  * @max_headroom: the maximum headroom of all vports in this datapath; it will
  * be used by all the internal vports in this dp.
+ * @upcall_portids: RCU protected 'struct dp_portids'.
  *
  * Context: See the comment on locking at the top of datapath.c for additional
  * locking information.
@@ -87,6 +103,8 @@ struct datapath {
 
 	/* Switch meters. */
 	struct dp_meter_table meter_tbl;
+
+	struct dp_portids __rcu *upcall_portids;
 };
 
 /**
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index fd1f809e9bc1..97242bc1d960 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2928,10 +2928,6 @@ static int validate_userspace(const struct nlattr *attr)
 	if (error)
 		return error;
 
-	if (!a[OVS_USERSPACE_ATTR_PID] ||
-	    !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
-		return -EINVAL;
-
 	return 0;
 }
 
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread
* Re: [RFC net-next] openvswitch: Introduce per-cpu upcall dispatch
  2021-04-30 15:33 [RFC net-next] openvswitch: Introduce per-cpu upcall dispatch Mark Gray
@ 2021-05-03  9:19 ` Dan Carpenter
  2021-05-28 19:49 ` [ovs-dev] " Flavio Leitner
  1 sibling, 0 replies; 8+ messages in thread
From: kernel test robot @ 2021-04-30 19:29 UTC (permalink / raw)
  To: kbuild

[-- Attachment #1: Type: text/plain, Size: 3054 bytes --]

CC: kbuild-all(a)lists.01.org
In-Reply-To: <20210430153325.28322-1-mark.d.gray@redhat.com>
References: <20210430153325.28322-1-mark.d.gray@redhat.com>
TO: Mark Gray <mark.d.gray@redhat.com>

Hi Mark,

[FYI, it's a private test report for your RFC patch.]
[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Mark-Gray/openvswitch-Introduce-per-cpu-upcall-dispatch/20210430-233534
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 4a52dd8fefb45626dace70a63c0738dbd83b7edb
:::::: branch date: 4 hours ago
:::::: commit date: 4 hours ago
config: i386-randconfig-m021-20210430 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>

smatch warnings:
net/openvswitch/datapath.c:1612 ovs_dp_set_upcall_portids() warn: variable dereferenced before check 'dp' (see line 1608)
net/openvswitch/datapath.c:1613 ovs_dp_set_upcall_portids() warn: possible memory leak of 'dp_portids'

vim +/dp +1612 net/openvswitch/datapath.c

95a7233c452a58 Paul Blakey 2019-09-04  1599  
96349cd061e4c2 Mark Gray   2021-04-30  1600  static int ovs_dp_set_upcall_portids(struct datapath *dp,
96349cd061e4c2 Mark Gray   2021-04-30  1601  				     const struct nlattr *ids)
96349cd061e4c2 Mark Gray   2021-04-30  1602  {
96349cd061e4c2 Mark Gray   2021-04-30  1603  	struct dp_portids *old, *dp_portids;
96349cd061e4c2 Mark Gray   2021-04-30  1604  
96349cd061e4c2 Mark Gray   2021-04-30  1605  	if (!nla_len(ids) || nla_len(ids) % sizeof(u32))
96349cd061e4c2 Mark Gray   2021-04-30  1606  		return -EINVAL;
96349cd061e4c2 Mark Gray   2021-04-30  1607  
96349cd061e4c2 Mark Gray   2021-04-30 @1608  	old = ovsl_dereference(dp->upcall_portids);
96349cd061e4c2 Mark Gray   2021-04-30  1609  
96349cd061e4c2 Mark Gray   2021-04-30  1610  	dp_portids = kmalloc(sizeof(*dp_portids) + nla_len(ids),
96349cd061e4c2 Mark Gray   2021-04-30  1611  			     GFP_KERNEL);
96349cd061e4c2 Mark Gray   2021-04-30 @1612  	if (!dp)
96349cd061e4c2 Mark Gray   2021-04-30 @1613  		return -ENOMEM;
96349cd061e4c2 Mark Gray   2021-04-30  1614  
96349cd061e4c2 Mark Gray   2021-04-30  1615  	dp_portids->n_ids = nla_len(ids) / sizeof(u32);
96349cd061e4c2 Mark Gray   2021-04-30  1616  	nla_memcpy(dp_portids->ids, ids, nla_len(ids));
96349cd061e4c2 Mark Gray   2021-04-30  1617  
96349cd061e4c2 Mark Gray   2021-04-30  1618  	rcu_assign_pointer(dp->upcall_portids, dp_portids);
96349cd061e4c2 Mark Gray   2021-04-30  1619  
96349cd061e4c2 Mark Gray   2021-04-30  1620  	if (old)
96349cd061e4c2 Mark Gray   2021-04-30  1621  		kfree_rcu(old, rcu);
96349cd061e4c2 Mark Gray   2021-04-30  1622  	return 0;
96349cd061e4c2 Mark Gray   2021-04-30  1623  }
96349cd061e4c2 Mark Gray   2021-04-30  1624  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 30829 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2021-06-30  9:44 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-30 15:33 [RFC net-next] openvswitch: Introduce per-cpu upcall dispatch Mark Gray
2021-05-19 21:47 ` Pravin Shelar
2021-06-30  9:44   ` Mark Gray
2021-05-28 19:49 ` [ovs-dev] " Flavio Leitner
2021-06-30  9:44   ` Mark Gray
2021-04-30 19:29 kernel test robot
2021-05-03  9:19 ` Dan Carpenter
2021-06-30  9:43 ` Mark Gray

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.