* [PATCH 0/7] net: openvswitch: Reduce stack usage
@ 2023-10-11  3:43 Nicholas Piggin
  2023-10-11  3:43 ` [PATCH 1/7] net: openvswitch: generalise the per-cpu flow key allocation stack Nicholas Piggin
                   ` (8 more replies)
  0 siblings, 9 replies; 15+ messages in thread
From: Nicholas Piggin @ 2023-10-11  3:43 UTC (permalink / raw)
  To: netdev
  Cc: Nicholas Piggin, dev, Pravin B Shelar, Aaron Conole,
	Eelco Chaudron, Ilya Maximets, Flavio Leitner

Hi,

I'll post this out again to keep discussion going. Thanks all for the
testing and comments so far.

Changes since the RFC
https://lore.kernel.org/netdev/20230927001308.749910-1-npiggin@gmail.com/

- Replace slab allocations for flow keys by expanding the use
  of the per-CPU key allocator to ovs_vport_receive.

- Drop patch 1 in favour of Ilya's, since they did the same thing
  (Ilya's is added as patch 3).

- Change push_nsh stack reduction from slab allocation to per-cpu
  buffer.

- Drop the ovs_fragment stack usage reduction for now since it used
  slab and was a bit more complicated.

I posted an initial version of the per-cpu flow allocator patch in
the RFC thread. Since then I cleaned up some debug code and increased
the allocator size to accommodate the additional user of it.

Thanks,
Nick

Ilya Maximets (1):
  openvswitch: reduce stack usage in do_execute_actions

Nicholas Piggin (6):
  net: openvswitch: generalise the per-cpu flow key allocation stack
  net: openvswitch: Use flow key allocator in ovs_vport_receive
  net: openvswitch: Reduce push_nsh stack usage
  net: openvswitch: uninline action execution
  net: openvswitch: uninline ovs_fragment to control stack usage
  net: openvswitch: Reduce stack usage in ovs_dp_process_packet

 net/openvswitch/actions.c  | 208 +++++++++++++++++++++++--------------
 net/openvswitch/datapath.c |  56 +++++-----
 net/openvswitch/flow.h     |   3 +
 net/openvswitch/vport.c    |  27 +++--
 4 files changed, 185 insertions(+), 109 deletions(-)

-- 
2.42.0



* [PATCH 1/7] net: openvswitch: generalise the per-cpu flow key allocation stack
  2023-10-11  3:43 [PATCH 0/7] net: openvswitch: Reduce stack usage Nicholas Piggin
@ 2023-10-11  3:43 ` Nicholas Piggin
  2023-10-11  3:43 ` [PATCH 2/7] net: openvswitch: Use flow key allocator in ovs_vport_receive Nicholas Piggin
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 15+ messages in thread
From: Nicholas Piggin @ 2023-10-11  3:43 UTC (permalink / raw)
  To: netdev
  Cc: Nicholas Piggin, dev, Pravin B Shelar, Aaron Conole,
	Eelco Chaudron, Ilya Maximets, Flavio Leitner

Rather than an implicit key allocation index based on the recursion
level, make this a standalone stack (LIFO) allocator. This makes it
usable in other places without modifying the recursion accounting.
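
For example, nested users must release keys in strict reverse order
of allocation. An illustrative sketch (example_nested_use() is made
up for illustration, it is not part of the patch):

	static int example_nested_use(void)
	{
		struct sw_flow_key *outer, *inner;

		outer = ovs_flow_key_alloc();
		if (unlikely(!outer))
			return -ENOMEM;
		inner = ovs_flow_key_alloc();
		if (unlikely(!inner)) {
			ovs_flow_key_free(outer);
			return -ENOMEM;
		}
		/* ... use the keys ... */
		ovs_flow_key_free(inner);	/* most recent allocation first */
		ovs_flow_key_free(outer);
		return 0;
	}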

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 net/openvswitch/actions.c | 104 ++++++++++++++++++++++++++------------
 1 file changed, 72 insertions(+), 32 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index fd66014d8a76..bc7a8c2fff91 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -59,9 +59,10 @@ struct ovs_frag_data {
 
 static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
 
-#define DEFERRED_ACTION_FIFO_SIZE 10
 #define OVS_RECURSION_LIMIT 5
-#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
+#define NR_FLOW_KEYS 5
+#define DEFERRED_ACTION_FIFO_SIZE 10
+
 struct action_fifo {
 	int head;
 	int tail;
@@ -69,27 +70,64 @@ struct action_fifo {
 	struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
 };
 
-struct action_flow_keys {
-	struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
+struct flow_key_stack {
+	struct sw_flow_key key[NR_FLOW_KEYS];
 };
 
-static struct action_fifo __percpu *action_fifos;
-static struct action_flow_keys __percpu *flow_keys;
 static DEFINE_PER_CPU(int, exec_actions_level);
 
+static struct flow_key_stack __percpu *flow_key_stack;
+static DEFINE_PER_CPU(int, flow_keys_allocated);
+
+static struct action_fifo __percpu *action_fifos;
+
+/*
+ * ovs_flow_key_alloc provides a per-CPU sw_flow_key allocator. Keys must be
+ * freed in the reverse order that they were allocated in (i.e., a stack).
+ */
+static struct sw_flow_key *ovs_flow_key_alloc(void)
+{
+	struct flow_key_stack *keys = this_cpu_ptr(flow_key_stack);
+	int level = this_cpu_read(flow_keys_allocated);
+
+	if (unlikely(level >= NR_FLOW_KEYS))
+		return NULL;
+
+	__this_cpu_inc(flow_keys_allocated);
+
+	return &keys->key[level];
+}
+
+static void ovs_flow_key_free(struct sw_flow_key *key)
+{
+	struct flow_key_stack *keys = this_cpu_ptr(flow_key_stack);
+	int level = this_cpu_read(flow_keys_allocated);
+
+	/*
+	 * If these debug checks fire then keys will cease being freed
+	 * and the allocator will become exhausted and stop working. This
+	 * gives a graceful failure mode for programming errors.
+	 */
+
+	if (WARN_ON_ONCE(level == 0))
+		return; /* Underflow */
+
+	if (WARN_ON_ONCE(key != &keys->key[level - 1]))
+		return; /* Mismatched alloc/free order */
+
+	__this_cpu_dec(flow_keys_allocated);
+}
+
 /* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
  * space. Return NULL if out of key spaces.
  */
 static struct sw_flow_key *clone_key(const struct sw_flow_key *key_)
 {
-	struct action_flow_keys *keys = this_cpu_ptr(flow_keys);
-	int level = this_cpu_read(exec_actions_level);
-	struct sw_flow_key *key = NULL;
+	struct sw_flow_key *key;
 
-	if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
-		key = &keys->key[level - 1];
+	key = ovs_flow_key_alloc();
+	if (likely(key))
 		*key = *key_;
-	}
 
 	return key;
 }
@@ -1522,9 +1560,10 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb,
 {
 	struct deferred_action *da;
 	struct sw_flow_key *clone;
+	int err = 0;
 
 	skb = last ? skb : skb_clone(skb, GFP_ATOMIC);
-	if (!skb) {
+	if (unlikely(!skb)) {
 		/* Out of memory, skip this action.
 		 */
 		return 0;
@@ -1536,26 +1575,27 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb,
 	 * 'flow_keys'. If clone is successful, execute the actions
 	 * without deferring.
 	 */
-	clone = clone_flow_key ? clone_key(key) : key;
-	if (clone) {
-		int err = 0;
+	if (clone_flow_key) {
+		clone = clone_key(key);
+		if (unlikely(!clone))
+			goto defer;
+	} else {
+		clone = key;
+	}
 
-		if (actions) { /* Sample action */
-			if (clone_flow_key)
-				__this_cpu_inc(exec_actions_level);
+	if (actions) { /* Sample action */
+		err = do_execute_actions(dp, skb, clone, actions, len);
+	} else { /* Recirc action */
+		clone->recirc_id = recirc_id;
+		ovs_dp_process_packet(skb, clone);
+	}
 
-			err = do_execute_actions(dp, skb, clone,
-						 actions, len);
+	if (clone_flow_key)
+		ovs_flow_key_free(clone);
 
-			if (clone_flow_key)
-				__this_cpu_dec(exec_actions_level);
-		} else { /* Recirc action */
-			clone->recirc_id = recirc_id;
-			ovs_dp_process_packet(skb, clone);
-		}
-		return err;
-	}
+	return err;
 
+defer:
 	/* Out of 'flow_keys' space. Defer actions */
 	da = add_deferred_actions(skb, key, actions, len);
 	if (da) {
@@ -1642,8 +1682,8 @@ int action_fifos_init(void)
 	if (!action_fifos)
 		return -ENOMEM;
 
-	flow_keys = alloc_percpu(struct action_flow_keys);
-	if (!flow_keys) {
+	flow_key_stack = alloc_percpu(struct flow_key_stack);
+	if (!flow_key_stack) {
 		free_percpu(action_fifos);
 		return -ENOMEM;
 	}
@@ -1654,5 +1694,5 @@ int action_fifos_init(void)
 void action_fifos_exit(void)
 {
 	free_percpu(action_fifos);
-	free_percpu(flow_keys);
+	free_percpu(flow_key_stack);
 }
-- 
2.42.0



* [PATCH 2/7] net: openvswitch: Use flow key allocator in ovs_vport_receive
  2023-10-11  3:43 [PATCH 0/7] net: openvswitch: Reduce stack usage Nicholas Piggin
  2023-10-11  3:43 ` [PATCH 1/7] net: openvswitch: generalise the per-cpu flow key allocation stack Nicholas Piggin
@ 2023-10-11  3:43 ` Nicholas Piggin
  2023-10-11  3:43 ` [PATCH 3/7] openvswitch: reduce stack usage in do_execute_actions Nicholas Piggin
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 15+ messages in thread
From: Nicholas Piggin @ 2023-10-11  3:43 UTC (permalink / raw)
  To: netdev
  Cc: Nicholas Piggin, dev, Pravin B Shelar, Aaron Conole,
	Eelco Chaudron, Ilya Maximets, Flavio Leitner

Rather than allocate the flow key on stack in ovs_vport_receive,
use the per-cpu flow key allocator introduced with the previous
change.

The number of keys is increased because ovs_vport_receive can
be in the recursion path too: with OVS_RECURSION_LIMIT of 5 and up
to two keys potentially live per level (one in ovs_vport_receive
and one from clone_key), NR_FLOW_KEYS is doubled from 5 to 10.

This brings ovs_vport_receive stack usage from 544 bytes to 64
bytes on ppc64le.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 net/openvswitch/actions.c |  6 +++---
 net/openvswitch/flow.h    |  3 +++
 net/openvswitch/vport.c   | 27 ++++++++++++++++++++-------
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index bc7a8c2fff91..7a66574672d3 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -60,7 +60,7 @@ struct ovs_frag_data {
 static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
 
 #define OVS_RECURSION_LIMIT 5
-#define NR_FLOW_KEYS 5
+#define NR_FLOW_KEYS 10
 #define DEFERRED_ACTION_FIFO_SIZE 10
 
 struct action_fifo {
@@ -85,7 +85,7 @@ static struct action_fifo __percpu *action_fifos;
  * ovs_flow_key_alloc provides a per-CPU sw_flow_key allocator. Keys must be
  * freed in the reverse order that they were allocated in (i.e., a stack).
  */
-static struct sw_flow_key *ovs_flow_key_alloc(void)
+struct sw_flow_key *ovs_flow_key_alloc(void)
 {
 	struct flow_key_stack *keys = this_cpu_ptr(flow_key_stack);
 	int level = this_cpu_read(flow_keys_allocated);
@@ -98,7 +98,7 @@ static struct sw_flow_key *ovs_flow_key_alloc(void)
 	return &keys->key[level];
 }
 
-static void ovs_flow_key_free(struct sw_flow_key *key)
+void ovs_flow_key_free(struct sw_flow_key *key)
 {
 	struct flow_key_stack *keys = this_cpu_ptr(flow_key_stack);
 	int level = this_cpu_read(flow_keys_allocated);
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index b5711aff6e76..612459518af9 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -285,6 +285,9 @@ void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *,
 void ovs_flow_stats_clear(struct sw_flow *);
 u64 ovs_flow_used_time(unsigned long flow_jiffies);
 
+struct sw_flow_key *ovs_flow_key_alloc(void);
+void ovs_flow_key_free(struct sw_flow_key *key);
+
 int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key);
 int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key);
 int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 972ae01a70f7..80887a17e23b 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -494,7 +494,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport,
 int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
 		      const struct ip_tunnel_info *tun_info)
 {
-	struct sw_flow_key key;
+	struct sw_flow_key *key;
 	int error;
 
 	OVS_CB(skb)->input_vport = vport;
@@ -509,14 +509,27 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
 		tun_info = NULL;
 	}
 
-	/* Extract flow from 'skb' into 'key'. */
-	error = ovs_flow_key_extract(tun_info, skb, &key);
-	if (unlikely(error)) {
-		kfree_skb(skb);
-		return error;
+	key = ovs_flow_key_alloc();
+	if (unlikely(!key)) {
+		error = -ENOMEM;
+		goto err_skb;
 	}
-	ovs_dp_process_packet(skb, &key);
+
+	/* Extract flow from 'skb' into 'key'. */
+	error = ovs_flow_key_extract(tun_info, skb, key);
+	if (unlikely(error))
+		goto err_key;
+
+	ovs_dp_process_packet(skb, key);
+	ovs_flow_key_free(key);
+
 	return 0;
+
+err_key:
+	ovs_flow_key_free(key);
+err_skb:
+	kfree_skb(skb);
+	return error;
 }
 
 static int packet_length(const struct sk_buff *skb,
-- 
2.42.0



* [PATCH 3/7] openvswitch: reduce stack usage in do_execute_actions
  2023-10-11  3:43 [PATCH 0/7] net: openvswitch: Reduce stack usage Nicholas Piggin
  2023-10-11  3:43 ` [PATCH 1/7] net: openvswitch: generalise the per-cpu flow key allocation stack Nicholas Piggin
  2023-10-11  3:43 ` [PATCH 2/7] net: openvswitch: Use flow key allocator in ovs_vport_receive Nicholas Piggin
@ 2023-10-11  3:43 ` Nicholas Piggin
  2023-10-11  3:43 ` [PATCH 4/7] net: openvswitch: Reduce push_nsh stack usage Nicholas Piggin
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 15+ messages in thread
From: Nicholas Piggin @ 2023-10-11  3:43 UTC (permalink / raw)
  To: netdev
  Cc: Nicholas Piggin, dev, Pravin B Shelar, Aaron Conole,
	Eelco Chaudron, Ilya Maximets, Flavio Leitner, Ilya Maximets

From: Ilya Maximets <i.maximets@ovn.org>

The do_execute_actions() function can be called recursively multiple
times while executing actions that require pipeline forking or
recirculations.  It may also be re-entered multiple times if the packet
leaves the openvswitch module and re-enters it through a different port.

Currently, there is a 256-byte array allocated on stack in this
function that is supposed to hold NSH header.  Compilers tend to
pre-allocate that space right at the beginning of the function:

     a88:       48 81 ec b0 01 00 00    sub    $0x1b0,%rsp

NSH is not a very common protocol, but the space is allocated on every
recursive call or re-entry multiplying the wasted stack space.

Move the stack allocation to push_nsh() function that is only used
if NSH actions are actually present.  push_nsh() is also a simple
function without a possibility for re-entry, so the stack is returned
right away.

With this change the preallocated space is reduced by 256 B per call:

     b18:       48 81 ec b0 00 00 00    sub    $0xb0,%rsp

Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
---
 net/openvswitch/actions.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 7a66574672d3..be15ef693284 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -349,11 +349,18 @@ static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
 	return 0;
 }
 
-static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key,
-		    const struct nshhdr *nh)
+static noinline_for_stack int push_nsh(struct sk_buff *skb,
+				       struct sw_flow_key *key,
+				       const struct nlattr *a)
 {
+	u8 buffer[NSH_HDR_MAX_LEN];
+	struct nshhdr *nh = (struct nshhdr *)buffer;
 	int err;
 
+	err = nsh_hdr_from_nlattr(a, nh, NSH_HDR_MAX_LEN);
+	if (err)
+		return err;
+
 	err = nsh_push(skb, nh);
 	if (err)
 		return err;
@@ -1477,17 +1484,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 			err = pop_eth(skb, key);
 			break;
 
-		case OVS_ACTION_ATTR_PUSH_NSH: {
-			u8 buffer[NSH_HDR_MAX_LEN];
-			struct nshhdr *nh = (struct nshhdr *)buffer;
-
-			err = nsh_hdr_from_nlattr(nla_data(a), nh,
-						  NSH_HDR_MAX_LEN);
-			if (unlikely(err))
-				break;
-			err = push_nsh(skb, key, nh);
+		case OVS_ACTION_ATTR_PUSH_NSH:
+			err = push_nsh(skb, key, nla_data(a));
 			break;
-		}
 
 		case OVS_ACTION_ATTR_POP_NSH:
 			err = pop_nsh(skb, key);
-- 
2.42.0



* [PATCH 4/7] net: openvswitch: Reduce push_nsh stack usage
  2023-10-11  3:43 [PATCH 0/7] net: openvswitch: Reduce stack usage Nicholas Piggin
                   ` (2 preceding siblings ...)
  2023-10-11  3:43 ` [PATCH 3/7] openvswitch: reduce stack usage in do_execute_actions Nicholas Piggin
@ 2023-10-11  3:43 ` Nicholas Piggin
  2023-10-11  3:43 ` [PATCH 5/7] net: openvswitch: uninline action execution Nicholas Piggin
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 15+ messages in thread
From: Nicholas Piggin @ 2023-10-11  3:43 UTC (permalink / raw)
  To: netdev
  Cc: Nicholas Piggin, dev, Pravin B Shelar, Aaron Conole,
	Eelco Chaudron, Ilya Maximets, Flavio Leitner

Use percpu data to move the large temporary buffer off the push_nsh
stack. This reduces stack consumption from 336 bytes to 64 bytes.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 net/openvswitch/actions.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index be15ef693284..fa53e22f3ebe 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -349,12 +349,18 @@ static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
 	return 0;
 }
 
+struct tmp_nsh_hdr {
+	u8 data[NSH_HDR_MAX_LEN];
+};
+
+static DEFINE_PER_CPU(struct tmp_nsh_hdr, tmp_nsh_hdr);
+
 static noinline_for_stack int push_nsh(struct sk_buff *skb,
 				       struct sw_flow_key *key,
 				       const struct nlattr *a)
 {
-	u8 buffer[NSH_HDR_MAX_LEN];
-	struct nshhdr *nh = (struct nshhdr *)buffer;
+	struct tmp_nsh_hdr *hdr = this_cpu_ptr(&tmp_nsh_hdr);
+	struct nshhdr *nh = (struct nshhdr *)&hdr->data[0];
 	int err;
 
 	err = nsh_hdr_from_nlattr(a, nh, NSH_HDR_MAX_LEN);
-- 
2.42.0



* [PATCH 5/7] net: openvswitch: uninline action execution
  2023-10-11  3:43 [PATCH 0/7] net: openvswitch: Reduce stack usage Nicholas Piggin
                   ` (3 preceding siblings ...)
  2023-10-11  3:43 ` [PATCH 4/7] net: openvswitch: Reduce push_nsh stack usage Nicholas Piggin
@ 2023-10-11  3:43 ` Nicholas Piggin
  2023-10-11  3:43 ` [PATCH 6/7] net: openvswitch: uninline ovs_fragment to control stack usage Nicholas Piggin
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 15+ messages in thread
From: Nicholas Piggin @ 2023-10-11  3:43 UTC (permalink / raw)
  To: netdev
  Cc: Nicholas Piggin, dev, Pravin B Shelar, Aaron Conole,
	Eelco Chaudron, Ilya Maximets, Flavio Leitner

A function tends to use as much stack as the maximum of any of its
control flow paths. Compilers can "shrink wrap" to special-case stack
allocation, but that only works well for exclusive paths. The switch
statement in the loop in do_execute_actions uses as much stack as the
maximum of its cases, so inlining large actions increases overall
stack usage. This is particularly bad because the actions that cause
recursion are not the largest stack users.

Uninline action execution functions, which reduces the stack usage of
do_execute_actions from 288 bytes to 112 bytes.
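
To illustrate with a reduced sketch (the names below are made up,
this is not code from the patch): if big_action() were inlined into
do_actions(), its 256-byte buffer would be allocated in
do_actions()'s prologue, so even the recursive case that never
touches the buffer would pay for it on every level of recursion.

	static noinline_for_stack int big_action(struct sk_buff *skb)
	{
		u8 buf[256];	/* lives only in big_action()'s frame */

		return do_something(skb, buf, sizeof(buf));
	}

	static int do_actions(struct sk_buff *skb, int op)
	{
		switch (op) {
		case ACTION_BIG:	/* large, but not recursive */
			return big_action(skb);
		case ACTION_RECURSE:	/* deep path stays lean */
			return do_actions(skb, next_op(skb));
		}
		return 0;
	}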

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 net/openvswitch/actions.c | 69 +++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index fa53e22f3ebe..87ec668d5556 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -964,8 +964,9 @@ static void ovs_fragment(struct net *net, struct vport *vport,
 	ovs_kfree_skb_reason(skb, reason);
 }
 
-static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
-		      struct sw_flow_key *key)
+static noinline_for_stack void do_output(struct datapath *dp,
+					 struct sk_buff *skb, int out_port,
+					 struct sw_flow_key *key)
 {
 	struct vport *vport = ovs_vport_rcu(dp, out_port);
 
@@ -995,10 +996,11 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
 	}
 }
 
-static int output_userspace(struct datapath *dp, struct sk_buff *skb,
-			    struct sw_flow_key *key, const struct nlattr *attr,
-			    const struct nlattr *actions, int actions_len,
-			    uint32_t cutlen)
+static noinline_for_stack
+int output_userspace(struct datapath *dp, struct sk_buff *skb,
+		     struct sw_flow_key *key, const struct nlattr *attr,
+		     const struct nlattr *actions, int actions_len,
+		     uint32_t cutlen)
 {
 	struct dp_upcall_info upcall;
 	const struct nlattr *a;
@@ -1073,9 +1075,9 @@ static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb,
  * Otherwise, sample() should keep 'skb' intact regardless what
  * actions are executed within sample().
  */
-static int sample(struct datapath *dp, struct sk_buff *skb,
-		  struct sw_flow_key *key, const struct nlattr *attr,
-		  bool last)
+static noinline_for_stack int sample(struct datapath *dp, struct sk_buff *skb,
+				     struct sw_flow_key *key,
+				     const struct nlattr *attr, bool last)
 {
 	struct nlattr *actions;
 	struct nlattr *sample_arg;
@@ -1104,9 +1106,10 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
  * Otherwise, clone() should keep 'skb' intact regardless what
  * actions are executed within clone().
  */
-static int clone(struct datapath *dp, struct sk_buff *skb,
-		 struct sw_flow_key *key, const struct nlattr *attr,
-		 bool last)
+static noinline_for_stack int clone(struct datapath *dp,
+				    struct sk_buff *skb,
+				    struct sw_flow_key *key,
+				    const struct nlattr *attr, bool last)
 {
 	struct nlattr *actions;
 	struct nlattr *clone_arg;
@@ -1122,8 +1125,9 @@ static int clone(struct datapath *dp, struct sk_buff *skb,
 			     !dont_clone_flow_key);
 }
 
-static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
-			 const struct nlattr *attr)
+static noinline_for_stack void execute_hash(struct sk_buff *skb,
+					    struct sw_flow_key *key,
+					    const struct nlattr *attr)
 {
 	struct ovs_action_hash *hash_act = nla_data(attr);
 	u32 hash = 0;
@@ -1145,9 +1149,9 @@ static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
 	key->ovs_flow_hash = hash;
 }
 
-static int execute_set_action(struct sk_buff *skb,
-			      struct sw_flow_key *flow_key,
-			      const struct nlattr *a)
+static noinline_for_stack int execute_set_action(struct sk_buff *skb,
+						 struct sw_flow_key *flow_key,
+						 const struct nlattr *a)
 {
 	/* Only tunnel set execution is supported without a mask. */
 	if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) {
@@ -1165,9 +1169,9 @@ static int execute_set_action(struct sk_buff *skb,
 /* Mask is at the midpoint of the data. */
 #define get_mask(a, type) ((const type)nla_data(a) + 1)
 
-static int execute_masked_set_action(struct sk_buff *skb,
-				     struct sw_flow_key *flow_key,
-				     const struct nlattr *a)
+static noinline_for_stack
+int execute_masked_set_action(struct sk_buff *skb, struct sw_flow_key *flow_key,
+			      const struct nlattr *a)
 {
 	int err = 0;
 
@@ -1240,9 +1244,9 @@ static int execute_masked_set_action(struct sk_buff *skb,
 	return err;
 }
 
-static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
-			  struct sw_flow_key *key,
-			  const struct nlattr *a, bool last)
+static noinline_for_stack
+int execute_recirc(struct datapath *dp, struct sk_buff *skb,
+		   struct sw_flow_key *key, const struct nlattr *a, bool last)
 {
 	u32 recirc_id;
 
@@ -1259,9 +1263,10 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
 	return clone_execute(dp, skb, key, recirc_id, NULL, 0, last, true);
 }
 
-static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb,
-				 struct sw_flow_key *key,
-				 const struct nlattr *attr, bool last)
+static noinline_for_stack
+int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb,
+			  struct sw_flow_key *key, const struct nlattr *attr,
+			  bool last)
 {
 	struct ovs_skb_cb *ovs_cb = OVS_CB(skb);
 	const struct nlattr *actions, *cpl_arg;
@@ -1298,7 +1303,8 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb,
 			     nla_len(actions), last, clone_flow_key);
 }
 
-static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key)
+static noinline_for_stack int execute_dec_ttl(struct sk_buff *skb,
+					      struct sw_flow_key *key)
 {
 	int err;
 
@@ -1558,10 +1564,11 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
  * The execution may be deferred in case the actions can not be executed
  * immediately.
  */
-static int clone_execute(struct datapath *dp, struct sk_buff *skb,
-			 struct sw_flow_key *key, u32 recirc_id,
-			 const struct nlattr *actions, int len,
-			 bool last, bool clone_flow_key)
+static noinline_for_stack
+int clone_execute(struct datapath *dp, struct sk_buff *skb,
+		  struct sw_flow_key *key, u32 recirc_id,
+		  const struct nlattr *actions, int len, bool last,
+		  bool clone_flow_key)
 {
 	struct deferred_action *da;
 	struct sw_flow_key *clone;
-- 
2.42.0



* [PATCH 6/7] net: openvswitch: uninline ovs_fragment to control stack usage
  2023-10-11  3:43 [PATCH 0/7] net: openvswitch: Reduce stack usage Nicholas Piggin
                   ` (4 preceding siblings ...)
  2023-10-11  3:43 ` [PATCH 5/7] net: openvswitch: uninline action execution Nicholas Piggin
@ 2023-10-11  3:43 ` Nicholas Piggin
  2023-10-11  3:43 ` [PATCH 7/7] net: openvswitch: Reduce stack usage in ovs_dp_process_packet Nicholas Piggin
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 15+ messages in thread
From: Nicholas Piggin @ 2023-10-11  3:43 UTC (permalink / raw)
  To: netdev
  Cc: Nicholas Piggin, dev, Pravin B Shelar, Aaron Conole,
	Eelco Chaudron, Ilya Maximets, Flavio Leitner

ovs_fragment uses a lot of stack, 400 bytes. It is a leaf function
but its caller do_output is involved in openvswitch recursion.
GCC 13.2 for powerpc64le is not inlining it, but it only has a single
call site, so it is liable to be inlined.

Mark it noinline_for_stack, to ensure it doesn't bloat stack use in
the recursive path.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 net/openvswitch/actions.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 87ec668d5556..ef3a59012d26 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -900,9 +900,9 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb,
 	skb_pull(skb, hlen);
 }
 
-static void ovs_fragment(struct net *net, struct vport *vport,
-			 struct sk_buff *skb, u16 mru,
-			 struct sw_flow_key *key)
+static noinline_for_stack
+void ovs_fragment(struct net *net, struct vport *vport, struct sk_buff *skb,
+		  u16 mru, struct sw_flow_key *key)
 {
 	enum ovs_drop_reason reason;
 	u16 orig_network_offset = 0;
-- 
2.42.0



* [PATCH 7/7] net: openvswitch: Reduce stack usage in ovs_dp_process_packet
  2023-10-11  3:43 [PATCH 0/7] net: openvswitch: Reduce stack usage Nicholas Piggin
                   ` (5 preceding siblings ...)
  2023-10-11  3:43 ` [PATCH 6/7] net: openvswitch: uninline ovs_fragment to control stack usage Nicholas Piggin
@ 2023-10-11  3:43 ` Nicholas Piggin
  2023-10-11 12:22 ` [PATCH 0/7] net: openvswitch: Reduce stack usage Ilya Maximets
  2023-10-11 13:23 ` Aaron Conole
  8 siblings, 0 replies; 15+ messages in thread
From: Nicholas Piggin @ 2023-10-11  3:43 UTC (permalink / raw)
  To: netdev
  Cc: Nicholas Piggin, dev, Pravin B Shelar, Aaron Conole,
	Eelco Chaudron, Ilya Maximets, Flavio Leitner

The upcall in ovs_dp_process_packet uses some stack and is not
involved in the recursive call. Move it out of line, reducing the
stack overhead of ovs_dp_process_packet from 144 to 96 bytes.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 net/openvswitch/datapath.c | 56 ++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 11c69415c605..fdc24b1e9bbc 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -242,6 +242,37 @@ void ovs_dp_detach_port(struct vport *p)
 	ovs_vport_del(p);
 }
 
+static noinline_for_stack
+void do_packet_upcall(struct sk_buff *skb, struct sw_flow_key *key,
+		      const struct vport *p, struct datapath *dp)
+{
+	struct dp_upcall_info upcall;
+	int error;
+
+	memset(&upcall, 0, sizeof(upcall));
+	upcall.cmd = OVS_PACKET_CMD_MISS;
+
+	if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
+		upcall.portid =
+		    ovs_dp_get_upcall_portid(dp, smp_processor_id());
+	else
+		upcall.portid = ovs_vport_find_upcall_portid(p, skb);
+
+	upcall.mru = OVS_CB(skb)->mru;
+	error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
+	switch (error) {
+	case 0:
+	case -EAGAIN:
+	case -ERESTARTSYS:
+	case -EINTR:
+		consume_skb(skb);
+		break;
+	default:
+		kfree_skb(skb);
+		break;
+	}
+}
+
 /* Must be called with rcu_read_lock. */
 void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
 {
@@ -261,30 +292,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
 	flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),
 					 &n_mask_hit, &n_cache_hit);
 	if (unlikely(!flow)) {
-		struct dp_upcall_info upcall;
-
-		memset(&upcall, 0, sizeof(upcall));
-		upcall.cmd = OVS_PACKET_CMD_MISS;
-
-		if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
-			upcall.portid =
-			    ovs_dp_get_upcall_portid(dp, smp_processor_id());
-		else
-			upcall.portid = ovs_vport_find_upcall_portid(p, skb);
-
-		upcall.mru = OVS_CB(skb)->mru;
-		error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
-		switch (error) {
-		case 0:
-		case -EAGAIN:
-		case -ERESTARTSYS:
-		case -EINTR:
-			consume_skb(skb);
-			break;
-		default:
-			kfree_skb(skb);
-			break;
-		}
+		do_packet_upcall(skb, key, p, dp);
 		stats_counter = &stats->n_missed;
 		goto out;
 	}
-- 
2.42.0



* Re: [PATCH 0/7] net: openvswitch: Reduce stack usage
  2023-10-11  3:43 [PATCH 0/7] net: openvswitch: Reduce stack usage Nicholas Piggin
                   ` (6 preceding siblings ...)
  2023-10-11  3:43 ` [PATCH 7/7] net: openvswitch: Reduce stack usage in ovs_dp_process_packet Nicholas Piggin
@ 2023-10-11 12:22 ` Ilya Maximets
  2023-10-12  0:08   ` Nicholas Piggin
  2023-10-11 13:23 ` Aaron Conole
  8 siblings, 1 reply; 15+ messages in thread
From: Ilya Maximets @ 2023-10-11 12:22 UTC (permalink / raw)
  To: Nicholas Piggin, netdev
  Cc: dev, Pravin B Shelar, Aaron Conole, Eelco Chaudron,
	Flavio Leitner, i.maximets, Simon Horman

On 10/11/23 05:43, Nicholas Piggin wrote:
> Hi,
> 
> I'll post this out again to keep discussion going. Thanks all for the
> testing and comments so far.

Hi, Nicholas.  This patch set still needs performance evaluation
since it touches very performance-sensitive parts of the stack.
Did you run any performance tests with this version?

IIRC, Aaron was still working on testing for the RFC.  I think,
we should wait for his feedback before re-spinning a new version.

> 
> Changes since the RFC
> https://lore.kernel.org/netdev/20230927001308.749910-1-npiggin@gmail.com/
> 
> - Replace slab allocations for flow keys by expanding the use
>   of the per-CPU key allocator to ovs_vport_receive.

While this is likely to work faster than a dynamic memory allocation,
it is unlikely to be on par with a stack allocation.  Performance
evaluation is necessary.

> 
> - Drop patch 1 in favour of Ilya's, since they did the same thing
>   (Ilya's is added as patch 3).

The patch is already in net-next, so should not be included in this set.
For the next version (please hold) please rebase the set on
net-next/main and add net-next to the subject prefix of the patches.
They are not simple bug fixes, so should go through net-next, IMO.

You may also see in netdev+bpf patchwork that CI failed trying to guess
on which tree the patches should be applied and no tests were executed.

> 
> - Change push_nsh stack reduction from slab allocation to per-cpu
>   buffer.

I still think this change is not needed and will only consume a lot
of per-CPU memory space for no reason, as NSH is not a frequently
used thing in OVS and the function is not on the recursive path and
explicitly not inlined already.

Best regards, Ilya Maximets.

P.S.  Please use my ovn.org email instead.

> 
> - Drop the ovs_fragment stack usage reduction for now since it used
>   slab and was a bit more complicated.
> 
> I posted an initial version of the per-cpu flow allocator patch in
> the RFC thread. Since then I cleaned up some debug code and increased
> the allocator size to accommodate the additional user of it.
> 
> Thanks,
> Nick
> 
> Ilya Maximets (1):
>   openvswitch: reduce stack usage in do_execute_actions
> 
> Nicholas Piggin (6):
>   net: openvswitch: generalise the per-cpu flow key allocation stack
>   net: openvswitch: Use flow key allocator in ovs_vport_receive
>   net: openvswitch: Reduce push_nsh stack usage
>   net: openvswitch: uninline action execution
>   net: openvswitch: uninline ovs_fragment to control stack usage
>   net: openvswitch: Reduce stack usage in ovs_dp_process_packet
> 
>  net/openvswitch/actions.c  | 208 +++++++++++++++++++++++--------------
>  net/openvswitch/datapath.c |  56 +++++-----
>  net/openvswitch/flow.h     |   3 +
>  net/openvswitch/vport.c    |  27 +++--
>  4 files changed, 185 insertions(+), 109 deletions(-)
> 



* Re: [PATCH 0/7] net: openvswitch: Reduce stack usage
  2023-10-11  3:43 [PATCH 0/7] net: openvswitch: Reduce stack usage Nicholas Piggin
                   ` (7 preceding siblings ...)
  2023-10-11 12:22 ` [PATCH 0/7] net: openvswitch: Reduce stack usage Ilya Maximets
@ 2023-10-11 13:23 ` Aaron Conole
  2023-10-12  1:19   ` Nicholas Piggin
  8 siblings, 1 reply; 15+ messages in thread
From: Aaron Conole @ 2023-10-11 13:23 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: netdev, dev, Pravin B Shelar, Eelco Chaudron, Ilya Maximets,
	Flavio Leitner, Paolo Abeni, Jakub Kicinski, David S. Miller,
	Eric Dumazet

Nicholas Piggin <npiggin@gmail.com> writes:

> Hi,
>
> I'll post this out again to keep discussion going. Thanks all for the
> testing and comments so far.

Thanks for the update - did you mean for this to be tagged RFC as well?

I don't see any performance data with the deployments on x86_64 and
ppc64le that cause these stack overflows.  Are you able to provide the
impact on ppc64le and x86_64?

I guess the change probably should be tagged as -next since it doesn't
really have a specific set of commits it is "fixing."  It's really like
a major change and shouldn't really go through stable trees, but I'll
let the maintainers tell me off if I got it wrong.

> Changes since the RFC
> https://lore.kernel.org/netdev/20230927001308.749910-1-npiggin@gmail.com/
>
> - Replace slab allocations for flow keys by expanding the use
>   of the per-CPU key allocator to ovs_vport_receive.
>
> - Drop patch 1 in favour of Ilya's, since they did the same thing
>   (Ilya's is added as patch 3).
>
> - Change push_nsh stack reduction from slab allocation to per-cpu
>   buffer.
>
> - Drop the ovs_fragment stack usage reduction for now since it used
>   slab and was a bit more complicated.
>
> I posted an initial version of the per-cpu flow allocator patch in
> the RFC thread. Since then I cleaned up some debug code and increased
> the allocator size to accommodate the additional user of it.
>
> Thanks,
> Nick
>
> Ilya Maximets (1):
>   openvswitch: reduce stack usage in do_execute_actions
>
> Nicholas Piggin (6):
>   net: openvswitch: generalise the per-cpu flow key allocation stack
>   net: openvswitch: Use flow key allocator in ovs_vport_receive
>   net: openvswitch: Reduce push_nsh stack usage
>   net: openvswitch: uninline action execution
>   net: openvswitch: uninline ovs_fragment to control stack usage
>   net: openvswitch: Reduce stack usage in ovs_dp_process_packet
>
>  net/openvswitch/actions.c  | 208 +++++++++++++++++++++++--------------
>  net/openvswitch/datapath.c |  56 +++++-----
>  net/openvswitch/flow.h     |   3 +
>  net/openvswitch/vport.c    |  27 +++--
>  4 files changed, 185 insertions(+), 109 deletions(-)



* Re: [PATCH 0/7] net: openvswitch: Reduce stack usage
  2023-10-11 12:22 ` [PATCH 0/7] net: openvswitch: Reduce stack usage Ilya Maximets
@ 2023-10-12  0:08   ` Nicholas Piggin
  0 siblings, 0 replies; 15+ messages in thread
From: Nicholas Piggin @ 2023-10-12  0:08 UTC (permalink / raw)
  To: Ilya Maximets, netdev
  Cc: dev, Pravin B Shelar, Aaron Conole, Eelco Chaudron,
	Flavio Leitner, Simon Horman

On Wed Oct 11, 2023 at 10:22 PM AEST, Ilya Maximets wrote:
> On 10/11/23 05:43, Nicholas Piggin wrote:
> > Hi,
> > 
> > I'll post this out again to keep discussion going. Thanks all for the
> > testing and comments so far.
>
> Hi, Nicholas.  This patch set still needs performance evaluation
> since it touches very performance-sensitive parts of the stack.
> Did you run any performance tests with this version?

I did; the recipe in the previous thread was in the noise on my
system.

> IIRC, Aaron was still working on testing for the RFC.  I think,
> we should wait for his feedback before re-spinning a new version.

The RFC was a couple of % slower on the same microbenchmark. I
gave him an updated git tree reworked to avoid the slab allocs
he was looking at, but I thought I'd post it out for others to
see.

> > 
> > Changes since the RFC
> > https://lore.kernel.org/netdev/20230927001308.749910-1-npiggin@gmail.com/
> > 
> > - Replace slab allocations for flow keys with expanding the use
> >   of the per-CPU key allocator to ovs_vport_receive.
>
> While this is likely to work faster than a dynamic memory allocation,
> it is unlikely to be on par with a stack allocation.  Performance
> evaluation is necessary.

Sure.

> > 
> > - Drop patch 1 with Ilya's since they did the same thing (that is
> >   added at patch 3).
>
> The patch is already in net-next, so should not be included in this set.
> For the next version (please hold) please rebase the set on
> net-next/main and add net-next to the subject prefix of the patches.
> They are not simple bug fixes, so should go through net-next, IMO.
>
> You may also see in netdev+bpf patchwork that CI failed trying to guess
> on which tree the patches should be applied and no tests were executed.

I was thinking you might take them through your ovs merge process,
but I'm happy to go whichever way you like. And yes, they're not
intended for merge now; I did intend to add an RFC v2 prefix.

>
> > 
> > - Change push_nsh stack reduction from slab allocation to per-cpu
> >   buffer.
>
> I still think this change is not needed and will only consume a lot
> of per-CPU memory space for no reason, as NSH is not a frequently
> used thing in OVS and the function is not on the recursive path and
> explicitly not inlined already.

If it's infrequent and you're concerned with per-CPU memory usage, we
could go back to using slab.

It's not in the recursive path but it can be a leaf called from the
recursive path. It could still be the function that uses the most stack
in any given scenario, no?

Thanks,
Nick


* Re: [PATCH 0/7] net: openvswitch: Reduce stack usage
  2023-10-11 13:23 ` Aaron Conole
@ 2023-10-12  1:19   ` Nicholas Piggin
  2023-10-13  8:27     ` David Laight
  2023-10-20 17:04     ` Aaron Conole
  0 siblings, 2 replies; 15+ messages in thread
From: Nicholas Piggin @ 2023-10-12  1:19 UTC (permalink / raw)
  To: Aaron Conole
  Cc: netdev, dev, Pravin B Shelar, Eelco Chaudron, Ilya Maximets,
	Flavio Leitner, Paolo Abeni, Jakub Kicinski, David S. Miller,
	Eric Dumazet

On Wed Oct 11, 2023 at 11:23 PM AEST, Aaron Conole wrote:
> Nicholas Piggin <npiggin@gmail.com> writes:
>
> > Hi,
> >
> > I'll post this out again to keep discussion going. Thanks all for the
> > testing and comments so far.
>
> Thanks for the update - did you mean for this to be tagged RFC as well?

Yeah, it wasn't intended for merge with no RB or tests of course.
I intended to tag it RFC v2.

>
> I don't see any performance data with the deployments on x86_64 and
> ppc64le that cause these stack overflows.  Are you able to provide the
> impact on ppc64le and x86_64?

Don't think it'll be easy, but they are not pushing such rates
so it wouldn't say much.  If you want to show the worst case, those
tput and latency microbenchmarks should do it.

It's the same tradeoff and reasons the per-cpu key allocator was
added in the first place, presumably. Probably more expensive than
stack, but similar order of magnitude O(cycles) vs slab which is
probably O(100s cycles).

> I guess the change probably should be tagged as -next since it doesn't
> really have a specific set of commits it is "fixing."  It's really like
> a major change and shouldn't really go through stable trees, but I'll
> let the maintainers tell me off if I got it wrong.

It should go upstream first if anything. I thought it was relatively
simple and elegant to reuse the per-cpu key allocator though :(

It is a kernel crash, so we need something for stable. But in a case
like this there's not one single problem. Linux kernel stack use has
always been pretty dumb - "don't use too much", for some values of
too much, and just cross fingers config and compiler and workload
doesn't hit some overflow case.

And powerpc has always used more stack than x86, so probably it should stay
one power-of-two larger to be safe. And that may be the best fix for
-stable.

But also, ovs uses too much stack. Look at the stack sizes in the first
RFC patch, and ovs takes the 5 largest. That's because it has always
been the practice to not put large variables on stack, and when you're
introducing significant recursion that puts extra onus on you to be
lean. Even if it costs a percent. There are probably lots of places in
the kernel that could get a few cycles by sticking large structures on
stack, but unfortunately we can't all do that.

Thanks,
Nick


* RE: [PATCH 0/7] net: openvswitch: Reduce stack usage
  2023-10-12  1:19   ` Nicholas Piggin
@ 2023-10-13  8:27     ` David Laight
  2023-10-20 17:04     ` Aaron Conole
  1 sibling, 0 replies; 15+ messages in thread
From: David Laight @ 2023-10-13  8:27 UTC (permalink / raw)
  To: 'Nicholas Piggin', Aaron Conole
  Cc: netdev, dev, Pravin B Shelar, Eelco Chaudron, Ilya Maximets,
	Flavio Leitner, Paolo Abeni, Jakub Kicinski, David S. Miller,
	Eric Dumazet

From: Nicholas Piggin
> Sent: 12 October 2023 02:19
...
> It is a kernel crash, so we need something for stable. But in a case
> like this there's not one single problem. Linux kernel stack use has
> always been pretty dumb - "don't use too much", for some values of
> too much, and just cross fingers config and compiler and workload
> doesn't hit some overflow case.

I think it ought to be possible to use a FineIBT (I think that
is what it is called) compile to get indirect calls grouped
and change objtool to output the stack offset of every call.
It is then a simple (for some definition of simple) process
to work out the static maximum stack usage.

Any recursion causes grief (the stack depth for each
loop can be reported).

Also I suspect the compiler will need an enhancement to
add a constant into the hash to disambiguate indirect
calls with the same argument list.

My suspicion (from doing this exercise on an embedded system
a long time ago) is that the stack will be blown somewhere
inside printk() in some error path.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


* Re: [PATCH 0/7] net: openvswitch: Reduce stack usage
  2023-10-12  1:19   ` Nicholas Piggin
  2023-10-13  8:27     ` David Laight
@ 2023-10-20 17:04     ` Aaron Conole
  2023-10-25  4:06       ` Nicholas Piggin
  1 sibling, 1 reply; 15+ messages in thread
From: Aaron Conole @ 2023-10-20 17:04 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: netdev, dev, Pravin B Shelar, Eelco Chaudron, Ilya Maximets,
	Flavio Leitner, Paolo Abeni, Jakub Kicinski, David S. Miller,
	Eric Dumazet

"Nicholas Piggin" <npiggin@gmail.com> writes:

> On Wed Oct 11, 2023 at 11:23 PM AEST, Aaron Conole wrote:
>> Nicholas Piggin <npiggin@gmail.com> writes:
>>
>> > Hi,
>> >
>> > I'll post this out again to keep discussion going. Thanks all for the
>> > testing and comments so far.
>>
>> Thanks for the update - did you mean for this to be tagged RFC as well?
>
> Yeah, it wasn't intended for merge with no RB or tests of course.
> I intended to tag it RFC v2.

I only did a basic test with this because of some other stuff, and I
only tested 1, 2, and 3.  I didn't see any real performance changes, but
that is only with a simple port-port test.  I plan to do some additional
testing with some recursive calls.  That will also help to understand
the limits a bit.

That said, I'm very nervous about the key allocator, especially if it is
possible that it runs out.  We probably will need the limit to be
bigger, but I want to get a worst-case flow from the OVN side to understand.

>>
>> I don't see any performance data with the deployments on x86_64 and
>> ppc64le that cause these stack overflows.  Are you able to provide the
>> impact on ppc64le and x86_64?
>
> Don't think it'll be easy, but they are not pushing such rates
> so it wouldn't say much.  If you want to show the worst case, those
> tput and latency microbenchmarks should do it.
>
> It's the same tradeoff and reasons the per-cpu key allocator was
> added in the first place, presumably. Probably more expensive than
> stack, but similar order of magnitude O(cycles) vs slab which is
> probably O(100s cycles).
>
>> I guess the change probably should be tagged as -next since it doesn't
>> really have a specific set of commits it is "fixing."  It's really like
>> a major change and shouldn't really go through stable trees, but I'll
>> let the maintainers tell me off if I got it wrong.
>
> It should go upstream first if anything. I thought it was relatively
> simple and elegant to reuse the per-cpu key allocator though :(
>
> It is a kernel crash, so we need something for stable. But in a case
> like this there's not one single problem. Linux kernel stack use has
> always been pretty dumb - "don't use too much", for some values of
> too much, and just cross fingers config and compiler and workload
> doesn't hit some overflow case.
>
> And powerpc has always used more stack than x86, so probably it should stay
> one power-of-two larger to be safe. And that may be the best fix for
> -stable.

Given the reply from David (with msg-id:
<ff6cd12e28894f158d9a6c9f7157487f@AcuMS.aculab.com>), are there other
things we can look at with respect to the compiler as well?

> But also, ovs uses too much stack. Look at the stack sizes in the first
> RFC patch, and ovs takes the 5 largest. That's because it has always
> been the practice to not put large variables on stack, and when you're
> introducing significant recursion that puts extra onus on you to be
> lean. Even if it costs a percent. There are probably lots of places in
> the kernel that could get a few cycles by sticking large structures on
> stack, but unfortunately we can't all do that.

Well, OVS operated this way for at least 6 years, so it isn't a recent
thing.  But we should look at it.

I also wonder if we need to recurse in the internal devices, or if we
shouldn't just push the skb into the packet queue.  That will cut out
1/3 of the stack frame that you reported originally, and then when doing
the xmit, will cut out 2/3rds. I have no idea what the performance
impact there might be.  Maybe it looks more like a latency hit
rather than a throughput hit, but just speculating.
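
Something like the following, purely as a sketch of the idea (the
queue and worker names are made up, this is not the actual
internal-dev code):

	static struct sk_buff_head pkt_queue;		/* assumed: initialised elsewhere */
	static struct work_struct pkt_queue_work;	/* assumed: drains pkt_queue */

	static netdev_tx_t internal_dev_xmit_sketch(struct sk_buff *skb)
	{
		/* Queue the skb instead of calling straight back into
		 * the receive path, breaking the recursion; the packet
		 * is then picked up again from a shallow stack.
		 */
		skb_queue_tail(&pkt_queue, skb);
		schedule_work(&pkt_queue_work);
		return NETDEV_TX_OK;
	}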

> Thanks,
> Nick



* Re: [PATCH 0/7] net: openvswitch: Reduce stack usage
  2023-10-20 17:04     ` Aaron Conole
@ 2023-10-25  4:06       ` Nicholas Piggin
  0 siblings, 0 replies; 15+ messages in thread
From: Nicholas Piggin @ 2023-10-25  4:06 UTC (permalink / raw)
  To: Aaron Conole
  Cc: netdev, dev, Pravin B Shelar, Eelco Chaudron, Ilya Maximets,
	Flavio Leitner, Paolo Abeni, Jakub Kicinski, David S. Miller,
	Eric Dumazet

On Sat Oct 21, 2023 at 3:04 AM AEST, Aaron Conole wrote:
> "Nicholas Piggin" <npiggin@gmail.com> writes:
>
> > On Wed Oct 11, 2023 at 11:23 PM AEST, Aaron Conole wrote:
> >> Nicholas Piggin <npiggin@gmail.com> writes:
> >>
> >> > Hi,
> >> >
> >> > I'll post this out again to keep discussion going. Thanks all for the
> >> > testing and comments so far.
> >>
> >> Thanks for the update - did you mean for this to be tagged RFC as well?
> >
> > Yeah, it wasn't intended for merge with no RB or tests of course.
> > I intended to tag it RFC v2.
>
> I only did a basic test with this because of some other stuff, and I
> only tested 1, 2, and 3.  I didn't see any real performance changes, but
> that is only with a simple port-port test.  I plan to do some additional
> testing with some recursive calls.  That will also help to understand
> the limits a bit.

Thanks. Good so far.

>
> That said, I'm very nervous about the key allocator, especially if it is
> possible that it runs out.  We probably will need the limit to be
> bigger, but I want to get a worst-case flow from the OVN side to understand.
>
> >>
> >> I don't see any performance data with the deployments on x86_64 and
> >> ppc64le that cause these stack overflows.  Are you able to provide the
> >> impact on ppc64le and x86_64?
> >
> > Don't think it'll be easy, but they are not pushing such rates
> > so it wouldn't say much.  If you want to show the worst case, those
> > tput and latency microbenchmarks should do it.
> >
> > It's the same tradeoff and reasons the per-cpu key allocator was
> > added in the first place, presumably. Probably more expensive than
> > stack, but similar order of magnitude O(cycles) vs slab which is
> > probably O(100s cycles).
> >
> >> I guess the change probably should be tagged as -next since it doesn't
> >> really have a specific set of commits it is "fixing."  It's really like
> >> a major change and shouldn't really go through stable trees, but I'll
> >> let the maintainers tell me off if I got it wrong.
> >
> > It should go upstream first if anything. I thought it was relatively
> > simple and elegant to reuse the per-cpu key allocator though :(
> >
> > It is a kernel crash, so we need something for stable. But in a case
> > like this there's not one single problem. Linux kernel stack use has
> > always been pretty dumb - "don't use too much", for some values of
> > too much, and just cross fingers config and compiler and workload
> > doesn't hit some overflow case.
> >
> > And powerpc has always used more stack than x86, so probably it should stay
> > one power-of-two larger to be safe. And that may be the best fix for
> > -stable.
>
> Given the reply from David (with msg-id:
> <ff6cd12e28894f158d9a6c9f7157487f@AcuMS.aculab.com>), are there other
> things we can look at with respect to the compiler as well?

Not too easily or immediately, and if we did get something improved
then old compilers would still have to be supported for some time.

ppc64 stack might be increased to 32K too, which would avoid the
problem.

But whatever else we do, it would still be good to reduce ovs stack.

> > But also, ovs uses too much stack. Look at the stack sizes in the first
> > RFC patch, and ovs takes the 5 largest. That's because it has always
> > been the practice to not put large variables on stack, and when you're
> > introducing significant recursion that puts extra onus on you to be
> > lean. Even if it costs a percent. There are probably lots of places in
> > the kernel that could get a few cycles by sticking large structures on
> > stack, but unfortunately we can't all do that.
>
> Well, OVS operated this way for at least 6 years, so it isn't a recent
> thing.  But we should look at it.

Maybe ppc64 hadn't used it too much before and it was perfectly
fine on x86-64. And ovs developers shouldn't really be expected to
test on or understand stack allocation of all architectures. It
can't be called an ovs bug; there's just a bunch of things where
improvements could be made.

Thanks,
Nick

> I also wonder if we need to recurse in the internal devices, or if we
> shouldn't just push the skb into the packet queue.  That will cut out
> 1/3 of the stack frame that you reported originally, and then when doing
> the xmit, will cut out 2/3rds. I have no idea what the performance
> impact there might be.  Maybe it looks more like a latency hit
> rather than a throughput hit, but just speculating.

