Netdev Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH net 0/4] tcp: add three static keys
@ 2019-06-14 23:22 Eric Dumazet
  2019-06-14 23:22 ` [PATCH net 1/4] sysctl: define proc_do_static_key() Eric Dumazet
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Eric Dumazet @ 2019-06-14 23:22 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Eric Dumazet, Willem de Bruijn, Feng Tang, Eric Dumazet

Recent addition of per TCP socket rx/tx cache brought
regressions for some workloads, as reported by Feng Tang.

It seems better to make them opt-in, before we adopt better
heuristics.

The last patch adds high_order_alloc_disable sysctl
to ask TCP sendmsg() to exclusively use order-0 allocations,
as mm layer has specific optimizations.

Eric Dumazet (4):
  sysctl: define proc_do_static_key()
  tcp: add tcp_rx_skb_cache sysctl
  tcp: add tcp_tx_skb_cache sysctl
  net: add high_order_alloc_disable sysctl/static key

 Documentation/networking/ip-sysctl.txt |  8 +++++
 include/linux/bpf.h                    |  1 -
 include/linux/sysctl.h                 |  3 ++
 include/net/sock.h                     | 12 ++++---
 kernel/bpf/core.c                      |  1 -
 kernel/sysctl.c                        | 44 ++++++++++++++------------
 net/core/sock.c                        |  4 ++-
 net/core/sysctl_net_core.c             |  7 ++++
 net/ipv4/sysctl_net_ipv4.c             | 17 ++++++++++
 9 files changed, 68 insertions(+), 29 deletions(-)

-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH net 1/4] sysctl: define proc_do_static_key()
  2019-06-14 23:22 [PATCH net 0/4] tcp: add three static keys Eric Dumazet
@ 2019-06-14 23:22 ` Eric Dumazet
  2019-06-14 23:45   ` Alexei Starovoitov
  2019-06-14 23:22 ` [PATCH net 2/4] tcp: add tcp_rx_skb_cache sysctl Eric Dumazet
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2019-06-14 23:22 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Eric Dumazet, Willem de Bruijn, Feng Tang, Eric Dumazet

Convert proc_dointvec_minmax_bpf_stats() into a more generic
helper, since we are going to use jump labels more often.

Note that sysctl_bpf_stats_enabled is removed, since
it is no longer needed/used.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/bpf.h    |  1 -
 include/linux/sysctl.h |  3 +++
 kernel/bpf/core.c      |  1 -
 kernel/sysctl.c        | 44 ++++++++++++++++++++++--------------------
 4 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5df8e9e2a3933949af17dda1d77a4daccd5df611..b92ef9f73e42f1bcf0141aa21d0e9c17c5c7f05b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -600,7 +600,6 @@ void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 
 extern int sysctl_unprivileged_bpf_disabled;
-extern int sysctl_bpf_stats_enabled;
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
 int bpf_prog_new_fd(struct bpf_prog *prog);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index b769ecfcc3bd41aad6fd339ba824c6bb622ac24d..aadd310769d080f1d45db14b2a72fc8ad36f1196 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -63,6 +63,9 @@ extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
 				      void __user *, size_t *, loff_t *);
 extern int proc_do_large_bitmap(struct ctl_table *, int,
 				void __user *, size_t *, loff_t *);
+extern int proc_do_static_key(struct ctl_table *table, int write,
+			      void __user *buffer, size_t *lenp,
+			      loff_t *ppos);
 
 /*
  * Register a set of sysctl names by calling register_sysctl_table
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7c473f208a1058de97434a57a2d47e2360ae80a8..080e2bb644ccd761b3d54fbad9b58a881086231e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2097,7 +2097,6 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 
 DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 EXPORT_SYMBOL(bpf_stats_enabled_key);
-int sysctl_bpf_stats_enabled __read_mostly;
 
 /* All definitions of tracepoints related to BPF. */
 #define CREATE_TRACE_POINTS
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7d1008be6173313c807b2abb23f3171ef05cddc8..1beca96fb6252ddc4af07b6292f9dd95c4f53afd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -230,11 +230,6 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
 #endif
 static int proc_dopipe_max_size(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos);
-#ifdef CONFIG_BPF_SYSCALL
-static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
-					  void __user *buffer, size_t *lenp,
-					  loff_t *ppos);
-#endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
 /* Note: sysrq code uses its own private copy */
@@ -1253,12 +1248,10 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.procname	= "bpf_stats_enabled",
-		.data		= &sysctl_bpf_stats_enabled,
-		.maxlen		= sizeof(sysctl_bpf_stats_enabled),
+		.data		= &bpf_stats_enabled_key.key,
+		.maxlen		= sizeof(bpf_stats_enabled_key),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax_bpf_stats,
-		.extra1		= &zero,
-		.extra2		= &one,
+		.proc_handler	= proc_do_static_key,
 	},
 #endif
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
@@ -3374,26 +3367,35 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 
 #endif /* CONFIG_PROC_SYSCTL */
 
-#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
-static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
-					  void __user *buffer, size_t *lenp,
-					  loff_t *ppos)
+#if defined(CONFIG_SYSCTL)
+int proc_do_static_key(struct ctl_table *table, int write,
+		       void __user *buffer, size_t *lenp,
+		       loff_t *ppos)
 {
-	int ret, bpf_stats = *(int *)table->data;
-	struct ctl_table tmp = *table;
+	struct static_key *key = (struct static_key *)table->data;
+	static DEFINE_MUTEX(static_key_mutex);
+	int val, ret;
+	struct ctl_table tmp = {
+		.data   = &val,
+		.maxlen = sizeof(val),
+		.mode   = table->mode,
+		.extra1 = &zero,
+		.extra2 = &one,
+	};
 
 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	tmp.data = &bpf_stats;
+	mutex_lock(&static_key_mutex);
+	val = static_key_enabled(key);
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 	if (write && !ret) {
-		*(int *)table->data = bpf_stats;
-		if (bpf_stats)
-			static_branch_enable(&bpf_stats_enabled_key);
+		if (val)
+			static_key_enable(key);
 		else
-			static_branch_disable(&bpf_stats_enabled_key);
+			static_key_disable(key);
 	}
+	mutex_unlock(&static_key_mutex);
 	return ret;
 }
 #endif
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH net 2/4] tcp: add tcp_rx_skb_cache sysctl
  2019-06-14 23:22 [PATCH net 0/4] tcp: add three static keys Eric Dumazet
  2019-06-14 23:22 ` [PATCH net 1/4] sysctl: define proc_do_static_key() Eric Dumazet
@ 2019-06-14 23:22 ` Eric Dumazet
  2019-06-16  7:38   ` Feng Tang
  2019-06-14 23:22 ` [PATCH net 3/4] tcp: add tcp_tx_skb_cache sysctl Eric Dumazet
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2019-06-14 23:22 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Eric Dumazet, Willem de Bruijn, Feng Tang, Eric Dumazet

Instead of relying on rps_needed, it is safer to use a separate
static key, since we do not want to enable TCP rx_skb_cache
by default. This feature can cause a huge increase in memory
usage on hosts with millions of sockets.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 Documentation/networking/ip-sysctl.txt | 8 ++++++++
 include/net/sock.h                     | 6 ++----
 net/ipv4/sysctl_net_ipv4.c             | 9 +++++++++
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 14fe93049d28e965d7349b03c5c8782c3d386e7d..288aa264ac26d98637a5bb1babc334bfc699bef1 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -772,6 +772,14 @@ tcp_challenge_ack_limit - INTEGER
 	in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
 	Default: 100
 
+tcp_rx_skb_cache - BOOLEAN
+	Controls a per TCP socket cache of one skb, that might help
+	performance of some workloads. This might be dangerous
+	on systems with a lot of TCP sockets, since it increases
+	memory usage.
+
+	Default: 0 (disabled)
+
 UDP variables:
 
 udp_l3mdev_accept - BOOLEAN
diff --git a/include/net/sock.h b/include/net/sock.h
index e9d769c04637a3c0b967c9bfa6def724834796b9..b02645e2dfad722769c1455bcde76e46da9fc5ac 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2433,13 +2433,11 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
  * This routine must be called with interrupts disabled or with the socket
  * locked so that the sk_buff queue operation is ok.
 */
+DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_unlink(skb, &sk->sk_receive_queue);
-	if (
-#ifdef CONFIG_RPS
-	    !static_branch_unlikely(&rps_needed) &&
-#endif
+	if (static_branch_unlikely(&tcp_rx_skb_cache_key) &&
 	    !sk->sk_rx_skb_cache) {
 		sk->sk_rx_skb_cache = skb;
 		skb_orphan(skb);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 875867b64d6a6597bf4fcd3498ed55741cbe33f7..886b58d31351df44725bdc34081e798bcb89ecf0 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -51,6 +51,9 @@ static int comp_sack_nr_max = 255;
 static u32 u32_max_div_HZ = UINT_MAX / HZ;
 static int one_day_secs = 24 * 3600;
 
+DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
+EXPORT_SYMBOL(tcp_rx_skb_cache_key);
+
 /* obsolete */
 static int sysctl_tcp_low_latency __read_mostly;
 
@@ -559,6 +562,12 @@ static struct ctl_table ipv4_table[] = {
 		.extra1		= &sysctl_fib_sync_mem_min,
 		.extra2		= &sysctl_fib_sync_mem_max,
 	},
+	{
+		.procname	= "tcp_rx_skb_cache",
+		.data		= &tcp_rx_skb_cache_key.key,
+		.mode		= 0644,
+		.proc_handler	= proc_do_static_key,
+	},
 	{ }
 };
 
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH net 3/4] tcp: add tcp_tx_skb_cache sysctl
  2019-06-14 23:22 [PATCH net 0/4] tcp: add three static keys Eric Dumazet
  2019-06-14 23:22 ` [PATCH net 1/4] sysctl: define proc_do_static_key() Eric Dumazet
  2019-06-14 23:22 ` [PATCH net 2/4] tcp: add tcp_rx_skb_cache sysctl Eric Dumazet
@ 2019-06-14 23:22 ` Eric Dumazet
  2019-06-16  7:42   ` Feng Tang
  2019-06-14 23:22 ` [PATCH net 4/4] net: add high_order_alloc_disable sysctl/static key Eric Dumazet
  2019-06-15  3:18 ` [PATCH net 0/4] tcp: add three static keys David Miller
  4 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2019-06-14 23:22 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Eric Dumazet, Willem de Bruijn, Feng Tang, Eric Dumazet

Feng Tang reported a performance regression after introduction
of per TCP socket tx/rx caches, for TCP over loopback (netperf)

There is a high chance the regression is caused by a change in
how well the 32 KB per-thread page (current->task_frag) can
be recycled, and by the lack of pcp caches for order-3 pages.

I could not reproduce the regression myself: the cpus were all
spinning on the mm spinlocks for page allocs/freeing, regardless
of enabling or disabling the per tcp socket caches.

It seems best to disable the feature by default, and let
admins enable it.

MM layer either needs to provide scalable order-3 pages
allocations, or could attempt a trylock on zone->lock if
the caller only attempts to get a high-order page and is
able to fallback to order-0 ones in case of pressure.

Tests run on a 56 cores host (112 hyper threads)

-	35.49%	netperf 		 [kernel.vmlinux]	  [k] queued_spin_lock_slowpath
   - 35.49% queued_spin_lock_slowpath
	  - 18.18% get_page_from_freelist
		 - __alloc_pages_nodemask
			- 18.18% alloc_pages_current
				 skb_page_frag_refill
				 sk_page_frag_refill
				 tcp_sendmsg_locked
				 tcp_sendmsg
				 inet_sendmsg
				 sock_sendmsg
				 __sys_sendto
				 __x64_sys_sendto
				 do_syscall_64
				 entry_SYSCALL_64_after_hwframe
				 __libc_send
	  + 17.31% __free_pages_ok
+	31.43%	swapper 		 [kernel.vmlinux]	  [k] intel_idle
+	 9.12%	netperf 		 [kernel.vmlinux]	  [k] copy_user_enhanced_fast_string
+	 6.53%	netserver		 [kernel.vmlinux]	  [k] copy_user_enhanced_fast_string
+	 0.69%	netserver		 [kernel.vmlinux]	  [k] queued_spin_lock_slowpath
+	 0.68%	netperf 		 [kernel.vmlinux]	  [k] skb_release_data
+	 0.52%	netperf 		 [kernel.vmlinux]	  [k] tcp_sendmsg_locked
	 0.46%	netperf 		 [kernel.vmlinux]	  [k] _raw_spin_lock_irqsave

Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Feng Tang <feng.tang@intel.com>
---
 include/net/sock.h         | 4 +++-
 net/ipv4/sysctl_net_ipv4.c | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index b02645e2dfad722769c1455bcde76e46da9fc5ac..7d7f4ce63bb2aae7c87a9445d11339b6e6b19724 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1463,12 +1463,14 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 		__sk_mem_reclaim(sk, 1 << 20);
 }
 
+DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
 	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
 	sk->sk_wmem_queued -= skb->truesize;
 	sk_mem_uncharge(sk, skb->truesize);
-	if (!sk->sk_tx_skb_cache && !skb_cloned(skb)) {
+	if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
+	    !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
 		skb_zcopy_clear(skb, true);
 		sk->sk_tx_skb_cache = skb;
 		return;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 886b58d31351df44725bdc34081e798bcb89ecf0..08a428a7b2749c4f2a03aa6352e44c053596ef75 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -54,6 +54,8 @@ static int one_day_secs = 24 * 3600;
 DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
 EXPORT_SYMBOL(tcp_rx_skb_cache_key);
 
+DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
+
 /* obsolete */
 static int sysctl_tcp_low_latency __read_mostly;
 
@@ -568,6 +570,12 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_do_static_key,
 	},
+	{
+		.procname	= "tcp_tx_skb_cache",
+		.data		= &tcp_tx_skb_cache_key.key,
+		.mode		= 0644,
+		.proc_handler	= proc_do_static_key,
+	},
 	{ }
 };
 
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH net 4/4] net: add high_order_alloc_disable sysctl/static key
  2019-06-14 23:22 [PATCH net 0/4] tcp: add three static keys Eric Dumazet
                   ` (2 preceding siblings ...)
  2019-06-14 23:22 ` [PATCH net 3/4] tcp: add tcp_tx_skb_cache sysctl Eric Dumazet
@ 2019-06-14 23:22 ` Eric Dumazet
  2019-06-15  3:18 ` [PATCH net 0/4] tcp: add three static keys David Miller
  4 siblings, 0 replies; 11+ messages in thread
From: Eric Dumazet @ 2019-06-14 23:22 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Eric Dumazet, Willem de Bruijn, Feng Tang, Eric Dumazet

Since linux-3.7 (commit 5640f7685831 "net: use a per task frag
allocator"), TCP sendmsg() has preferred using order-3 allocations.

While it gives good results for most cases, we had reports
that heavy use of TCP over loopback was hitting spinlock
contention in page allocations/freeing.

This commit adds a sysctl so that admins can opt in
to order-0 allocations. Hopefully the mm layer might optimize
order-3 allocations in the future, since it could give us
a nice boost (see the first 8 lines of the following benchmark).

The following benchmark shows a win when more than 8 TCP_STREAM
threads are running (56 x86 cores server in my tests)

for thr in {1..30}
do
 sysctl -wq net.core.high_order_alloc_disable=0
 T0=`./super_netperf $thr -H 127.0.0.1 -l 15`
 sysctl -wq net.core.high_order_alloc_disable=1
 T1=`./super_netperf $thr -H 127.0.0.1 -l 15`
 echo $thr:$T0:$T1
done

1: 49979: 37267
2: 98745: 76286
3: 141088: 110051
4: 177414: 144772
5: 197587: 173563
6: 215377: 208448
7: 241061: 234087
8: 267155: 263373
9: 295069: 297402
10: 312393: 335213
11: 340462: 368778
12: 371366: 403954
13: 412344: 443713
14: 426617: 473580
15: 474418: 507861
16: 503261: 538539
17: 522331: 563096
18: 532409: 567084
19: 550824: 605240
20: 525493: 641988
21: 564574: 665843
22: 567349: 690868
23: 583846: 710917
24: 588715: 736306
25: 603212: 763494
26: 604083: 792654
27: 602241: 796450
28: 604291: 797993
29: 611610: 833249
30: 577356: 841062

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/sock.h         | 2 ++
 net/core/sock.c            | 4 +++-
 net/core/sysctl_net_core.c | 7 +++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 7d7f4ce63bb2aae7c87a9445d11339b6e6b19724..6cbc16136357d158cf1e84b98ecb7e06898269a6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2534,6 +2534,8 @@ extern int sysctl_optmem_max;
 extern __u32 sysctl_wmem_default;
 extern __u32 sysctl_rmem_default;
 
+DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
+
 static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
 {
 	/* Does this proto have per netns sysctl_wmem ? */
diff --git a/net/core/sock.c b/net/core/sock.c
index 2b3701958486219a26385c8fca1498c4e294dc1d..7f49392579a5826b0a8446bb88428396422cc0b6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2320,6 +2320,7 @@ static void sk_leave_memory_pressure(struct sock *sk)
 
 /* On 32bit arches, an skb frag is limited to 2^15 */
 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
+DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
 
 /**
  * skb_page_frag_refill - check that a page_frag contains enough room
@@ -2344,7 +2345,8 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 	}
 
 	pfrag->offset = 0;
-	if (SKB_FRAG_PAGE_ORDER) {
+	if (SKB_FRAG_PAGE_ORDER &&
+	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
 		/* Avoid direct reclaim but allow kswapd to wake */
 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
 					  __GFP_COMP | __GFP_NOWARN |
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 1a2685694abd537d7ae304754b84b237928fd298..f9204719aeeeb4700582d03fda244f2f8961f8a7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -562,6 +562,13 @@ static struct ctl_table net_core_table[] = {
 		.extra1		= &zero,
 		.extra2		= &two,
 	},
+	{
+		.procname	= "high_order_alloc_disable",
+		.data		= &net_high_order_alloc_disable_key.key,
+		.maxlen         = sizeof(net_high_order_alloc_disable_key),
+		.mode		= 0644,
+		.proc_handler	= proc_do_static_key,
+	},
 	{ }
 };
 
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net 1/4] sysctl: define proc_do_static_key()
  2019-06-14 23:22 ` [PATCH net 1/4] sysctl: define proc_do_static_key() Eric Dumazet
@ 2019-06-14 23:45   ` Alexei Starovoitov
  2019-06-14 23:55     ` Eric Dumazet
  0 siblings, 1 reply; 11+ messages in thread
From: Alexei Starovoitov @ 2019-06-14 23:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, netdev, Willem de Bruijn, Feng Tang, Eric Dumazet

On Fri, Jun 14, 2019 at 04:22:18PM -0700, Eric Dumazet wrote:
> Convert proc_dointvec_minmax_bpf_stats() into a more generic
> helper, since we are going to use jump labels more often.
> 
> Note that sysctl_bpf_stats_enabled is removed, since
> it is no longer needed/used.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  include/linux/bpf.h    |  1 -
>  include/linux/sysctl.h |  3 +++
>  kernel/bpf/core.c      |  1 -
>  kernel/sysctl.c        | 44 ++++++++++++++++++++++--------------------
>  4 files changed, 26 insertions(+), 23 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 5df8e9e2a3933949af17dda1d77a4daccd5df611..b92ef9f73e42f1bcf0141aa21d0e9c17c5c7f05b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -600,7 +600,6 @@ void bpf_map_area_free(void *base);
>  void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
>  
>  extern int sysctl_unprivileged_bpf_disabled;
> -extern int sysctl_bpf_stats_enabled;
>  
>  int bpf_map_new_fd(struct bpf_map *map, int flags);
>  int bpf_prog_new_fd(struct bpf_prog *prog);
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index b769ecfcc3bd41aad6fd339ba824c6bb622ac24d..aadd310769d080f1d45db14b2a72fc8ad36f1196 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -63,6 +63,9 @@ extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
>  				      void __user *, size_t *, loff_t *);
>  extern int proc_do_large_bitmap(struct ctl_table *, int,
>  				void __user *, size_t *, loff_t *);
> +extern int proc_do_static_key(struct ctl_table *table, int write,
> +			      void __user *buffer, size_t *lenp,
> +			      loff_t *ppos);
>  
>  /*
>   * Register a set of sysctl names by calling register_sysctl_table
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 7c473f208a1058de97434a57a2d47e2360ae80a8..080e2bb644ccd761b3d54fbad9b58a881086231e 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2097,7 +2097,6 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
>  
>  DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>  EXPORT_SYMBOL(bpf_stats_enabled_key);
> -int sysctl_bpf_stats_enabled __read_mostly;
>  
>  /* All definitions of tracepoints related to BPF. */
>  #define CREATE_TRACE_POINTS
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 7d1008be6173313c807b2abb23f3171ef05cddc8..1beca96fb6252ddc4af07b6292f9dd95c4f53afd 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -230,11 +230,6 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
>  #endif
>  static int proc_dopipe_max_size(struct ctl_table *table, int write,
>  		void __user *buffer, size_t *lenp, loff_t *ppos);
> -#ifdef CONFIG_BPF_SYSCALL
> -static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
> -					  void __user *buffer, size_t *lenp,
> -					  loff_t *ppos);
> -#endif
>  
>  #ifdef CONFIG_MAGIC_SYSRQ
>  /* Note: sysrq code uses its own private copy */
> @@ -1253,12 +1248,10 @@ static struct ctl_table kern_table[] = {
>  	},
>  	{
>  		.procname	= "bpf_stats_enabled",
> -		.data		= &sysctl_bpf_stats_enabled,
> -		.maxlen		= sizeof(sysctl_bpf_stats_enabled),
> +		.data		= &bpf_stats_enabled_key.key,
> +		.maxlen		= sizeof(bpf_stats_enabled_key),

maxlen is ignored by proc_do_static_key(), right?

Great idea, btw. I like the series.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net 1/4] sysctl: define proc_do_static_key()
  2019-06-14 23:45   ` Alexei Starovoitov
@ 2019-06-14 23:55     ` Eric Dumazet
  2019-06-15  0:04       ` Alexei Starovoitov
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2019-06-14 23:55 UTC (permalink / raw)
  To: Alexei Starovoitov, Eric Dumazet
  Cc: David S . Miller, netdev, Willem de Bruijn, Feng Tang, Eric Dumazet



On 6/14/19 4:45 PM, Alexei Starovoitov wrote:
> On Fri, Jun 14, 2019 at 04:22:18PM -0700, Eric Dumazet wrote:

> maxlen is ignored by proc_do_static_key(), right?

That is right, I was not sure putting a zero or sizeof(int)
would make sense here.

Using sizeof(...key) is consistent with other sysctls,
even if proc_do_static_key() uses a temporary structure and
a temporary integer in its current implementation.

> 
> Great idea, btw. I like the series.
> 

Thanks :)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net 1/4] sysctl: define proc_do_static_key()
  2019-06-14 23:55     ` Eric Dumazet
@ 2019-06-15  0:04       ` Alexei Starovoitov
  0 siblings, 0 replies; 11+ messages in thread
From: Alexei Starovoitov @ 2019-06-15  0:04 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Eric Dumazet, David S . Miller, netdev, Willem de Bruijn, Feng Tang

On Fri, Jun 14, 2019 at 4:55 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> On 6/14/19 4:45 PM, Alexei Starovoitov wrote:
> > On Fri, Jun 14, 2019 at 04:22:18PM -0700, Eric Dumazet wrote:
>
> > maxlen is ignored by proc_do_static_key(), right?
>
> That is right, I was not sure putting a zero or sizeof(int)
> would make sense here.
>
> Using sizeof(...key) is consistent with other sysctls,

yes. that makes sense. I was just curious whether I missed something.

> even if proc_do_static_key() uses a temporary structure and
> a temporary integer in its current implementation.

yep.
Acked-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net 0/4] tcp: add three static keys
  2019-06-14 23:22 [PATCH net 0/4] tcp: add three static keys Eric Dumazet
                   ` (3 preceding siblings ...)
  2019-06-14 23:22 ` [PATCH net 4/4] net: add high_order_alloc_disable sysctl/static key Eric Dumazet
@ 2019-06-15  3:18 ` David Miller
  4 siblings, 0 replies; 11+ messages in thread
From: David Miller @ 2019-06-15  3:18 UTC (permalink / raw)
  To: edumazet; +Cc: netdev, willemb, feng.tang, eric.dumazet

From: Eric Dumazet <edumazet@google.com>
Date: Fri, 14 Jun 2019 16:22:17 -0700

> Recent addition of per TCP socket rx/tx cache brought
> regressions for some workloads, as reported by Feng Tang.
> 
> It seems better to make them opt-in, before we adopt better
> heuristics.
> 
> The last patch adds high_order_alloc_disable sysctl
> to ask TCP sendmsg() to exclusively use order-0 allocations,
> as mm layer has specific optimizations.

Series applied.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net 2/4] tcp: add tcp_rx_skb_cache sysctl
  2019-06-14 23:22 ` [PATCH net 2/4] tcp: add tcp_rx_skb_cache sysctl Eric Dumazet
@ 2019-06-16  7:38   ` Feng Tang
  0 siblings, 0 replies; 11+ messages in thread
From: Feng Tang @ 2019-06-16  7:38 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, netdev, Willem de Bruijn, Eric Dumazet

On Fri, Jun 14, 2019 at 04:22:19PM -0700, Eric Dumazet wrote:
> Instead of relying on rps_needed, it is safer to use a separate
> static key, since we do not want to enable TCP rx_skb_cache
> by default. This feature can cause huge increase of memory
> usage on hosts with millions of sockets.

Thanks for the effort!

Acked-by: Feng Tang <feng.tang@intel.com>

> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  Documentation/networking/ip-sysctl.txt | 8 ++++++++
>  include/net/sock.h                     | 6 ++----
>  net/ipv4/sysctl_net_ipv4.c             | 9 +++++++++
>  3 files changed, 19 insertions(+), 4 deletions(-)
> 
> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> index 14fe93049d28e965d7349b03c5c8782c3d386e7d..288aa264ac26d98637a5bb1babc334bfc699bef1 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -772,6 +772,14 @@ tcp_challenge_ack_limit - INTEGER
>  	in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
>  	Default: 100
>  
> +tcp_rx_skb_cache - BOOLEAN
> +	Controls a per TCP socket cache of one skb, that might help
> +	performance of some workloads. This might be dangerous
> +	on systems with a lot of TCP sockets, since it increases
> +	memory usage.
> +
> +	Default: 0 (disabled)
> +
>  UDP variables:
>  
>  udp_l3mdev_accept - BOOLEAN
> diff --git a/include/net/sock.h b/include/net/sock.h
> index e9d769c04637a3c0b967c9bfa6def724834796b9..b02645e2dfad722769c1455bcde76e46da9fc5ac 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -2433,13 +2433,11 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
>   * This routine must be called with interrupts disabled or with the socket
>   * locked so that the sk_buff queue operation is ok.
>  */
> +DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
>  static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
>  {
>  	__skb_unlink(skb, &sk->sk_receive_queue);
> -	if (
> -#ifdef CONFIG_RPS
> -	    !static_branch_unlikely(&rps_needed) &&
> -#endif
> +	if (static_branch_unlikely(&tcp_rx_skb_cache_key) &&
>  	    !sk->sk_rx_skb_cache) {
>  		sk->sk_rx_skb_cache = skb;
>  		skb_orphan(skb);
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 875867b64d6a6597bf4fcd3498ed55741cbe33f7..886b58d31351df44725bdc34081e798bcb89ecf0 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -51,6 +51,9 @@ static int comp_sack_nr_max = 255;
>  static u32 u32_max_div_HZ = UINT_MAX / HZ;
>  static int one_day_secs = 24 * 3600;
>  
> +DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
> +EXPORT_SYMBOL(tcp_rx_skb_cache_key);
> +
>  /* obsolete */
>  static int sysctl_tcp_low_latency __read_mostly;
>  
> @@ -559,6 +562,12 @@ static struct ctl_table ipv4_table[] = {
>  		.extra1		= &sysctl_fib_sync_mem_min,
>  		.extra2		= &sysctl_fib_sync_mem_max,
>  	},
> +	{
> +		.procname	= "tcp_rx_skb_cache",
> +		.data		= &tcp_rx_skb_cache_key.key,
> +		.mode		= 0644,
> +		.proc_handler	= proc_do_static_key,
> +	},
>  	{ }
>  };
>  
> -- 
> 2.22.0.410.gd8fdbe21b5-goog

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net 3/4] tcp: add tcp_tx_skb_cache sysctl
  2019-06-14 23:22 ` [PATCH net 3/4] tcp: add tcp_tx_skb_cache sysctl Eric Dumazet
@ 2019-06-16  7:42   ` Feng Tang
  0 siblings, 0 replies; 11+ messages in thread
From: Feng Tang @ 2019-06-16  7:42 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, netdev, Willem de Bruijn, Eric Dumazet

Hi Eric,

On Fri, Jun 14, 2019 at 04:22:20PM -0700, Eric Dumazet wrote:
> Feng Tang reported a performance regression after introduction
> of per TCP socket tx/rx caches, for TCP over loopback (netperf)
> 
> There is a high chance the regression is caused by a change in
> how well the 32 KB per-thread page (current->task_frag) can
> be recycled, and by the lack of pcp caches for order-3 pages.

Exactly! When I checked the regression, I did several experiments,
and thought of a similar idea: adding a per-CPU orderX pcp list.
The other idea is to add an order-3 list in the per-cpu softnet_data
as a local cache.

Thanks,
Feng

> 
> I could not reproduce the regression myself: the cpus were all
> spinning on the mm spinlocks for page allocs/freeing, regardless
> of enabling or disabling the per tcp socket caches.
> 
> It seems best to disable the feature by default, and let
> admins enable it.
> 
> MM layer either needs to provide scalable order-3 pages
> allocations, or could attempt a trylock on zone->lock if
> the caller only attempts to get a high-order page and is
> able to fallback to order-0 ones in case of pressure.
> 
> Tests run on a 56 cores host (112 hyper threads)
> 
> -	35.49%	netperf 		 [kernel.vmlinux]	  [k] queued_spin_lock_slowpath
>    - 35.49% queued_spin_lock_slowpath
> 	  - 18.18% get_page_from_freelist
> 		 - __alloc_pages_nodemask
> 			- 18.18% alloc_pages_current
> 				 skb_page_frag_refill
> 				 sk_page_frag_refill
> 				 tcp_sendmsg_locked
> 				 tcp_sendmsg
> 				 inet_sendmsg
> 				 sock_sendmsg
> 				 __sys_sendto
> 				 __x64_sys_sendto
> 				 do_syscall_64
> 				 entry_SYSCALL_64_after_hwframe
> 				 __libc_send
> 	  + 17.31% __free_pages_ok
> +	31.43%	swapper 		 [kernel.vmlinux]	  [k] intel_idle
> +	 9.12%	netperf 		 [kernel.vmlinux]	  [k] copy_user_enhanced_fast_string
> +	 6.53%	netserver		 [kernel.vmlinux]	  [k] copy_user_enhanced_fast_string
> +	 0.69%	netserver		 [kernel.vmlinux]	  [k] queued_spin_lock_slowpath
> +	 0.68%	netperf 		 [kernel.vmlinux]	  [k] skb_release_data
> +	 0.52%	netperf 		 [kernel.vmlinux]	  [k] tcp_sendmsg_locked
> 	 0.46%	netperf 		 [kernel.vmlinux]	  [k] _raw_spin_lock_irqsave
> 
> Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-by: Feng Tang <feng.tang@intel.com>
> ---
>  include/net/sock.h         | 4 +++-
>  net/ipv4/sysctl_net_ipv4.c | 8 ++++++++
>  2 files changed, 11 insertions(+), 1 deletion(-)
 

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, back to index

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-14 23:22 [PATCH net 0/4] tcp: add three static keys Eric Dumazet
2019-06-14 23:22 ` [PATCH net 1/4] sysctl: define proc_do_static_key() Eric Dumazet
2019-06-14 23:45   ` Alexei Starovoitov
2019-06-14 23:55     ` Eric Dumazet
2019-06-15  0:04       ` Alexei Starovoitov
2019-06-14 23:22 ` [PATCH net 2/4] tcp: add tcp_rx_skb_cache sysctl Eric Dumazet
2019-06-16  7:38   ` Feng Tang
2019-06-14 23:22 ` [PATCH net 3/4] tcp: add tcp_tx_skb_cache sysctl Eric Dumazet
2019-06-16  7:42   ` Feng Tang
2019-06-14 23:22 ` [PATCH net 4/4] net: add high_order_alloc_disable sysctl/static key Eric Dumazet
2019-06-15  3:18 ` [PATCH net 0/4] tcp: add three static keys David Miller

Netdev Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/netdev/0 netdev/git/0.git
	git clone --mirror https://lore.kernel.org/netdev/1 netdev/git/1.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 netdev netdev/ https://lore.kernel.org/netdev \
		netdev@vger.kernel.org
	public-inbox-index netdev

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.netdev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git