All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC v2] net/core: add optional threading for rps backlog processing
@ 2023-02-17 10:06 Felix Fietkau
  2023-02-17 12:23 ` Eric Dumazet
  2023-02-17 13:24 ` kernel test robot
  0 siblings, 2 replies; 9+ messages in thread
From: Felix Fietkau @ 2023-02-17 10:06 UTC (permalink / raw)
  To: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: linux-kernel

When dealing with few flows or an imbalance on CPU utilization, static RPS
CPU assignment can be too inflexible. Add support for enabling threaded NAPI
for RPS backlog processing in order to allow the scheduler to better balance
processing. This helps better spread the load across idle CPUs.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

RFC v2:
 - fix rebase error in rps locking

 include/linux/netdevice.h  |  1 +
 net/core/dev.c             | 61 ++++++++++++++++++++++++++++++++++----
 net/core/sysctl_net_core.c | 27 +++++++++++++++++
 3 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d9cdbc047b49..9ee2162c907e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -522,6 +522,7 @@ static inline bool napi_complete(struct napi_struct *n)
 }
 
 int dev_set_threaded(struct net_device *dev, bool threaded);
+int rps_set_threaded(bool threaded);
 
 /**
  *	napi_disable - prevent NAPI from scheduling
diff --git a/net/core/dev.c b/net/core/dev.c
index 357081b0113c..c138e40536e4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4597,7 +4597,7 @@ static int napi_schedule_rps(struct softnet_data *sd)
 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 
 #ifdef CONFIG_RPS
-	if (sd != mysd) {
+	if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
 		sd->rps_ipi_next = mysd->rps_ipi_list;
 		mysd->rps_ipi_list = sd;
 
@@ -5936,13 +5936,12 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		if (skb_queue_empty(&sd->input_pkt_queue)) {
 			/*
 			 * Inline a custom version of __napi_complete().
-			 * only current cpu owns and manipulates this napi,
-			 * and NAPI_STATE_SCHED is the only possible flag set
-			 * on backlog.
+			 * only current cpu owns and manipulates this napi.
 			 * We can use a plain write instead of clear_bit(),
 			 * and we dont need an smp_mb() memory barrier.
 			 */
-			napi->state = 0;
+			napi->state &= ~(NAPIF_STATE_SCHED |
+					 NAPIF_STATE_SCHED_THREADED);
 			again = false;
 		} else {
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
@@ -6356,6 +6355,55 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
 }
 EXPORT_SYMBOL(dev_set_threaded);
 
+#ifdef CONFIG_RPS
+int rps_set_threaded(bool threaded)
+{
+	static bool rps_threaded;
+	int err = 0;
+	int i;
+
+	if (rps_threaded == threaded)
+		return 0;
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+		struct napi_struct *n = &sd->backlog;
+
+		n->thread = kthread_run(napi_threaded_poll, n, "napi/rps-%d", i);
+		if (IS_ERR(n->thread)) {
+			err = PTR_ERR(n->thread);
+			pr_err("kthread_run failed with err %d\n", err);
+			n->thread = NULL;
+			threaded = false;
+			break;
+		}
+
+	}
+
+	rps_threaded = threaded;
+
+	/* Make sure kthread is created before THREADED bit
+	 * is set.
+	 */
+	smp_mb__before_atomic();
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+		struct napi_struct *n = &sd->backlog;
+		unsigned long flags;
+
+		rps_lock_irqsave(sd, &flags);
+		if (threaded)
+			n->state |= NAPIF_STATE_THREADED;
+		else
+			n->state &= ~NAPIF_STATE_THREADED;
+		rps_lock_irq_restore(sd, &flags);
+	}
+
+	return err;
+}
+#endif
+
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -11114,6 +11162,9 @@ static int dev_cpu_dead(unsigned int oldcpu)
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 	local_irq_enable();
 
+	if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
+		return 0;
+
 #ifdef CONFIG_RPS
 	remsd = oldsd->rps_ipi_list;
 	oldsd->rps_ipi_list = NULL;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 7130e6d9e263..438957535e74 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -30,6 +30,7 @@ static int int_3600 = 3600;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
+static int rps_threaded;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
@@ -163,6 +164,23 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 
 	return ret;
 }
+
+static int rps_threaded_sysctl(struct ctl_table *table, int write,
+			       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(rps_threaded_mutex);
+	int ret;
+
+	mutex_lock(&rps_threaded_mutex);
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (write && !ret)
+		ret = rps_set_threaded(rps_threaded);
+
+	mutex_unlock(&rps_threaded_mutex);
+
+	return ret;
+}
 #endif /* CONFIG_RPS */
 
 #ifdef CONFIG_NET_FLOW_LIMIT
@@ -513,6 +531,15 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= rps_default_mask_sysctl
 	},
+	{
+		.procname	= "rps_threaded",
+		.data		= &rps_threaded,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= rps_threaded_sysctl,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
 #endif
 #ifdef CONFIG_NET_FLOW_LIMIT
 	{
-- 
2.39.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [RFC v2] net/core: add optional threading for rps backlog processing
  2023-02-17 10:06 [RFC v2] net/core: add optional threading for rps backlog processing Felix Fietkau
@ 2023-02-17 12:23 ` Eric Dumazet
  2023-02-17 12:35   ` Felix Fietkau
  2023-02-17 13:24 ` kernel test robot
  1 sibling, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2023-02-17 12:23 UTC (permalink / raw)
  To: Felix Fietkau
  Cc: netdev, David S. Miller, Jakub Kicinski, Paolo Abeni, linux-kernel

On Fri, Feb 17, 2023 at 11:06 AM Felix Fietkau <nbd@nbd.name> wrote:
>
> When dealing with few flows or an imbalance on CPU utilization, static RPS
> CPU assignment can be too inflexible. Add support for enabling threaded NAPI
> for RPS backlog processing in order to allow the scheduler to better balance
> processing. This helps better spread the load across idle CPUs.
>
> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> ---
>
> RFC v2:
>  - fix rebase error in rps locking

Why only deal with RPS ?

It seems you propose the sofnet_data backlog be processed by a thread,
instead than from softirq ?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC v2] net/core: add optional threading for rps backlog processing
  2023-02-17 12:23 ` Eric Dumazet
@ 2023-02-17 12:35   ` Felix Fietkau
  2023-02-17 12:57     ` Eric Dumazet
  0 siblings, 1 reply; 9+ messages in thread
From: Felix Fietkau @ 2023-02-17 12:35 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, David S. Miller, Jakub Kicinski, Paolo Abeni, linux-kernel

On 17.02.23 13:23, Eric Dumazet wrote:
> On Fri, Feb 17, 2023 at 11:06 AM Felix Fietkau <nbd@nbd.name> wrote:
>>
>> When dealing with few flows or an imbalance on CPU utilization, static RPS
>> CPU assignment can be too inflexible. Add support for enabling threaded NAPI
>> for RPS backlog processing in order to allow the scheduler to better balance
>> processing. This helps better spread the load across idle CPUs.
>>
>> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>> ---
>>
>> RFC v2:
>>  - fix rebase error in rps locking
> 
> Why only deal with RPS ?
> 
> It seems you propose the sofnet_data backlog be processed by a thread,
> instead than from softirq ?
Right. I originally wanted to mainly improve RPS, but my patch does 
cover backlog in general. I will update the description in the next 
version. Does the approach in general make sense to you?

Thanks,

- Felix

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC v2] net/core: add optional threading for rps backlog processing
  2023-02-17 12:35   ` Felix Fietkau
@ 2023-02-17 12:57     ` Eric Dumazet
  2023-02-17 13:40       ` Felix Fietkau
  0 siblings, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2023-02-17 12:57 UTC (permalink / raw)
  To: Felix Fietkau
  Cc: netdev, David S. Miller, Jakub Kicinski, Paolo Abeni, linux-kernel

On Fri, Feb 17, 2023 at 1:35 PM Felix Fietkau <nbd@nbd.name> wrote:
>
> On 17.02.23 13:23, Eric Dumazet wrote:
> > On Fri, Feb 17, 2023 at 11:06 AM Felix Fietkau <nbd@nbd.name> wrote:
> >>
> >> When dealing with few flows or an imbalance on CPU utilization, static RPS
> >> CPU assignment can be too inflexible. Add support for enabling threaded NAPI
> >> for RPS backlog processing in order to allow the scheduler to better balance
> >> processing. This helps better spread the load across idle CPUs.
> >>
> >> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> >> ---
> >>
> >> RFC v2:
> >>  - fix rebase error in rps locking
> >
> > Why only deal with RPS ?
> >
> > It seems you propose the sofnet_data backlog be processed by a thread,
> > instead than from softirq ?
> Right. I originally wanted to mainly improve RPS, but my patch does
> cover backlog in general. I will update the description in the next
> version. Does the approach in general make sense to you?
>

I do not know, this seems to lack some (perf) numbers, and
descriptions of added max latencies and stuff like that :)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC v2] net/core: add optional threading for rps backlog processing
  2023-02-17 10:06 [RFC v2] net/core: add optional threading for rps backlog processing Felix Fietkau
  2023-02-17 12:23 ` Eric Dumazet
@ 2023-02-17 13:24 ` kernel test robot
  1 sibling, 0 replies; 9+ messages in thread
From: kernel test robot @ 2023-02-17 13:24 UTC (permalink / raw)
  To: Felix Fietkau; +Cc: oe-kbuild-all

Hi Felix,

[FYI, it's a private test report for your RFC patch.]
[auto build test ERROR on net-next/master]
[cannot apply to net/master horms-ipvs/master linus/master v6.2-rc8]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Felix-Fietkau/net-core-add-optional-threading-for-rps-backlog-processing/20230217-180801
patch link:    https://lore.kernel.org/r/20230217100606.1234-1-nbd%40nbd.name
patch subject: [RFC v2] net/core: add optional threading for rps backlog processing
config: powerpc-allmodconfig (https://download.01.org/0day-ci/archive/20230217/202302172108.u0t9b7qI-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/70f8857dc63eaa2125fdb9fa512f5ca42e15e3fb
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Felix-Fietkau/net-core-add-optional-threading-for-rps-backlog-processing/20230217-180801
        git checkout 70f8857dc63eaa2125fdb9fa512f5ca42e15e3fb
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=powerpc olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=powerpc SHELL=/bin/bash net/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202302172108.u0t9b7qI-lkp@intel.com/

All errors (new ones prefixed by >>):

   net/core/dev.c: In function 'rps_set_threaded':
>> net/core/dev.c:6400:17: error: implicit declaration of function 'rps_lock_irq_restore'; did you mean 'rps_unlock_irq_restore'? [-Werror=implicit-function-declaration]
    6400 |                 rps_lock_irq_restore(sd, &flags);
         |                 ^~~~~~~~~~~~~~~~~~~~
         |                 rps_unlock_irq_restore
   cc1: some warnings being treated as errors


vim +6400 net/core/dev.c

  6357	
  6358	#ifdef CONFIG_RPS
  6359	int rps_set_threaded(bool threaded)
  6360	{
  6361		static bool rps_threaded;
  6362		int err = 0;
  6363		int i;
  6364	
  6365		if (rps_threaded == threaded)
  6366			return 0;
  6367	
  6368		for_each_possible_cpu(i) {
  6369			struct softnet_data *sd = &per_cpu(softnet_data, i);
  6370			struct napi_struct *n = &sd->backlog;
  6371	
  6372			n->thread = kthread_run(napi_threaded_poll, n, "napi/rps-%d", i);
  6373			if (IS_ERR(n->thread)) {
  6374				err = PTR_ERR(n->thread);
  6375				pr_err("kthread_run failed with err %d\n", err);
  6376				n->thread = NULL;
  6377				threaded = false;
  6378				break;
  6379			}
  6380	
  6381		}
  6382	
  6383		rps_threaded = threaded;
  6384	
  6385		/* Make sure kthread is created before THREADED bit
  6386		 * is set.
  6387		 */
  6388		smp_mb__before_atomic();
  6389	
  6390		for_each_possible_cpu(i) {
  6391			struct softnet_data *sd = &per_cpu(softnet_data, i);
  6392			struct napi_struct *n = &sd->backlog;
  6393			unsigned long flags;
  6394	
  6395			rps_lock_irqsave(sd, &flags);
  6396			if (threaded)
  6397				n->state |= NAPIF_STATE_THREADED;
  6398			else
  6399				n->state &= ~NAPIF_STATE_THREADED;
> 6400			rps_lock_irq_restore(sd, &flags);
  6401		}
  6402	
  6403		return err;
  6404	}
  6405	#endif
  6406	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC v2] net/core: add optional threading for rps backlog processing
  2023-02-17 12:57     ` Eric Dumazet
@ 2023-02-17 13:40       ` Felix Fietkau
  2023-02-17 14:38         ` Eric Dumazet
  0 siblings, 1 reply; 9+ messages in thread
From: Felix Fietkau @ 2023-02-17 13:40 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, David S. Miller, Jakub Kicinski, Paolo Abeni, linux-kernel

On 17.02.23 13:57, Eric Dumazet wrote:
> On Fri, Feb 17, 2023 at 1:35 PM Felix Fietkau <nbd@nbd.name> wrote:
>>
>> On 17.02.23 13:23, Eric Dumazet wrote:
>> > On Fri, Feb 17, 2023 at 11:06 AM Felix Fietkau <nbd@nbd.name> wrote:
>> >>
>> >> When dealing with few flows or an imbalance on CPU utilization, static RPS
>> >> CPU assignment can be too inflexible. Add support for enabling threaded NAPI
>> >> for RPS backlog processing in order to allow the scheduler to better balance
>> >> processing. This helps better spread the load across idle CPUs.
>> >>
>> >> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>> >> ---
>> >>
>> >> RFC v2:
>> >>  - fix rebase error in rps locking
>> >
>> > Why only deal with RPS ?
>> >
>> > It seems you propose the sofnet_data backlog be processed by a thread,
>> > instead than from softirq ?
>> Right. I originally wanted to mainly improve RPS, but my patch does
>> cover backlog in general. I will update the description in the next
>> version. Does the approach in general make sense to you?
>>
> 
> I do not know, this seems to lack some (perf) numbers, and
> descriptions of added max latencies and stuff like that :)
I just ran some test where I used a MT7621 device (dual-core 800 MHz
MIPS, 4 threads) as a router doing NAT without flow offloading.

Using the flent RRUL test between 2 PCs connected through the router,
I get these results:

rps_threaded=0: (combined CPU idle time around 27%)
                              avg       median       99th %          # data pts
  Ping (ms) ICMP   :        26.08        28.70        54.74 ms              199
  Ping (ms) UDP BE :         1.96        24.12        37.28 ms              200
  Ping (ms) UDP BK :         1.88        15.86        27.30 ms              200
  Ping (ms) UDP EF :         1.98        31.77        54.10 ms              200
  Ping (ms) avg    :         1.94          N/A          N/A ms              200
  TCP download BE  :        69.25        70.20       139.55 Mbits/s         200
  TCP download BK  :        95.15        92.51       163.93 Mbits/s         200
  TCP download CS5 :       133.64       129.10       292.46 Mbits/s         200
  TCP download EF  :       129.86       127.70       254.47 Mbits/s         200
  TCP download avg :       106.97          N/A          N/A Mbits/s         200
  TCP download sum :       427.90          N/A          N/A Mbits/s         200
  TCP totals       :       864.43          N/A          N/A Mbits/s         200
  TCP upload BE    :        97.54        96.67       163.99 Mbits/s         200
  TCP upload BK    :       139.76       143.88       190.37 Mbits/s         200
  TCP upload CS5   :        97.52        94.70       206.60 Mbits/s         200
  TCP upload EF    :       101.71       106.72       147.88 Mbits/s         200
  TCP upload avg   :       109.13          N/A          N/A Mbits/s         200
  TCP upload sum   :       436.53          N/A          N/A Mbits/s         200

rps_threaded=1: (combined CPU idle time around 16%)
                              avg       median       99th %          # data pts
  Ping (ms) ICMP   :        13.70        16.10        27.60 ms              199
  Ping (ms) UDP BE :         2.03        18.35        24.16 ms              200
  Ping (ms) UDP BK :         2.03        18.36        29.13 ms              200
  Ping (ms) UDP EF :         2.36        25.20        41.50 ms              200
  Ping (ms) avg    :         2.14          N/A          N/A ms              200
  TCP download BE  :       118.69       120.94       160.12 Mbits/s         200
  TCP download BK  :       134.67       137.81       177.14 Mbits/s         200
  TCP download CS5 :       126.15       127.81       174.84 Mbits/s         200
  TCP download EF  :        78.36        79.41       143.31 Mbits/s         200
  TCP download avg :       114.47          N/A          N/A Mbits/s         200
  TCP download sum :       457.87          N/A          N/A Mbits/s         200
  TCP totals       :       918.19          N/A          N/A Mbits/s         200
  TCP upload BE    :       112.20       111.55       164.38 Mbits/s         200
  TCP upload BK    :       144.99       139.24       205.12 Mbits/s         200
  TCP upload CS5   :        93.09        95.50       132.39 Mbits/s         200
  TCP upload EF    :       110.04       108.21       207.00 Mbits/s         200
  TCP upload avg   :       115.08          N/A          N/A Mbits/s         200
  TCP upload sum   :       460.32          N/A          N/A Mbits/s         200

As you can see, both throughput and latency improve because load can be
better distributed across CPU cores.

- Felix

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC v2] net/core: add optional threading for rps backlog processing
  2023-02-17 13:40       ` Felix Fietkau
@ 2023-02-17 14:38         ` Eric Dumazet
  2023-02-17 15:26           ` Felix Fietkau
  0 siblings, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2023-02-17 14:38 UTC (permalink / raw)
  To: Felix Fietkau
  Cc: netdev, David S. Miller, Jakub Kicinski, Paolo Abeni, linux-kernel

On Fri, Feb 17, 2023 at 2:40 PM Felix Fietkau <nbd@nbd.name> wrote:
>
> On 17.02.23 13:57, Eric Dumazet wrote:
> > On Fri, Feb 17, 2023 at 1:35 PM Felix Fietkau <nbd@nbd.name> wrote:
> >>
> >> On 17.02.23 13:23, Eric Dumazet wrote:
> >> > On Fri, Feb 17, 2023 at 11:06 AM Felix Fietkau <nbd@nbd.name> wrote:
> >> >>
> >> >> When dealing with few flows or an imbalance on CPU utilization, static RPS
> >> >> CPU assignment can be too inflexible. Add support for enabling threaded NAPI
> >> >> for RPS backlog processing in order to allow the scheduler to better balance
> >> >> processing. This helps better spread the load across idle CPUs.
> >> >>
> >> >> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> >> >> ---
> >> >>
> >> >> RFC v2:
> >> >>  - fix rebase error in rps locking
> >> >
> >> > Why only deal with RPS ?
> >> >
> >> > It seems you propose the sofnet_data backlog be processed by a thread,
> >> > instead than from softirq ?
> >> Right. I originally wanted to mainly improve RPS, but my patch does
> >> cover backlog in general. I will update the description in the next
> >> version. Does the approach in general make sense to you?
> >>
> >
> > I do not know, this seems to lack some (perf) numbers, and
> > descriptions of added max latencies and stuff like that :)
> I just ran some test where I used a MT7621 device (dual-core 800 MHz
> MIPS, 4 threads) as a router doing NAT without flow offloading.
>
> Using the flent RRUL test between 2 PCs connected through the router,
> I get these results:
>
> rps_threaded=0: (combined CPU idle time around 27%)
>                               avg       median       99th %          # data pts
>   Ping (ms) ICMP   :        26.08        28.70        54.74 ms              199
>   Ping (ms) UDP BE :         1.96        24.12        37.28 ms              200
>   Ping (ms) UDP BK :         1.88        15.86        27.30 ms              200
>   Ping (ms) UDP EF :         1.98        31.77        54.10 ms              200
>   Ping (ms) avg    :         1.94          N/A          N/A ms              200
>   TCP download BE  :        69.25        70.20       139.55 Mbits/s         200
>   TCP download BK  :        95.15        92.51       163.93 Mbits/s         200
>   TCP download CS5 :       133.64       129.10       292.46 Mbits/s         200
>   TCP download EF  :       129.86       127.70       254.47 Mbits/s         200
>   TCP download avg :       106.97          N/A          N/A Mbits/s         200
>   TCP download sum :       427.90          N/A          N/A Mbits/s         200
>   TCP totals       :       864.43          N/A          N/A Mbits/s         200
>   TCP upload BE    :        97.54        96.67       163.99 Mbits/s         200
>   TCP upload BK    :       139.76       143.88       190.37 Mbits/s         200
>   TCP upload CS5   :        97.52        94.70       206.60 Mbits/s         200
>   TCP upload EF    :       101.71       106.72       147.88 Mbits/s         200
>   TCP upload avg   :       109.13          N/A          N/A Mbits/s         200
>   TCP upload sum   :       436.53          N/A          N/A Mbits/s         200
>
> rps_threaded=1: (combined CPU idle time around 16%)
>                               avg       median       99th %          # data pts
>   Ping (ms) ICMP   :        13.70        16.10        27.60 ms              199
>   Ping (ms) UDP BE :         2.03        18.35        24.16 ms              200
>   Ping (ms) UDP BK :         2.03        18.36        29.13 ms              200
>   Ping (ms) UDP EF :         2.36        25.20        41.50 ms              200
>   Ping (ms) avg    :         2.14          N/A          N/A ms              200
>   TCP download BE  :       118.69       120.94       160.12 Mbits/s         200
>   TCP download BK  :       134.67       137.81       177.14 Mbits/s         200
>   TCP download CS5 :       126.15       127.81       174.84 Mbits/s         200
>   TCP download EF  :        78.36        79.41       143.31 Mbits/s         200
>   TCP download avg :       114.47          N/A          N/A Mbits/s         200
>   TCP download sum :       457.87          N/A          N/A Mbits/s         200
>   TCP totals       :       918.19          N/A          N/A Mbits/s         200
>   TCP upload BE    :       112.20       111.55       164.38 Mbits/s         200
>   TCP upload BK    :       144.99       139.24       205.12 Mbits/s         200
>   TCP upload CS5   :        93.09        95.50       132.39 Mbits/s         200
>   TCP upload EF    :       110.04       108.21       207.00 Mbits/s         200
>   TCP upload avg   :       115.08          N/A          N/A Mbits/s         200
>   TCP upload sum   :       460.32          N/A          N/A Mbits/s         200
>
> As you can see, both throughput and latency improve because load can be
> better distributed across CPU cores.
>

What happens if user threads are competing with your kthreads ?

It seems you are adding another variant of ksoftirqd.

More threads might look better in some cases, if we accept to be at
the mercy of process scheduling decisions.

NUMA affinities matter, I do not see how you are dealing with this.

Your patch assumes all cpus can participate in network processing,
but rps is fine grained:
Boxes with 128 or 256 cpus usually have different rps_mask per receive
queue, with 2 or 4 bits set in the per-rx-queue rps_mask.

Then, process_backlog() has been designed to run only from the cpu
tied to the per-cpu data (softnet_data)
There are multiple comments about this assumption, and various things
that would need to be changed
(eg sd_has_rps_ipi_waiting() would be wrong in its current implementation)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC v2] net/core: add optional threading for rps backlog processing
  2023-02-17 14:38         ` Eric Dumazet
@ 2023-02-17 15:26           ` Felix Fietkau
  2023-02-17 15:54             ` Eric Dumazet
  0 siblings, 1 reply; 9+ messages in thread
From: Felix Fietkau @ 2023-02-17 15:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, David S. Miller, Jakub Kicinski, Paolo Abeni, linux-kernel

On 17.02.23 15:38, Eric Dumazet wrote:
> On Fri, Feb 17, 2023 at 2:40 PM Felix Fietkau <nbd@nbd.name> wrote:
>>
>> On 17.02.23 13:57, Eric Dumazet wrote:
>> > On Fri, Feb 17, 2023 at 1:35 PM Felix Fietkau <nbd@nbd.name> wrote:
>> >>
>> >> On 17.02.23 13:23, Eric Dumazet wrote:
>> >> > On Fri, Feb 17, 2023 at 11:06 AM Felix Fietkau <nbd@nbd.name> wrote:
>> >> >>
>> >> >> When dealing with few flows or an imbalance on CPU utilization, static RPS
>> >> >> CPU assignment can be too inflexible. Add support for enabling threaded NAPI
>> >> >> for RPS backlog processing in order to allow the scheduler to better balance
>> >> >> processing. This helps better spread the load across idle CPUs.
>> >> >>
>> >> >> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>> >> >> ---
>> >> >>
>> >> >> RFC v2:
>> >> >>  - fix rebase error in rps locking
>> >> >
>> >> > Why only deal with RPS ?
>> >> >
>> >> > It seems you propose the sofnet_data backlog be processed by a thread,
>> >> > instead than from softirq ?
>> >> Right. I originally wanted to mainly improve RPS, but my patch does
>> >> cover backlog in general. I will update the description in the next
>> >> version. Does the approach in general make sense to you?
>> >>
>> >
>> > I do not know, this seems to lack some (perf) numbers, and
>> > descriptions of added max latencies and stuff like that :)
>> I just ran some test where I used a MT7621 device (dual-core 800 MHz
>> MIPS, 4 threads) as a router doing NAT without flow offloading.
>>
>> Using the flent RRUL test between 2 PCs connected through the router,
>> I get these results:
>>
>> rps_threaded=0: (combined CPU idle time around 27%)
>>                               avg       median       99th %          # data pts
>>   Ping (ms) ICMP   :        26.08        28.70        54.74 ms              199
>>   Ping (ms) UDP BE :         1.96        24.12        37.28 ms              200
>>   Ping (ms) UDP BK :         1.88        15.86        27.30 ms              200
>>   Ping (ms) UDP EF :         1.98        31.77        54.10 ms              200
>>   Ping (ms) avg    :         1.94          N/A          N/A ms              200
>>   TCP download BE  :        69.25        70.20       139.55 Mbits/s         200
>>   TCP download BK  :        95.15        92.51       163.93 Mbits/s         200
>>   TCP download CS5 :       133.64       129.10       292.46 Mbits/s         200
>>   TCP download EF  :       129.86       127.70       254.47 Mbits/s         200
>>   TCP download avg :       106.97          N/A          N/A Mbits/s         200
>>   TCP download sum :       427.90          N/A          N/A Mbits/s         200
>>   TCP totals       :       864.43          N/A          N/A Mbits/s         200
>>   TCP upload BE    :        97.54        96.67       163.99 Mbits/s         200
>>   TCP upload BK    :       139.76       143.88       190.37 Mbits/s         200
>>   TCP upload CS5   :        97.52        94.70       206.60 Mbits/s         200
>>   TCP upload EF    :       101.71       106.72       147.88 Mbits/s         200
>>   TCP upload avg   :       109.13          N/A          N/A Mbits/s         200
>>   TCP upload sum   :       436.53          N/A          N/A Mbits/s         200
>>
>> rps_threaded=1: (combined CPU idle time around 16%)
>>                               avg       median       99th %          # data pts
>>   Ping (ms) ICMP   :        13.70        16.10        27.60 ms              199
>>   Ping (ms) UDP BE :         2.03        18.35        24.16 ms              200
>>   Ping (ms) UDP BK :         2.03        18.36        29.13 ms              200
>>   Ping (ms) UDP EF :         2.36        25.20        41.50 ms              200
>>   Ping (ms) avg    :         2.14          N/A          N/A ms              200
>>   TCP download BE  :       118.69       120.94       160.12 Mbits/s         200
>>   TCP download BK  :       134.67       137.81       177.14 Mbits/s         200
>>   TCP download CS5 :       126.15       127.81       174.84 Mbits/s         200
>>   TCP download EF  :        78.36        79.41       143.31 Mbits/s         200
>>   TCP download avg :       114.47          N/A          N/A Mbits/s         200
>>   TCP download sum :       457.87          N/A          N/A Mbits/s         200
>>   TCP totals       :       918.19          N/A          N/A Mbits/s         200
>>   TCP upload BE    :       112.20       111.55       164.38 Mbits/s         200
>>   TCP upload BK    :       144.99       139.24       205.12 Mbits/s         200
>>   TCP upload CS5   :        93.09        95.50       132.39 Mbits/s         200
>>   TCP upload EF    :       110.04       108.21       207.00 Mbits/s         200
>>   TCP upload avg   :       115.08          N/A          N/A Mbits/s         200
>>   TCP upload sum   :       460.32          N/A          N/A Mbits/s         200
>>
>> As you can see, both throughput and latency improve because load can be
>> better distributed across CPU cores.
>>
> 
> What happens if user threads are competing with your kthreads ?
> 
> It seems you are adding another variant of ksoftirqd.
> 
> More threads might look better in some cases, if we accept to be at
> the mercy of process scheduling decisions.
> 
> NUMA affinities matter, I do not see how you are dealing with this.
> 
> Your patch assumes all cpus can participate in network processing,
> but rps is fine grained:
> Boxes with 128 or 256 cpus usually have different rps_mask per receive
> queue, with 2 or 4 bits set in the per-rx-queue rps_mask.
I'm assuming that in cases where this matters, user space can set the 
affinities and priority of the rps threads. The number in the rps-%d 
process name matches the bit number for rps_mask, so it can control 
things in a more fine grained way.

Would you prefer having support for selectively enabling threading on 
individual backlogs with a mask?

> Then, process_backlog() has been designed to run only from the cpu
> tied to the per-cpu data (softnet_data)
> There are multiple comments about this assumption, and various things
> that would need to be changed
> (eg sd_has_rps_ipi_waiting() would be wrong in its current implementation)
That's why I added the NAPI_STATE_THREADED check in napi_schedule_rps, 
so that sd_has_rps_ipi_waiting would always return false.
Or are you worried about a race when enabling threading?

- Felix

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC v2] net/core: add optional threading for rps backlog processing
  2023-02-17 15:26           ` Felix Fietkau
@ 2023-02-17 15:54             ` Eric Dumazet
  0 siblings, 0 replies; 9+ messages in thread
From: Eric Dumazet @ 2023-02-17 15:54 UTC (permalink / raw)
  To: Felix Fietkau
  Cc: netdev, David S. Miller, Jakub Kicinski, Paolo Abeni, linux-kernel

On Fri, Feb 17, 2023 at 4:26 PM Felix Fietkau <nbd@nbd.name> wrote:

>
> > Then, process_backlog() has been designed to run only from the cpu
> > tied to the per-cpu data (softnet_data)
> > There are multiple comments about this assumption, and various things
> > that would need to be changed
> > (eg sd_has_rps_ipi_waiting() would be wrong in its current implementation)
> That's why I added the NAPI_STATE_THREADED check in napi_schedule_rps,
> so that sd_has_rps_ipi_waiting would always return false.
> Or are you worried about a race when enabling threading?
>

Please look at all uses of sd->process_queue, without locking. They do
not care about NAPI_STATE_THREADED

flush_backlog() is one instance, but process_backlog() is also using
__skb_dequeue(&sd->process_queue)

I suspect the following patch would work today, and would show
process_queue lock is not used.

diff --git a/net/core/dev.c b/net/core/dev.c
index 5687b528d4c18ef2960edb6bf3161bbad666aece..bed540b417a1b4cd3e384611a4681b8e2a43fd30
100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -11396,7 +11396,7 @@ static int __init net_dev_init(void)
                INIT_WORK(flush, flush_backlog);

                skb_queue_head_init(&sd->input_pkt_queue);
-               skb_queue_head_init(&sd->process_queue);
+               __skb_queue_head_init(&sd->process_queue);
 #ifdef CONFIG_XFRM_OFFLOAD
                skb_queue_head_init(&sd->xfrm_backlog);
 #endif

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-02-17 15:54 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-17 10:06 [RFC v2] net/core: add optional threading for rps backlog processing Felix Fietkau
2023-02-17 12:23 ` Eric Dumazet
2023-02-17 12:35   ` Felix Fietkau
2023-02-17 12:57     ` Eric Dumazet
2023-02-17 13:40       ` Felix Fietkau
2023-02-17 14:38         ` Eric Dumazet
2023-02-17 15:26           ` Felix Fietkau
2023-02-17 15:54             ` Eric Dumazet
2023-02-17 13:24 ` kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.