All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v1] net: dev_weight: TX/RX orthogonality
@ 2016-12-26  9:49 Matthias Tafelmeier
  2016-12-26 15:52 ` David Miller
  0 siblings, 1 reply; 17+ messages in thread
From: Matthias Tafelmeier @ 2016-12-26  9:49 UTC (permalink / raw)
  To: netdev; +Cc: hagen, fw, edumazet, daniel

Often, introducing side effects on packet processing in the other half
of the stack by adjusting one of TX/RX via sysctl is not desirable.
There are cases of demand for asymmetric, orthogonal configurability.

This holds true especially for nodes where RPS for RFS usage on top is
configured and which therefore use the 'old dev_weight'. This is quite a
common base configuration setup nowadays, even with NICs of superior processing
support (e.g. aRFS).

A good example use case is nodes acting as NoSQL databases with a
large number of tiny requests and fewer but larger packets as responses.
It is affordable to have a large budget and RX dev_weight for the
requests. But as a side effect, having this large a number of packets
processed on TX in one run can overwhelm drivers.

This patch therefore introduces an independent configurability via procfs to
userland.
---
 include/linux/netdevice.h  |  3 ++-
 net/core/dev.c             |  7 ++++---
 net/core/sysctl_net_core.c | 12 ++++++++++--
 net/sched/sch_generic.c    |  2 +-
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f742..3616b35 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3794,7 +3794,8 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
-extern int		weight_p;
+extern int		weight_p_rx;
+extern int		weight_p_tx;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b..fc9e506 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3427,7 +3427,8 @@ EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
-int weight_p __read_mostly = 64;            /* old backlog weight */
+int weight_p_rx __read_mostly = 64;            /* old backlog weight */
+int weight_p_tx __read_mostly = 64;
 
 /* Called with irq disabled */
 static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4834,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		net_rps_action_and_irq_enable(sd);
 	}
 
-	napi->weight = weight_p;
+	napi->weight = weight_p_rx;
 	while (again) {
 		struct sk_buff *skb;
 
@@ -8377,7 +8378,7 @@ static int __init net_dev_init(void)
 #endif
 
 		sd->backlog.poll = process_backlog;
-		sd->backlog.weight = weight_p;
+		sd->backlog.weight = weight_p_rx;
 	}
 
 	dev_boot_phase = 0;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e40..7eaa33a 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -269,13 +269,21 @@ static struct ctl_table net_core_table[] = {
 		.extra1		= &min_rcvbuf,
 	},
 	{
-		.procname	= "dev_weight",
-		.data		= &weight_p,
+		.procname	= "dev_weight_rx",
+		.data		= &weight_p_rx,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
 	{
+		.procname	= "dev_weight_tx",
+		.data		= &weight_p_tx,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+
+	{
 		.procname	= "netdev_max_backlog",
 		.data		= &netdev_max_backlog,
 		.maxlen		= sizeof(int),
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6eb9c8e..8457e8c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 
 void __qdisc_run(struct Qdisc *q)
 {
-	int quota = weight_p;
+	int quota = weight_p_tx;
 	int packets;
 
 	while (qdisc_restart(q, &packets)) {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v1] net: dev_weight: TX/RX orthogonality
  2016-12-26  9:49 [PATCH v1] net: dev_weight: TX/RX orthogonality Matthias Tafelmeier
@ 2016-12-26 15:52 ` David Miller
       [not found]   ` <ae0712c3-61c6-432e-78d9-665d0c291c9f@gmx.net>
  0 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2016-12-26 15:52 UTC (permalink / raw)
  To: matthias.tafelmeier; +Cc: netdev, hagen, fw, edumazet, daniel

From: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Date: Mon, 26 Dec 2016 10:49:23 +0100

> @@ -269,13 +269,21 @@ static struct ctl_table net_core_table[] = {
>  		.extra1		= &min_rcvbuf,
>  	},
>  	{
> -		.procname	= "dev_weight",
> -		.data		= &weight_p,
> +		.procname	= "dev_weight_rx",
> +		.data		= &weight_p_rx,
 ...
>  	{
> +		.procname	= "dev_weight_tx",

Sysctls are user visible APIs.  You cannot change them without
breaking userspace.  You particularly cannot change the name of
the sysctl.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v1] net: dev_weight: TX/RX orthogonality,Re: [PATCH v1] net: dev_weight: TX/RX orthogonality
       [not found]   ` <ae0712c3-61c6-432e-78d9-665d0c291c9f@gmx.net>
@ 2016-12-26 16:58     ` David Miller
  2016-12-27  8:25       ` [PATCH] " Matthias Tafelmeier
  0 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2016-12-26 16:58 UTC (permalink / raw)
  To: matthias.tafelmeier; +Cc: netdev, hagen, fw, edumazet, daniel

From: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Date: Mon, 26 Dec 2016 17:43:08 +0100

> 
>> From: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
>> Date: Mon, 26 Dec 2016 10:49:23 +0100
>>
>>> @@ -269,13 +269,21 @@ static struct ctl_table net_core_table[] = {
>>>  		.extra1		= &min_rcvbuf,
>>>  	},
>>>  	{
>>> -		.procname	= "dev_weight",
>>> -		.data		= &weight_p,
>>> +		.procname	= "dev_weight_rx",
>>> +		.data		= &weight_p_rx,
>>  ...
>>>  	{
>>> +		.procname	= "dev_weight_tx",
>> Sysctls are user visible APIs.  You cannot change them without
>> breaking userspace.  You particularly cannot change the name of
>> the sysctl.
> 
> What about leaving *dev_weight* in place for TX side as is and newly
> introducing a sysctl param
> *dev_weight_rx*. Though, am open to a better naming for the latter.

This changes behavior for existing users, you cannot do this.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] net: dev_weight: TX/RX orthogonality
  2016-12-26 16:58     ` [PATCH v1] net: dev_weight: TX/RX orthogonality,Re: " David Miller
@ 2016-12-27  8:25       ` Matthias Tafelmeier
  2016-12-27 16:47         ` Marcelo Ricardo Leitner
  0 siblings, 1 reply; 17+ messages in thread
From: Matthias Tafelmeier @ 2016-12-27  8:25 UTC (permalink / raw)
  To: netdev; +Cc: hagen, fw, edumazet, daniel

Often, introducing side effects on packet processing in the other half
of the stack by adjusting one of TX/RX via sysctl is not desirable.
There are cases of demand for asymmetric, orthogonal configurability.

This holds true especially for nodes where RPS for RFS usage on top is
configured and which therefore use the 'old dev_weight'. This is quite a
common base configuration setup nowadays, even with NICs of superior processing
support (e.g. aRFS).

A good example use case is nodes acting as NoSQL databases with a
large number of tiny requests and fewer but larger packets as responses.
It is affordable to have a large budget and RX dev_weight for the
requests. But as a side effect, having this large a number of packets
processed on TX in one run can overwhelm drivers.

This patch therefore introduces an independent configurability via sysctl to
userland.
---
 include/linux/netdevice.h  |  2 ++
 net/core/dev.c             |  4 +++-
 net/core/sysctl_net_core.c | 14 ++++++++++++++
 net/sched/sch_generic.c    |  2 +-
 4 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f742..bb331e0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3795,6 +3795,8 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
 extern int		weight_p;
+extern int		dev_w_rx_bias;
+extern int		dev_w_tx_bias;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b..0dcbd28 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3428,6 +3428,8 @@ EXPORT_SYMBOL(netdev_max_backlog);
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
 int weight_p __read_mostly = 64;            /* old backlog weight */
+int dev_w_rx_bias __read_mostly = 1;            /* bias for backlog weight */
+int dev_w_tx_bias __read_mostly = 1;            /* bias for output_queue quota */
 
 /* Called with irq disabled */
 static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4835,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		net_rps_action_and_irq_enable(sd);
 	}
 
-	napi->weight = weight_p;
+	napi->weight = weight_p * dev_w_rx_bias;
 	while (again) {
 		struct sk_buff *skb;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e40..a2ab149 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -276,6 +276,20 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
+		.procname	= "dev_w_rx_bias",
+		.data		= &dev_w_rx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "dev_w_tx_bias",
+		.data		= &dev_w_tx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
 		.procname	= "netdev_max_backlog",
 		.data		= &netdev_max_backlog,
 		.maxlen		= sizeof(int),
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6eb9c8e..4c07780 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 
 void __qdisc_run(struct Qdisc *q)
 {
-	int quota = weight_p;
+	int quota = weight_p * dev_w_tx_bias;
 	int packets;
 
 	while (qdisc_restart(q, &packets)) {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: dev_weight: TX/RX orthogonality
  2016-12-27  8:25       ` [PATCH] " Matthias Tafelmeier
@ 2016-12-27 16:47         ` Marcelo Ricardo Leitner
  2016-12-27 17:29           ` Matthias Tafelmeier
  2016-12-28  9:42           ` [PATCH v3] " Matthias Tafelmeier
  0 siblings, 2 replies; 17+ messages in thread
From: Marcelo Ricardo Leitner @ 2016-12-27 16:47 UTC (permalink / raw)
  To: Matthias Tafelmeier; +Cc: netdev, hagen, fw, edumazet, daniel

On Tue, Dec 27, 2016 at 09:25:47AM +0100, Matthias Tafelmeier wrote:
> Oftenly, introducing side effects on packet processing on the other half
> of the stack by adjusting one of TX/RX via sysctl is not desirable.
> There are cases of demand for asymmetric, orthogonal configurability.
> 
> This holds true especially for nodes where RPS for RFS usage on top is
> configured and therefore use the 'old dev_weight'. This is quite a
> common base configuration setup nowadays, even with NICs of superior processing
> support (e.g. aRFS).
> 
> A good example use case are nodes acting as noSQL data bases with a
> large number of tiny requests and rather fewer but large packets as responses.
> It's affordable to have large budget and rx dev_weights for the
> requests. But as a side effect having this large a number on TX
> processed in one run can overwhelm drivers.
> 
> This patch therefore introduces an independent configurability via sysctl to
> userland.
> ---
>  include/linux/netdevice.h  |  2 ++
>  net/core/dev.c             |  4 +++-
>  net/core/sysctl_net_core.c | 14 ++++++++++++++
>  net/sched/sch_generic.c    |  2 +-
>  4 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 994f742..bb331e0 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -3795,6 +3795,8 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
>  extern int		netdev_max_backlog;
>  extern int		netdev_tstamp_prequeue;
>  extern int		weight_p;
> +extern int		dev_w_rx_bias;
> +extern int		dev_w_tx_bias;
>  
>  bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
>  struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 8db5a0b..0dcbd28 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3428,6 +3428,8 @@ EXPORT_SYMBOL(netdev_max_backlog);
>  int netdev_tstamp_prequeue __read_mostly = 1;
>  int netdev_budget __read_mostly = 300;
>  int weight_p __read_mostly = 64;            /* old backlog weight */
> +int dev_w_rx_bias __read_mostly = 1;            /* bias for backlog weight */
> +int dev_w_tx_bias __read_mostly = 1;            /* bias for output_queue quota */
>  
>  /* Called with irq disabled */
>  static inline void ____napi_schedule(struct softnet_data *sd,
> @@ -4833,7 +4835,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
>  		net_rps_action_and_irq_enable(sd);
>  	}
>  
> -	napi->weight = weight_p;
> +	napi->weight = weight_p * dev_w_rx_bias;
>  	while (again) {
>  		struct sk_buff *skb;
>  
> diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
> index 2a46e40..a2ab149 100644
> --- a/net/core/sysctl_net_core.c
> +++ b/net/core/sysctl_net_core.c
> @@ -276,6 +276,20 @@ static struct ctl_table net_core_table[] = {
>  		.proc_handler	= proc_dointvec
>  	},
>  	{
> +		.procname	= "dev_w_rx_bias",
> +		.data		= &dev_w_rx_bias,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec
> +	},
> +	{
> +		.procname	= "dev_w_tx_bias",
> +		.data		= &dev_w_tx_bias,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec
> +	},
> +	{

Please describe these at Documentation/sysctl/net.txt, probably right
after dev_weight. 

I'm not sure about the abbreviation, maybe it would be better the longer
name as it doesn't block tab completion.
dev_weight_tx_bias
dev_weight_rx_bias
dev_weight

>  		.procname	= "netdev_max_backlog",
>  		.data		= &netdev_max_backlog,
>  		.maxlen		= sizeof(int),
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 6eb9c8e..4c07780 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
>  
>  void __qdisc_run(struct Qdisc *q)
>  {
> -	int quota = weight_p;
> +	int quota = weight_p * dev_w_tx_bias;
>  	int packets;
>  
>  	while (qdisc_restart(q, &packets)) {
> -- 
> 2.7.4
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] net: dev_weight: TX/RX orthogonality
  2016-12-27 16:47         ` Marcelo Ricardo Leitner
@ 2016-12-27 17:29           ` Matthias Tafelmeier
  2016-12-28  9:42           ` [PATCH v3] " Matthias Tafelmeier
  1 sibling, 0 replies; 17+ messages in thread
From: Matthias Tafelmeier @ 2016-12-27 17:29 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner; +Cc: netdev, hagen, fw, edumazet, daniel


[-- Attachment #1.1.1: Type: text/plain, Size: 4028 bytes --]

On 12/27/2016 05:47 PM, Marcelo Ricardo Leitner wrote:
> On Tue, Dec 27, 2016 at 09:25:47AM +0100, Matthias Tafelmeier wrote:
>> Oftenly, introducing side effects on packet processing on the other half
>> of the stack by adjusting one of TX/RX via sysctl is not desirable.
>> There are cases of demand for asymmetric, orthogonal configurability.
>>
>> This holds true especially for nodes where RPS for RFS usage on top is
>> configured and therefore use the 'old dev_weight'. This is quite a
>> common base configuration setup nowadays, even with NICs of superior processing
>> support (e.g. aRFS).
>>
>> A good example use case are nodes acting as noSQL data bases with a
>> large number of tiny requests and rather fewer but large packets as responses.
>> It's affordable to have large budget and rx dev_weights for the
>> requests. But as a side effect having this large a number on TX
>> processed in one run can overwhelm drivers.
>>
>> This patch therefore introduces an independent configurability via sysctl to
>> userland.
>> ---
>>  include/linux/netdevice.h  |  2 ++
>>  net/core/dev.c             |  4 +++-
>>  net/core/sysctl_net_core.c | 14 ++++++++++++++
>>  net/sched/sch_generic.c    |  2 +-
>>  4 files changed, 20 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 994f742..bb331e0 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -3795,6 +3795,8 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
>>  extern int		netdev_max_backlog;
>>  extern int		netdev_tstamp_prequeue;
>>  extern int		weight_p;
>> +extern int		dev_w_rx_bias;
>> +extern int		dev_w_tx_bias;
>>  
>>  bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
>>  struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 8db5a0b..0dcbd28 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -3428,6 +3428,8 @@ EXPORT_SYMBOL(netdev_max_backlog);
>>  int netdev_tstamp_prequeue __read_mostly = 1;
>>  int netdev_budget __read_mostly = 300;
>>  int weight_p __read_mostly = 64;            /* old backlog weight */
>> +int dev_w_rx_bias __read_mostly = 1;            /* bias for backlog weight */
>> +int dev_w_tx_bias __read_mostly = 1;            /* bias for output_queue quota */
>>  
>>  /* Called with irq disabled */
>>  static inline void ____napi_schedule(struct softnet_data *sd,
>> @@ -4833,7 +4835,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
>>  		net_rps_action_and_irq_enable(sd);
>>  	}
>>  
>> -	napi->weight = weight_p;
>> +	napi->weight = weight_p * dev_w_rx_bias;
>>  	while (again) {
>>  		struct sk_buff *skb;
>>  
>> diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
>> index 2a46e40..a2ab149 100644
>> --- a/net/core/sysctl_net_core.c
>> +++ b/net/core/sysctl_net_core.c
>> @@ -276,6 +276,20 @@ static struct ctl_table net_core_table[] = {
>>  		.proc_handler	= proc_dointvec
>>  	},
>>  	{
>> +		.procname	= "dev_w_rx_bias",
>> +		.data		= &dev_w_rx_bias,
>> +		.maxlen		= sizeof(int),
>> +		.mode		= 0644,
>> +		.proc_handler	= proc_dointvec
>> +	},
>> +	{
>> +		.procname	= "dev_w_tx_bias",
>> +		.data		= &dev_w_tx_bias,
>> +		.maxlen		= sizeof(int),
>> +		.mode		= 0644,
>> +		.proc_handler	= proc_dointvec
>> +	},
>> +	{
> Please describe these at Documentation/sysctl/net.txt, probably right
> after dev_weight. 
Sure, I'll do that.

> I'm not sure about the abbreviation, maybe it would be better the longer
> name as it doesn't block tab completion.
> dev_weight_tx_bias
> dev_weight_rx_bias
> dev_weight
>
Do not find the abbreviation/naming satisfactory, either. Rather saw
them as a draft. Could think of dev_weight distant naming:

ns_rps_cpu_rx_bias
ns_cpu_tx_bias

Though, makes me concerned about association etc. Maybe, that's nit
picking.



[-- Attachment #1.1.2: 0x8ADF343B.asc --]
[-- Type: application/pgp-keys, Size: 4806 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 538 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH v3] net: dev_weight: TX/RX orthogonality
  2016-12-27 16:47         ` Marcelo Ricardo Leitner
  2016-12-27 17:29           ` Matthias Tafelmeier
@ 2016-12-28  9:42           ` Matthias Tafelmeier
  2016-12-28 19:17             ` David Miller
  1 sibling, 1 reply; 17+ messages in thread
From: Matthias Tafelmeier @ 2016-12-28  9:42 UTC (permalink / raw)
  To: netdev; +Cc: hagen, fw, edumazet, daniel

Often, introducing side effects on packet processing in the other half
of the stack by adjusting one of TX/RX via sysctl is not desirable.
There are cases of demand for asymmetric, orthogonal configurability.

This holds true especially for nodes where RPS for RFS usage on top is
configured and which therefore use the 'old dev_weight'. This is quite a
common base configuration setup nowadays, even with NICs of superior processing
support (e.g. aRFS).

A good example use case is nodes acting as NoSQL databases with a
large number of tiny requests and fewer but larger packets as responses.
It is affordable to have a large budget and RX dev_weight for the
requests. But as a side effect, having this large a number of packets
processed on TX in one run can overwhelm drivers.

This patch therefore introduces an independent configurability via sysctl to
userland.
---
 Documentation/sysctl/net.txt | 21 +++++++++++++++++++++
 include/linux/netdevice.h    |  2 ++
 net/core/dev.c               |  4 +++-
 net/core/sysctl_net_core.c   | 14 ++++++++++++++
 net/sched/sch_generic.c      |  2 +-
 5 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index f0480f7..53cef32 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -61,6 +61,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+dev_weight_rx_bias
+--------------
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function 
+of the driver for the per softirq cycle netdev_budget. This parameter influences 
+the proportion of the configured netdev_budget that is spent on RPS based packet 
+processing during RX softirq cycles. It is further meant for making current 
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based 
+on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias).
+Default: 1
+
+dev_weight_tx_bias
+--------------
+
+Scales the maximum number of packets that can be processed during a TX softirq cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric 
+net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog.
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+Default: 1
+
 default_qdisc
 --------------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f742..46b4b66 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3795,6 +3795,8 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
 extern int		weight_p;
+extern int		dev_weight_rx_bias;
+extern int		dev_weight_tx_bias;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b..7ce1736 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3428,6 +3428,8 @@ EXPORT_SYMBOL(netdev_max_backlog);
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
 int weight_p __read_mostly = 64;            /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1;            /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1;            /* bias for output_queue quota */
 
 /* Called with irq disabled */
 static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4835,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		net_rps_action_and_irq_enable(sd);
 	}
 
-	napi->weight = weight_p;
+	napi->weight = weight_p * dev_weight_rx_bias;
 	while (again) {
 		struct sk_buff *skb;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e40..2197388 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -276,6 +276,20 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
+		.procname	= "dev_weight_rx_bias",
+		.data		= &dev_weight_rx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "dev_weight_tx_bias",
+		.data		= &dev_weight_tx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
 		.procname	= "netdev_max_backlog",
 		.data		= &netdev_max_backlog,
 		.maxlen		= sizeof(int),
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6eb9c8e..19374ef 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 
 void __qdisc_run(struct Qdisc *q)
 {
-	int quota = weight_p;
+	int quota = weight_p * dev_weight_tx_bias;
 	int packets;
 
 	while (qdisc_restart(q, &packets)) {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v3] net: dev_weight: TX/RX orthogonality
  2016-12-28  9:42           ` [PATCH v3] " Matthias Tafelmeier
@ 2016-12-28 19:17             ` David Miller
  2016-12-29  9:58               ` [PATCH v4] " Matthias Tafelmeier
  0 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2016-12-28 19:17 UTC (permalink / raw)
  To: matthias.tafelmeier; +Cc: netdev, hagen, fw, edumazet, daniel

From: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Date: Wed, 28 Dec 2016 10:42:14 +0100

> @@ -3428,6 +3428,8 @@ EXPORT_SYMBOL(netdev_max_backlog);
>  int netdev_tstamp_prequeue __read_mostly = 1;
>  int netdev_budget __read_mostly = 300;
>  int weight_p __read_mostly = 64;            /* old backlog weight */
> +int dev_weight_rx_bias __read_mostly = 1;            /* bias for backlog weight */
> +int dev_weight_tx_bias __read_mostly = 1;            /* bias for output_queue quota */
>  
>  /* Called with irq disabled */
>  static inline void ____napi_schedule(struct softnet_data *sd,
> @@ -4833,7 +4835,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
>  		net_rps_action_and_irq_enable(sd);
>  	}
>  
> -	napi->weight = weight_p;
> +	napi->weight = weight_p * dev_weight_rx_bias;
>  	while (again) {
>  		struct sk_buff *skb;
>  
 ...
> @@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
>  
>  void __qdisc_run(struct Qdisc *q)
>  {
> -	int quota = weight_p;
> +	int quota = weight_p * dev_weight_tx_bias;

Ok, this is a lot better than what you proposed initially.

However, being that this is the fast path for all packet processing,
introducing a multiply here doesn't sit well.

I think there are two possible ways to address this:

1) Make the bias instead be a "shift".

2) Precompute the dev_tx_weight and dev_rx_weight into two variables
   in net/core/dev.c  Install a special proc_dointvec handler for
   "dev_weight" that, upon proc_dointvec() success, updates both
   dev_tx_weight and dev_rx_weight based upon the bias settings.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH v4] net: dev_weight: TX/RX orthogonality
  2016-12-28 19:17             ` David Miller
@ 2016-12-29  9:58               ` Matthias Tafelmeier
  2016-12-29 19:08                 ` David Miller
  0 siblings, 1 reply; 17+ messages in thread
From: Matthias Tafelmeier @ 2016-12-29  9:58 UTC (permalink / raw)
  To: netdev; +Cc: hagen, fw, edumazet, daniel

Often, introducing side effects on packet processing in the other half
of the stack by adjusting one of TX/RX via sysctl is not desirable.
There are cases of demand for asymmetric, orthogonal configurability.

This holds true especially for nodes where RPS for RFS usage on top is
configured and which therefore use the 'old dev_weight'. This is quite a
common base configuration setup nowadays, even with NICs of superior processing
support (e.g. aRFS).

A good example use case is nodes acting as NoSQL databases with a
large number of tiny requests and fewer but larger packets as responses.
It is affordable to have a large budget and RX dev_weight for the
requests. But as a side effect, having this large a number of packets
processed on TX in one run can overwhelm drivers.

This patch therefore introduces an independent configurability via sysctl to
userland.
---
 Documentation/sysctl/net.txt | 21 +++++++++++++++++++++
 include/linux/netdevice.h    |  4 ++++
 net/core/dev.c               |  6 +++++-
 net/core/sysctl_net_core.c   | 31 ++++++++++++++++++++++++++++++-
 net/sched/sch_generic.c      |  2 +-
 5 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index f0480f7..53cef32 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -61,6 +61,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+dev_weight_rx_bias
+--------------
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function
+of the driver for the per softirq cycle netdev_budget. This parameter influences
+the proportion of the configured netdev_budget that is spent on RPS based packet
+processing during RX softirq cycles. It is further meant for making current
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based
+on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias).
+Default: 1
+
+dev_weight_tx_bias
+--------------
+
+Scales the maximum number of packets that can be processed during a TX softirq cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric
+net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog.
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+Default: 1
+
 default_qdisc
 --------------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f742..ecd78b3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3795,6 +3795,10 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
 extern int		weight_p;
+extern int		dev_weight_rx_bias;
+extern int		dev_weight_tx_bias;
+extern int		dev_rx_weight;
+extern int		dev_tx_weight;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b..f2fe98b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3428,6 +3428,10 @@ EXPORT_SYMBOL(netdev_max_backlog);
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
 int weight_p __read_mostly = 64;            /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1;            /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1;            /* bias for output_queue quota */
+int dev_rx_weight __read_mostly = weight_p;
+int dev_tx_weight __read_mostly = weight_p;
 
 /* Called with irq disabled */
 static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4837,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		net_rps_action_and_irq_enable(sd);
 	}
 
-	napi->weight = weight_p;
+	napi->weight = dev_rx_weight;
 	while (again) {
 		struct sk_buff *skb;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e40..698ddd7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write,
 }
 #endif
 
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret != 0)
+		return ret;
+
+	dev_rx_weight = weight_p * dev_weight_rx_bias;
+	dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+	return ret;
+}
+
 static int proc_do_rss_key(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -273,7 +288,21 @@ static struct ctl_table net_core_table[] = {
 		.data		= &weight_p,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_do_dev_weight,
+	},
+	{
+		.procname	= "dev_weight_rx_bias",
+		.data		= &dev_weight_rx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_dev_weight,
+	},
+	{
+		.procname	= "dev_weight_tx_bias",
+		.data		= &dev_weight_tx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_dev_weight,
 	},
 	{
 		.procname	= "netdev_max_backlog",
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6eb9c8e..b052b27 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 
 void __qdisc_run(struct Qdisc *q)
 {
-	int quota = weight_p;
+	int quota = dev_tx_weight;
 	int packets;
 
 	while (qdisc_restart(q, &packets)) {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] net: dev_weight: TX/RX orthogonality
  2016-12-29  9:58               ` [PATCH v4] " Matthias Tafelmeier
@ 2016-12-29 19:08                 ` David Miller
  2016-12-29 19:23                   ` Matthias Tafelmeier
  0 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2016-12-29 19:08 UTC (permalink / raw)
  To: matthias.tafelmeier; +Cc: netdev, hagen, fw, edumazet, daniel

From: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Date: Thu, 29 Dec 2016 10:58:41 +0100

> Oftenly, introducing side effects on packet processing on the other half
> of the stack by adjusting one of TX/RX via sysctl is not desirable.
> There are cases of demand for asymmetric, orthogonal configurability.
> 
> This holds true especially for nodes where RPS for RFS usage on top is
> configured and therefore use the 'old dev_weight'. This is quite a
> common base configuration setup nowadays, even with NICs of superior processing
> support (e.g. aRFS).
> 
> A good example use case are nodes acting as noSQL data bases with a
> large number of tiny requests and rather fewer but large packets as responses.
> It's affordable to have large budget and rx dev_weights for the
> requests. But as a side effect having this large a number on TX
> processed in one run can overwhelm drivers.
> 
> This patch therefore introduces an independent configurability via sysctl to
> userland.

This is missing a signoff.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH v4] net: dev_weight: TX/RX orthogonality
  2016-12-29 19:08                 ` David Miller
@ 2016-12-29 19:23                   ` Matthias Tafelmeier
  2016-12-29 19:44                     ` David Miller
  0 siblings, 1 reply; 17+ messages in thread
From: Matthias Tafelmeier @ 2016-12-29 19:23 UTC (permalink / raw)
  To: netdev; +Cc: hagen, fw, edumazet, daniel

Often, introducing side effects on packet processing on the other half
of the stack by adjusting one of TX/RX via sysctl is not desirable.
There are cases of demand for asymmetric, orthogonal configurability.

This holds true especially for nodes where RPS for RFS usage on top is
configured and therefore use the 'old dev_weight'. This is quite a
common base configuration setup nowadays, even with NICs of superior processing
support (e.g. aRFS).

A good example use case is nodes acting as NoSQL databases with a
large number of tiny requests and rather fewer but large packets as responses.
It's affordable to have large budget and rx dev_weights for the
requests. But as a side effect having this large a number on TX
processed in one run can overwhelm drivers.

This patch therefore introduces an independent configurability via sysctl to
userland.

Signed-off-by: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
---
 Documentation/sysctl/net.txt | 21 +++++++++++++++++++++
 include/linux/netdevice.h    |  4 ++++
 net/core/dev.c               |  6 +++++-
 net/core/sysctl_net_core.c   | 31 ++++++++++++++++++++++++++++++-
 net/sched/sch_generic.c      |  2 +-
 5 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index f0480f7..53cef32 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -61,6 +61,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+dev_weight_rx_bias
+--------------
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function
+of the driver for the per softirq cycle netdev_budget. This parameter influences
+the proportion of the configured netdev_budget that is spent on RPS based packet
+processing during RX softirq cycles. It is further meant for making current
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based
+on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias).
+Default: 1
+
+dev_weight_tx_bias
+--------------
+
+Scales the maximum number of packets that can be processed during a TX softirq cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric
+net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog.
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+Default: 1
+
 default_qdisc
 --------------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f742..ecd78b3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3795,6 +3795,10 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
 extern int		weight_p;
+extern int		dev_weight_rx_bias;
+extern int		dev_weight_tx_bias;
+extern int		dev_rx_weight;
+extern int		dev_tx_weight;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b..f2fe98b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3428,6 +3428,10 @@ EXPORT_SYMBOL(netdev_max_backlog);
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
 int weight_p __read_mostly = 64;            /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1;            /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1;            /* bias for output_queue quota */
+int dev_rx_weight __read_mostly = weight_p;
+int dev_tx_weight __read_mostly = weight_p;
 
 /* Called with irq disabled */
 static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4837,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		net_rps_action_and_irq_enable(sd);
 	}
 
-	napi->weight = weight_p;
+	napi->weight = dev_rx_weight;
 	while (again) {
 		struct sk_buff *skb;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e40..698ddd7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write,
 }
 #endif
 
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret != 0)
+		return ret;
+
+	dev_rx_weight = weight_p * dev_weight_rx_bias;
+	dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+	return ret;
+}
+
 static int proc_do_rss_key(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -273,7 +288,21 @@ static struct ctl_table net_core_table[] = {
 		.data		= &weight_p,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_do_dev_weight,
+	},
+	{
+		.procname	= "dev_weight_rx_bias",
+		.data		= &dev_weight_rx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_dev_weight,
+	},
+	{
+		.procname	= "dev_weight_tx_bias",
+		.data		= &dev_weight_tx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_dev_weight,
 	},
 	{
 		.procname	= "netdev_max_backlog",
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6eb9c8e..b052b27 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 
 void __qdisc_run(struct Qdisc *q)
 {
-	int quota = weight_p;
+	int quota = dev_tx_weight;
 	int packets;
 
 	while (qdisc_restart(q, &packets)) {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] net: dev_weight: TX/RX orthogonality
  2016-12-29 19:23                   ` Matthias Tafelmeier
@ 2016-12-29 19:44                     ` David Miller
  2016-12-29 19:45                       ` David Miller
  0 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2016-12-29 19:44 UTC (permalink / raw)
  To: matthias.tafelmeier; +Cc: netdev, hagen, fw, edumazet, daniel

From: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Date: Thu, 29 Dec 2016 20:23:18 +0100

> Oftenly, introducing side effects on packet processing on the other half
> of the stack by adjusting one of TX/RX via sysctl is not desirable.
> There are cases of demand for asymmetric, orthogonal configurability.
> 
> This holds true especially for nodes where RPS for RFS usage on top is
> configured and therefore use the 'old dev_weight'. This is quite a
> common base configuration setup nowadays, even with NICs of superior processing
> support (e.g. aRFS).
> 
> A good example use case are nodes acting as noSQL data bases with a
> large number of tiny requests and rather fewer but large packets as responses.
> It's affordable to have large budget and rx dev_weights for the
> requests. But as a side effect having this large a number on TX
> processed in one run can overwhelm drivers.
> 
> This patch therefore introduces an independent configurability via sysctl to
> userland.
> 
> Signed-off-by: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>

Applied.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] net: dev_weight: TX/RX orthogonality
  2016-12-29 19:44                     ` David Miller
@ 2016-12-29 19:45                       ` David Miller
  2016-12-29 19:53                         ` Matthias Tafelmeier
  2016-12-29 20:37                         ` [PATCH v5] " Matthias Tafelmeier
  0 siblings, 2 replies; 17+ messages in thread
From: David Miller @ 2016-12-29 19:45 UTC (permalink / raw)
  To: matthias.tafelmeier; +Cc: netdev, hagen, fw, edumazet, daniel


Actually, reverted, you didn't even build test this:

net/core/dev.c:3433:35: error: initializer element is not constant
 int dev_rx_weight __read_mostly = weight_p;
                                   ^~~~~~~~
net/core/dev.c:3434:35: error: initializer element is not constant
 int dev_tx_weight __read_mostly = weight_p;
                                   ^~~~~~~~

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] net: dev_weight: TX/RX orthogonality
  2016-12-29 19:45                       ` David Miller
@ 2016-12-29 19:53                         ` Matthias Tafelmeier
  2016-12-29 20:37                         ` [PATCH v5] " Matthias Tafelmeier
  1 sibling, 0 replies; 17+ messages in thread
From: Matthias Tafelmeier @ 2016-12-29 19:53 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, hagen, fw, edumazet, daniel


[-- Attachment #1.1.1: Type: text/plain, Size: 435 bytes --]


> Actually, reverted, you didn't even build test this:
>
> net/core/dev.c:3433:35: error: initializer element is not constant
>  int dev_rx_weight __read_mostly = weight_p;
>                                    ^~~~~~~~
> net/core/dev.c:3434:35: error: initializer element is not constant
>  int dev_tx_weight __read_mostly = weight_p;
>                                    ^~~~~~~~

Thought I would have ... let me check.


[-- Attachment #1.1.2: 0x8ADF343B.asc --]
[-- Type: application/pgp-keys, Size: 4806 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 538 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH v5] net: dev_weight: TX/RX orthogonality
  2016-12-29 19:45                       ` David Miller
  2016-12-29 19:53                         ` Matthias Tafelmeier
@ 2016-12-29 20:37                         ` Matthias Tafelmeier
  2016-12-30  1:16                           ` David Miller
  1 sibling, 1 reply; 17+ messages in thread
From: Matthias Tafelmeier @ 2016-12-29 20:37 UTC (permalink / raw)
  To: netdev; +Cc: hagen, fw, edumazet, daniel

Often, introducing side effects on packet processing on the other half
of the stack by adjusting one of TX/RX via sysctl is not desirable.
There are cases of demand for asymmetric, orthogonal configurability.

This holds true especially for nodes where RPS for RFS usage on top is
configured and therefore use the 'old dev_weight'. This is quite a
common base configuration setup nowadays, even with NICs of superior processing
support (e.g. aRFS).

A good example use case is nodes acting as NoSQL databases with a
large number of tiny requests and rather fewer but large packets as responses.
It's affordable to have large budget and rx dev_weights for the
requests. But as a side effect having this large a number on TX
processed in one run can overwhelm drivers.

This patch therefore introduces an independent configurability via sysctl to
userland.

Signed-off-by: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
---
 Documentation/sysctl/net.txt | 21 +++++++++++++++++++++
 include/linux/netdevice.h    |  4 ++++
 net/core/dev.c               |  8 ++++++--
 net/core/sysctl_net_core.c   | 31 ++++++++++++++++++++++++++++++-
 net/sched/sch_generic.c      |  2 +-
 5 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index f0480f7..53cef32 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -61,6 +61,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+dev_weight_rx_bias
+--------------
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function
+of the driver for the per softirq cycle netdev_budget. This parameter influences
+the proportion of the configured netdev_budget that is spent on RPS based packet
+processing during RX softirq cycles. It is further meant for making current
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based
+on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias).
+Default: 1
+
+dev_weight_tx_bias
+--------------
+
+Scales the maximum number of packets that can be processed during a TX softirq cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric
+net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog.
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+Default: 1
+
 default_qdisc
 --------------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f742..ecd78b3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3795,6 +3795,10 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
 extern int		weight_p;
+extern int		dev_weight_rx_bias;
+extern int		dev_weight_tx_bias;
+extern int		dev_rx_weight;
+extern int		dev_tx_weight;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b..0d34e1c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3427,7 +3427,11 @@ EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
 int netdev_budget __read_mostly = 300;
-int weight_p __read_mostly = 64;            /* old backlog weight */
+int weight_p __read_mostly = 64;           /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
+int dev_rx_weight __read_mostly = 64;
+int dev_tx_weight __read_mostly = 64;
 
 /* Called with irq disabled */
 static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4837,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		net_rps_action_and_irq_enable(sd);
 	}
 
-	napi->weight = weight_p;
+	napi->weight = dev_rx_weight;
 	while (again) {
 		struct sk_buff *skb;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e40..698ddd7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write,
 }
 #endif
 
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret != 0)
+		return ret;
+
+	dev_rx_weight = weight_p * dev_weight_rx_bias;
+	dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+	return ret;
+}
+
 static int proc_do_rss_key(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -273,7 +288,21 @@ static struct ctl_table net_core_table[] = {
 		.data		= &weight_p,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_do_dev_weight,
+	},
+	{
+		.procname	= "dev_weight_rx_bias",
+		.data		= &dev_weight_rx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_dev_weight,
+	},
+	{
+		.procname	= "dev_weight_tx_bias",
+		.data		= &dev_weight_tx_bias,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_dev_weight,
 	},
 	{
 		.procname	= "netdev_max_backlog",
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6eb9c8e..b052b27 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
 
 void __qdisc_run(struct Qdisc *q)
 {
-	int quota = weight_p;
+	int quota = dev_tx_weight;
 	int packets;
 
 	while (qdisc_restart(q, &packets)) {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v5] net: dev_weight: TX/RX orthogonality
  2016-12-29 20:37                         ` [PATCH v5] " Matthias Tafelmeier
@ 2016-12-30  1:16                           ` David Miller
  2017-02-13 20:22                             ` Matthias Tafelmeier
  0 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2016-12-30  1:16 UTC (permalink / raw)
  To: matthias.tafelmeier; +Cc: netdev, hagen, fw, edumazet, daniel

From: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Date: Thu, 29 Dec 2016 21:37:21 +0100

> Oftenly, introducing side effects on packet processing on the other half
> of the stack by adjusting one of TX/RX via sysctl is not desirable.
> There are cases of demand for asymmetric, orthogonal configurability.
> 
> This holds true especially for nodes where RPS for RFS usage on top is
> configured and therefore use the 'old dev_weight'. This is quite a
> common base configuration setup nowadays, even with NICs of superior processing
> support (e.g. aRFS).
> 
> A good example use case are nodes acting as noSQL data bases with a
> large number of tiny requests and rather fewer but large packets as responses.
> It's affordable to have large budget and rx dev_weights for the
> requests. But as a side effect having this large a number on TX
> processed in one run can overwhelm drivers.
> 
> This patch therefore introduces an independent configurability via sysctl to
> userland.
> 
> Signed-off-by: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>

Much better, applied, thanks.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v5] net: dev_weight: TX/RX orthogonality
  2016-12-30  1:16                           ` David Miller
@ 2017-02-13 20:22                             ` Matthias Tafelmeier
  0 siblings, 0 replies; 17+ messages in thread
From: Matthias Tafelmeier @ 2017-02-13 20:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev


[-- Attachment #1.1.1: Type: text/plain, Size: 1196 bytes --]


>> Oftenly, introducing side effects on packet processing on the other half
>> of the stack by adjusting one of TX/RX via sysctl is not desirable.
>> There are cases of demand for asymmetric, orthogonal configurability.
>>
>> This holds true especially for nodes where RPS for RFS usage on top is
>> configured and therefore use the 'old dev_weight'. This is quite a
>> common base configuration setup nowadays, even with NICs of superior processing
>> support (e.g. aRFS).
>>
>> A good example use case are nodes acting as noSQL data bases with a
>> large number of tiny requests and rather fewer but large packets as responses.
>> It's affordable to have large budget and rx dev_weights for the
>> requests. But as a side effect having this large a number on TX
>> processed in one run can overwhelm drivers.
>>
>> This patch therefore introduces an independent configurability via sysctl to
>> userland.
>>
>> Signed-off-by: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
> Much better, applied, thanks.
>

Excuse me, have you rejected that one in the meantime/afterwards?
Checked in Hartman's and Linus tree and your current net. It's not applied.

Thanks!



[-- Attachment #1.1.2: 0x8ADF343B.asc --]
[-- Type: application/pgp-keys, Size: 4806 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 538 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2017-02-13 20:22 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-26  9:49 [PATCH v1] net: dev_weight: TX/RX orthogonality Matthias Tafelmeier
2016-12-26 15:52 ` David Miller
     [not found]   ` <ae0712c3-61c6-432e-78d9-665d0c291c9f@gmx.net>
2016-12-26 16:58     ` [PATCH v1] net: dev_weight: TX/RX orthogonality,Re: " David Miller
2016-12-27  8:25       ` [PATCH] " Matthias Tafelmeier
2016-12-27 16:47         ` Marcelo Ricardo Leitner
2016-12-27 17:29           ` Matthias Tafelmeier
2016-12-28  9:42           ` [PATCH v3] " Matthias Tafelmeier
2016-12-28 19:17             ` David Miller
2016-12-29  9:58               ` [PATCH v4] " Matthias Tafelmeier
2016-12-29 19:08                 ` David Miller
2016-12-29 19:23                   ` Matthias Tafelmeier
2016-12-29 19:44                     ` David Miller
2016-12-29 19:45                       ` David Miller
2016-12-29 19:53                         ` Matthias Tafelmeier
2016-12-29 20:37                         ` [PATCH v5] " Matthias Tafelmeier
2016-12-30  1:16                           ` David Miller
2017-02-13 20:22                             ` Matthias Tafelmeier

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.