All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] net: add support for threaded NAPI polling
@ 2020-07-29 16:50 Felix Fietkau
  2020-07-29 17:44 ` Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: Felix Fietkau @ 2020-07-29 16:50 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Hillf Danton

For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
poll function does not perform well. Since NAPI poll is bound to the CPU it
was scheduled from, we can easily end up with a few very busy CPUs spending
most of their time in softirq/ksoftirqd and some idle ones.

Introduce threaded NAPI for such drivers based on a workqueue. The API is the
same except for using netif_threaded_napi_add instead of netif_napi_add.

In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
thread.

With threaded NAPI, throughput seems stable and consistent (and higher than
the best results I got without it).

Based on a patch by Hillf Danton

Cc: Hillf Danton <hdanton@sina.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
Changes since RFC v2:
- fix unused but set variable reported by kbuild test robot

Changes since RFC:
- disable softirq around threaded poll functions
- reuse most parts of napi_poll()
- fix re-schedule condition

 include/linux/netdevice.h |  23 ++++++
 net/core/dev.c            | 162 ++++++++++++++++++++++++++------------
 2 files changed, 133 insertions(+), 52 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ac2cd3f49aba..3a39211c7598 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -347,6 +347,7 @@ struct napi_struct {
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	unsigned int		napi_id;
+	struct work_struct	work;
 };
 
 enum {
@@ -357,6 +358,7 @@ enum {
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
 	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+	NAPI_STATE_THREADED,	/* Use threaded NAPI */
 };
 
 enum {
@@ -367,6 +369,7 @@ enum {
 	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
 	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
 	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED),
 };
 
 enum gro_result {
@@ -2315,6 +2318,26 @@ static inline void *netdev_priv(const struct net_device *dev)
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight);
 
+/**
+ *	netif_threaded_napi_add - initialize a NAPI context
+ *	@dev:  network device
+ *	@napi: NAPI context
+ *	@poll: polling function
+ *	@weight: default weight
+ *
+ * This variant of netif_napi_add() should be used from drivers using NAPI
+ * with CPU intensive poll functions.
 + * This will schedule polling from a high priority workqueue.
+ */
+static inline void netif_threaded_napi_add(struct net_device *dev,
+					   struct napi_struct *napi,
+					   int (*poll)(struct napi_struct *, int),
+					   int weight)
+{
+	set_bit(NAPI_STATE_THREADED, &napi->state);
+	netif_napi_add(dev, napi, poll, weight);
+}
+
 /**
  *	netif_tx_napi_add - initialize a NAPI context
  *	@dev:  network device
diff --git a/net/core/dev.c b/net/core/dev.c
index 19f1abc26fcd..11b027f3a2b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -158,6 +158,7 @@ static DEFINE_SPINLOCK(offload_lock);
 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 struct list_head ptype_all __read_mostly;	/* Taps */
 static struct list_head offload_base __read_mostly;
+static struct workqueue_struct *napi_workq __read_mostly;
 
 static int netif_rx_internal(struct sk_buff *skb);
 static int call_netdevice_notifiers_info(unsigned long val,
@@ -6286,6 +6287,11 @@ void __napi_schedule(struct napi_struct *n)
 {
 	unsigned long flags;
 
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+		queue_work(napi_workq, &n->work);
+		return;
+	}
+
 	local_irq_save(flags);
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 	local_irq_restore(flags);
@@ -6333,6 +6339,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
  */
 void __napi_schedule_irqoff(struct napi_struct *n)
 {
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+		queue_work(napi_workq, &n->work);
+		return;
+	}
+
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
@@ -6601,6 +6612,95 @@ static void init_gro_hash(struct napi_struct *napi)
 	napi->gro_bitmask = 0;
 }
 
+static int __napi_poll(struct napi_struct *n, bool *repoll)
+{
+	int work, weight;
+
+	weight = n->weight;
+
+	/* This NAPI_STATE_SCHED test is for avoiding a race
+	 * with netpoll's poll_napi().  Only the entity which
+	 * obtains the lock and sees NAPI_STATE_SCHED set will
+	 * actually make the ->poll() call.  Therefore we avoid
+	 * accidentally calling ->poll() when NAPI is not scheduled.
+	 */
+	work = 0;
+	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+		work = n->poll(n, weight);
+		trace_napi_poll(n, work, weight);
+	}
+
+	if (unlikely(work > weight))
+		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
+			    n->poll, work, weight);
+
+	if (likely(work < weight))
+		return work;
+
+	/* Drivers must not modify the NAPI state if they
+	 * consume the entire weight.  In such cases this code
+	 * still "owns" the NAPI instance and therefore can
+	 * move the instance around on the list at-will.
+	 */
+	if (unlikely(napi_disable_pending(n))) {
+		napi_complete(n);
+		return work;
+	}
+
+	if (n->gro_bitmask) {
+		/* flush too old packets
+		 * If HZ < 1000, flush all packets.
+		 */
+		napi_gro_flush(n, HZ >= 1000);
+	}
+
+	gro_normal_list(n);
+
+	/* Some drivers may have called napi_schedule
+	 * prior to exhausting their budget.
+	 */
+	if (unlikely(!list_empty(&n->poll_list))) {
+		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
+			     n->dev ? n->dev->name : "backlog");
+		return work;
+	}
+
+	*repoll = true;
+
+	return work;
+}
+
+static void napi_workfn(struct work_struct *work)
+{
+	struct napi_struct *n = container_of(work, struct napi_struct, work);
+	void *have;
+
+	for (;;) {
+		bool repoll = false;
+
+		local_bh_disable();
+
+		have = netpoll_poll_lock(n);
+		__napi_poll(n, &repoll);
+		netpoll_poll_unlock(have);
+
+		local_bh_enable();
+
+		if (!repoll)
+			return;
+
+		if (!need_resched())
+			continue;
+
+		/*
+		 * have to pay for the latency of task switch even if
+		 * napi is scheduled
+		 */
+		queue_work(napi_workq, work);
+		return;
+	}
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -6621,6 +6721,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 #ifdef CONFIG_NETPOLL
 	napi->poll_owner = -1;
 #endif
+	INIT_WORK(&napi->work, napi_workfn);
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	napi_hash_add(napi);
 }
@@ -6671,65 +6772,18 @@ EXPORT_SYMBOL(netif_napi_del);
 
 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 {
+	bool do_repoll = false;
 	void *have;
-	int work, weight;
+	int work;
 
 	list_del_init(&n->poll_list);
 
 	have = netpoll_poll_lock(n);
 
-	weight = n->weight;
-
-	/* This NAPI_STATE_SCHED test is for avoiding a race
-	 * with netpoll's poll_napi().  Only the entity which
-	 * obtains the lock and sees NAPI_STATE_SCHED set will
-	 * actually make the ->poll() call.  Therefore we avoid
-	 * accidentally calling ->poll() when NAPI is not scheduled.
-	 */
-	work = 0;
-	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
-		work = n->poll(n, weight);
-		trace_napi_poll(n, work, weight);
-	}
-
-	if (unlikely(work > weight))
-		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
-			    n->poll, work, weight);
-
-	if (likely(work < weight))
-		goto out_unlock;
-
-	/* Drivers must not modify the NAPI state if they
-	 * consume the entire weight.  In such cases this code
-	 * still "owns" the NAPI instance and therefore can
-	 * move the instance around on the list at-will.
-	 */
-	if (unlikely(napi_disable_pending(n))) {
-		napi_complete(n);
-		goto out_unlock;
-	}
-
-	if (n->gro_bitmask) {
-		/* flush too old packets
-		 * If HZ < 1000, flush all packets.
-		 */
-		napi_gro_flush(n, HZ >= 1000);
-	}
-
-	gro_normal_list(n);
-
-	/* Some drivers may have called napi_schedule
-	 * prior to exhausting their budget.
-	 */
-	if (unlikely(!list_empty(&n->poll_list))) {
-		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
-			     n->dev ? n->dev->name : "backlog");
-		goto out_unlock;
-	}
-
-	list_add_tail(&n->poll_list, repoll);
+	work = __napi_poll(n, &do_repoll);
+	if (do_repoll)
+		list_add_tail(&n->poll_list, repoll);
 
-out_unlock:
 	netpoll_poll_unlock(have);
 
 	return work;
@@ -10676,6 +10730,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 	}
 
+	napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
+				     WQ_UNBOUND_MAX_ACTIVE);
+	BUG_ON(!napi_workq);
+
 	dev_boot_phase = 0;
 
 	/* The loopback device is special if any other network devices
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: add support for threaded NAPI polling
  2020-07-29 16:50 [PATCH] net: add support for threaded NAPI polling Felix Fietkau
@ 2020-07-29 17:44 ` Eric Dumazet
  2020-07-30 14:30   ` Sebastian Gottschall
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2020-07-29 17:44 UTC (permalink / raw)
  To: Felix Fietkau, netdev; +Cc: Eric Dumazet, Hillf Danton



On 7/29/20 9:50 AM, Felix Fietkau wrote:
> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
> poll function does not perform well. Since NAPI poll is bound to the CPU it
> was scheduled from, we can easily end up with a few very busy CPUs spending
> most of their time in softirq/ksoftirqd and some idle ones.
> 
> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
> same except for using netif_threaded_napi_add instead of netif_napi_add.
> 
> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
> thread.
> 
> With threaded NAPI, throughput seems stable and consistent (and higher than
> the best results I got without it).
> 
> Based on a patch by Hillf Danton
> 
> Cc: Hillf Danton <hdanton@sina.com>
> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> ---
> Changes since RFC v2:
> - fix unused but set variable reported by kbuild test robot
> 
> Changes since RFC:
> - disable softirq around threaded poll functions
> - reuse most parts of napi_poll()
> - fix re-schedule condition
> 
>  include/linux/netdevice.h |  23 ++++++
>  net/core/dev.c            | 162 ++++++++++++++++++++++++++------------
>  2 files changed, 133 insertions(+), 52 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index ac2cd3f49aba..3a39211c7598 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -347,6 +347,7 @@ struct napi_struct {
>  	struct list_head	dev_list;
>  	struct hlist_node	napi_hash_node;
>  	unsigned int		napi_id;
> +	struct work_struct	work;
>  };
>  
>  enum {
> @@ -357,6 +358,7 @@ enum {
>  	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
>  	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
>  	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
> +	NAPI_STATE_THREADED,	/* Use threaded NAPI */
>  };
>  
>  enum {
> @@ -367,6 +369,7 @@ enum {
>  	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
>  	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
>  	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
> +	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED),
>  };
>  
>  enum gro_result {
> @@ -2315,6 +2318,26 @@ static inline void *netdev_priv(const struct net_device *dev)
>  void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
>  		    int (*poll)(struct napi_struct *, int), int weight);
>  
> +/**
> + *	netif_threaded_napi_add - initialize a NAPI context
> + *	@dev:  network device
> + *	@napi: NAPI context
> + *	@poll: polling function
> + *	@weight: default weight
> + *
> + * This variant of netif_napi_add() should be used from drivers using NAPI
> + * with CPU intensive poll functions.
> + * This will schedule polling from a high priority workqueue that
> + */
> +static inline void netif_threaded_napi_add(struct net_device *dev,
> +					   struct napi_struct *napi,
> +					   int (*poll)(struct napi_struct *, int),
> +					   int weight)
> +{
> +	set_bit(NAPI_STATE_THREADED, &napi->state);
> +	netif_napi_add(dev, napi, poll, weight);
> +}
> +
>  /**
>   *	netif_tx_napi_add - initialize a NAPI context
>   *	@dev:  network device
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 19f1abc26fcd..11b027f3a2b9 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -158,6 +158,7 @@ static DEFINE_SPINLOCK(offload_lock);
>  struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
>  struct list_head ptype_all __read_mostly;	/* Taps */
>  static struct list_head offload_base __read_mostly;
> +static struct workqueue_struct *napi_workq __read_mostly;
>  
>  static int netif_rx_internal(struct sk_buff *skb);
>  static int call_netdevice_notifiers_info(unsigned long val,
> @@ -6286,6 +6287,11 @@ void __napi_schedule(struct napi_struct *n)
>  {
>  	unsigned long flags;
>  
> +	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
> +		queue_work(napi_workq, &n->work);
> +		return;
> +	}
> +


Where is the corresponding cancel_work_sync() or flush_work() at device dismantle ?

Just hoping the thread will eventually run seems optimistic to me.


Quite frankly, I do believe this STATE_THREADED status should be a generic NAPI attribute
that can be changed dynamically, at admin request, instead of having to change/recompile
a driver.



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: add support for threaded NAPI polling
  2020-07-29 17:44 ` Eric Dumazet
@ 2020-07-30 14:30   ` Sebastian Gottschall
  2020-07-30 15:42     ` David Laight
  2020-07-30 16:08     ` Eric Dumazet
  0 siblings, 2 replies; 10+ messages in thread
From: Sebastian Gottschall @ 2020-07-30 14:30 UTC (permalink / raw)
  To: Eric Dumazet, Felix Fietkau, netdev; +Cc: Hillf Danton


Am 29.07.2020 um 19:44 schrieb Eric Dumazet:
>
> On 7/29/20 9:50 AM, Felix Fietkau wrote:
>> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
>> poll function does not perform well. Since NAPI poll is bound to the CPU it
>> was scheduled from, we can easily end up with a few very busy CPUs spending
>> most of their time in softirq/ksoftirqd and some idle ones.
>>
>> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
>> same except for using netif_threaded_napi_add instead of netif_napi_add.
>>
>> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
>> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
>> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
>> thread.
>>
>> With threaded NAPI, throughput seems stable and consistent (and higher than
>> the best results I got without it).
>>
>> Based on a patch by Hillf Danton
>>
>> Cc: Hillf Danton <hdanton@sina.com>
>> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>> ---
>> Changes since RFC v2:
>> - fix unused but set variable reported by kbuild test robot
>>
>> Changes since RFC:
>> - disable softirq around threaded poll functions
>> - reuse most parts of napi_poll()
>> - fix re-schedule condition
>>
>>   include/linux/netdevice.h |  23 ++++++
>>   net/core/dev.c            | 162 ++++++++++++++++++++++++++------------
>>   2 files changed, 133 insertions(+), 52 deletions(-)
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index ac2cd3f49aba..3a39211c7598 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -347,6 +347,7 @@ struct napi_struct {
>>   	struct list_head	dev_list;
>>   	struct hlist_node	napi_hash_node;
>>   	unsigned int		napi_id;
>> +	struct work_struct	work;
>>   };
>>   
>>   enum {
>> @@ -357,6 +358,7 @@ enum {
>>   	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
>>   	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
>>   	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
>> +	NAPI_STATE_THREADED,	/* Use threaded NAPI */
>>   };
>>   
>>   enum {
>> @@ -367,6 +369,7 @@ enum {
>>   	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
>>   	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
>>   	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
>> +	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED),
>>   };
>>   
>>   enum gro_result {
>> @@ -2315,6 +2318,26 @@ static inline void *netdev_priv(const struct net_device *dev)
>>   void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
>>   		    int (*poll)(struct napi_struct *, int), int weight);
>>   
>> +/**
>> + *	netif_threaded_napi_add - initialize a NAPI context
>> + *	@dev:  network device
>> + *	@napi: NAPI context
>> + *	@poll: polling function
>> + *	@weight: default weight
>> + *
>> + * This variant of netif_napi_add() should be used from drivers using NAPI
>> + * with CPU intensive poll functions.
>> + * This will schedule polling from a high priority workqueue that
>> + */
>> +static inline void netif_threaded_napi_add(struct net_device *dev,
>> +					   struct napi_struct *napi,
>> +					   int (*poll)(struct napi_struct *, int),
>> +					   int weight)
>> +{
>> +	set_bit(NAPI_STATE_THREADED, &napi->state);
>> +	netif_napi_add(dev, napi, poll, weight);
>> +}
>> +
>>   /**
>>    *	netif_tx_napi_add - initialize a NAPI context
>>    *	@dev:  network device
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 19f1abc26fcd..11b027f3a2b9 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -158,6 +158,7 @@ static DEFINE_SPINLOCK(offload_lock);
>>   struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
>>   struct list_head ptype_all __read_mostly;	/* Taps */
>>   static struct list_head offload_base __read_mostly;
>> +static struct workqueue_struct *napi_workq __read_mostly;
>>   
>>   static int netif_rx_internal(struct sk_buff *skb);
>>   static int call_netdevice_notifiers_info(unsigned long val,
>> @@ -6286,6 +6287,11 @@ void __napi_schedule(struct napi_struct *n)
>>   {
>>   	unsigned long flags;
>>   
>> +	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
>> +		queue_work(napi_workq, &n->work);
>> +		return;
>> +	}
>> +
>
> Where is the corresponding cancel_work_sync() or flush_work() at device dismantle ?
>
> Just hoping the thread will eventually run seems optimistic to me.
>
>
> Quite frankly, I do believe this STATE_THREADED status should be a generic NAPI attribute
> that can be changed dynamically, at admin request, instead of having to change/recompile
> a driver.
That's not that easy. WiFi devices do use dummy netdev devices; they are 
not visible to sysfs and other administrative options.
So changing it would only be possible if a special mac80211-based 
control were implemented for these drivers.
For standard netdev devices it isn't a big thing to implement an 
administrative control via sysfs (if you are talking about such a feature).
>
>
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] net: add support for threaded NAPI polling
  2020-07-30 14:30   ` Sebastian Gottschall
@ 2020-07-30 15:42     ` David Laight
  2020-07-30 17:19       ` Sebastian Gottschall
  2020-07-30 16:08     ` Eric Dumazet
  1 sibling, 1 reply; 10+ messages in thread
From: David Laight @ 2020-07-30 15:42 UTC (permalink / raw)
  To: 'Sebastian Gottschall', Eric Dumazet, Felix Fietkau, netdev
  Cc: Hillf Danton

From: Sebastian Gottschall
> Sent: 30 July 2020 15:30
...
> > Quite frankly, I do believe this STATE_THREADED status should be a generic NAPI attribute
> > that can be changed dynamically, at admin request, instead of having to change/recompile
> > a driver.

> thats not that easy. wifi devices do use dummy netdev devices. they are
> not visible to sysfs and other administrative options.
> so changing it would just be possible if a special mac80211 based
> control would be implemented for these drivers.
> for standard netdev devices it isnt a big thing to implement a
> administrative control by sysfs (if you are talking about such a feature)

ISTM that a global flag that made all NAPI callbacks be made
from a worker thread rather than softint would be more appropriate.
Or even something that made the softint callbacks themselves
only run on a specific high(ish) priority kernel thread.

While it might slow down setups that need very low ethernet
latency it will help those that don't want application RT threads
to be 'stolen' by the softint code while they hold application
mutex or are waiting to be woken by a cv.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: add support for threaded NAPI polling
  2020-07-30 14:30   ` Sebastian Gottschall
  2020-07-30 15:42     ` David Laight
@ 2020-07-30 16:08     ` Eric Dumazet
  2020-07-30 17:21       ` Sebastian Gottschall
  1 sibling, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2020-07-30 16:08 UTC (permalink / raw)
  To: Sebastian Gottschall, Eric Dumazet, Felix Fietkau, netdev; +Cc: Hillf Danton



On 7/30/20 7:30 AM, Sebastian Gottschall wrote:
> 
> Am 29.07.2020 um 19:44 schrieb Eric Dumazet:
>>
>> On 7/29/20 9:50 AM, Felix Fietkau wrote:
>>> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
>>> poll function does not perform well. Since NAPI poll is bound to the CPU it
>>> was scheduled from, we can easily end up with a few very busy CPUs spending
>>> most of their time in softirq/ksoftirqd and some idle ones.
>>>
>>> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
>>> same except for using netif_threaded_napi_add instead of netif_napi_add.
>>>
>>> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
>>> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
>>> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
>>> thread.
>>>
>>> With threaded NAPI, throughput seems stable and consistent (and higher than
>>> the best results I got without it).
>>>
>>> Based on a patch by Hillf Danton
>>>
>>> Cc: Hillf Danton <hdanton@sina.com>
>>> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>>> ---
>>> Changes since RFC v2:
>>> - fix unused but set variable reported by kbuild test robot
>>>
>>> Changes since RFC:
>>> - disable softirq around threaded poll functions
>>> - reuse most parts of napi_poll()
>>> - fix re-schedule condition
>>>
>>>   include/linux/netdevice.h |  23 ++++++
>>>   net/core/dev.c            | 162 ++++++++++++++++++++++++++------------
>>>   2 files changed, 133 insertions(+), 52 deletions(-)
>>>
>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>> index ac2cd3f49aba..3a39211c7598 100644
>>> --- a/include/linux/netdevice.h
>>> +++ b/include/linux/netdevice.h
>>> @@ -347,6 +347,7 @@ struct napi_struct {
>>>       struct list_head    dev_list;
>>>       struct hlist_node    napi_hash_node;
>>>       unsigned int        napi_id;
>>> +    struct work_struct    work;
>>>   };
>>>     enum {
>>> @@ -357,6 +358,7 @@ enum {
>>>       NAPI_STATE_HASHED,    /* In NAPI hash (busy polling possible) */
>>>       NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
>>>       NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
>>> +    NAPI_STATE_THREADED,    /* Use threaded NAPI */
>>>   };
>>>     enum {
>>> @@ -367,6 +369,7 @@ enum {
>>>       NAPIF_STATE_HASHED     = BIT(NAPI_STATE_HASHED),
>>>       NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
>>>       NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
>>> +    NAPIF_STATE_THREADED     = BIT(NAPI_STATE_THREADED),
>>>   };
>>>     enum gro_result {
>>> @@ -2315,6 +2318,26 @@ static inline void *netdev_priv(const struct net_device *dev)
>>>   void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
>>>               int (*poll)(struct napi_struct *, int), int weight);
>>>   +/**
>>> + *    netif_threaded_napi_add - initialize a NAPI context
>>> + *    @dev:  network device
>>> + *    @napi: NAPI context
>>> + *    @poll: polling function
>>> + *    @weight: default weight
>>> + *
>>> + * This variant of netif_napi_add() should be used from drivers using NAPI
>>> + * with CPU intensive poll functions.
>>> + * This will schedule polling from a high priority workqueue that
>>> + */
>>> +static inline void netif_threaded_napi_add(struct net_device *dev,
>>> +                       struct napi_struct *napi,
>>> +                       int (*poll)(struct napi_struct *, int),
>>> +                       int weight)
>>> +{
>>> +    set_bit(NAPI_STATE_THREADED, &napi->state);
>>> +    netif_napi_add(dev, napi, poll, weight);
>>> +}
>>> +
>>>   /**
>>>    *    netif_tx_napi_add - initialize a NAPI context
>>>    *    @dev:  network device
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index 19f1abc26fcd..11b027f3a2b9 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -158,6 +158,7 @@ static DEFINE_SPINLOCK(offload_lock);
>>>   struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
>>>   struct list_head ptype_all __read_mostly;    /* Taps */
>>>   static struct list_head offload_base __read_mostly;
>>> +static struct workqueue_struct *napi_workq __read_mostly;
>>>     static int netif_rx_internal(struct sk_buff *skb);
>>>   static int call_netdevice_notifiers_info(unsigned long val,
>>> @@ -6286,6 +6287,11 @@ void __napi_schedule(struct napi_struct *n)
>>>   {
>>>       unsigned long flags;
>>>   +    if (test_bit(NAPI_STATE_THREADED, &n->state)) {
>>> +        queue_work(napi_workq, &n->work);
>>> +        return;
>>> +    }
>>> +
>>
>> Where is the corresponding cancel_work_sync() or flush_work() at device dismantle ?
>>
>> Just hoping the thread will eventually run seems optimistic to me.
>>
>>
>> Quite frankly, I do believe this STATE_THREADED status should be a generic NAPI attribute
>> that can be changed dynamically, at admin request, instead of having to change/recompile
>> a driver.
> thats not that easy. wifi devices do use dummy netdev devices. they are not visible to sysfs and other administrative options.
> so changing it would just be possible if a special mac80211 based control would be implemented for these drivers.
> for standard netdev devices it isnt a big thing to implement a administrative control by sysfs (if you are talking about such a feature)

We do not want to add code in fast path only for one device. We need something truly generic.

I am not saying only the admin can choose, it is fine if a driver does not give the choice
and will simply call netif_threaded_napi_add()


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: add support for threaded NAPI polling
  2020-07-30 15:42     ` David Laight
@ 2020-07-30 17:19       ` Sebastian Gottschall
  0 siblings, 0 replies; 10+ messages in thread
From: Sebastian Gottschall @ 2020-07-30 17:19 UTC (permalink / raw)
  To: David Laight, Eric Dumazet, Felix Fietkau, netdev; +Cc: Hillf Danton


Am 30.07.2020 um 17:42 schrieb David Laight:
> From: Sebastian Gottschall
>> Sent: 30 July 2020 15:30
> ...
>>> Quite frankly, I do believe this STATE_THREADED status should be a generic NAPI attribute
>>> that can be changed dynamically, at admin request, instead of having to change/recompile
>>> a driver.
>> thats not that easy. wifi devices do use dummy netdev devices. they are
>> not visible to sysfs and other administrative options.
>> so changing it would just be possible if a special mac80211 based
>> control would be implemented for these drivers.
>> for standard netdev devices it isnt a big thing to implement a
>> administrative control by sysfs (if you are talking about such a feature)
> ISTM that a global flag that made all NAPI callbacks be made
> from a worker thread rather than softint would be more approriate.
> Or even something that made the softint callbacks themselves
> only run an a specific high(ish) priority kernel thread.
>
> While it might slow down setups that need very low ethernet
> latency it will help those that don't want application RT threads
> to be 'stolen' by the softint code while they hold application
> mutex or are waiting to be woken by a cv.
This will not work either. I already identified drivers which are 
incompatible with that approach (marvell mvneta, for instance).
So threading should be a feature which can be enabled or disabled on 
known working chipsets, but it cannot be used as
a set-and-forget feature for everything unless the specific drivers are fixed.
>
> 	David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: add support for threaded NAPI polling
  2020-07-30 16:08     ` Eric Dumazet
@ 2020-07-30 17:21       ` Sebastian Gottschall
  2020-07-31 16:36         ` Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: Sebastian Gottschall @ 2020-07-30 17:21 UTC (permalink / raw)
  To: Eric Dumazet, Felix Fietkau, netdev; +Cc: Hillf Danton


Am 30.07.2020 um 18:08 schrieb Eric Dumazet:
>
> On 7/30/20 7:30 AM, Sebastian Gottschall wrote:
>> Am 29.07.2020 um 19:44 schrieb Eric Dumazet:
>>> On 7/29/20 9:50 AM, Felix Fietkau wrote:
>>>> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
>>>> poll function does not perform well. Since NAPI poll is bound to the CPU it
>>>> was scheduled from, we can easily end up with a few very busy CPUs spending
>>>> most of their time in softirq/ksoftirqd and some idle ones.
>>>>
>>>> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
>>>> same except for using netif_threaded_napi_add instead of netif_napi_add.
>>>>
>>>> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
>>>> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
>>>> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
>>>> thread.
>>>>
>>>> With threaded NAPI, throughput seems stable and consistent (and higher than
>>>> the best results I got without it).
>>>>
>>>> Based on a patch by Hillf Danton
>>>>
>>>> Cc: Hillf Danton <hdanton@sina.com>
>>>> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>>>> ---
>>>> Changes since RFC v2:
>>>> - fix unused but set variable reported by kbuild test robot
>>>>
>>>> Changes since RFC:
>>>> - disable softirq around threaded poll functions
>>>> - reuse most parts of napi_poll()
>>>> - fix re-schedule condition
>>>>
>>>>    include/linux/netdevice.h |  23 ++++++
>>>>    net/core/dev.c            | 162 ++++++++++++++++++++++++++------------
>>>>    2 files changed, 133 insertions(+), 52 deletions(-)
>>>>
>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>> index ac2cd3f49aba..3a39211c7598 100644
>>>> --- a/include/linux/netdevice.h
>>>> +++ b/include/linux/netdevice.h
>>>> @@ -347,6 +347,7 @@ struct napi_struct {
>>>>        struct list_head    dev_list;
>>>>        struct hlist_node    napi_hash_node;
>>>>        unsigned int        napi_id;
>>>> +    struct work_struct    work;
>>>>    };
>>>>      enum {
>>>> @@ -357,6 +358,7 @@ enum {
>>>>        NAPI_STATE_HASHED,    /* In NAPI hash (busy polling possible) */
>>>>        NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
>>>>        NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
>>>> +    NAPI_STATE_THREADED,    /* Use threaded NAPI */
>>>>    };
>>>>      enum {
>>>> @@ -367,6 +369,7 @@ enum {
>>>>        NAPIF_STATE_HASHED     = BIT(NAPI_STATE_HASHED),
>>>>        NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
>>>>        NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
>>>> +    NAPIF_STATE_THREADED     = BIT(NAPI_STATE_THREADED),
>>>>    };
>>>>      enum gro_result {
>>>> @@ -2315,6 +2318,26 @@ static inline void *netdev_priv(const struct net_device *dev)
>>>>    void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
>>>>                int (*poll)(struct napi_struct *, int), int weight);
>>>>    +/**
>>>> + *    netif_threaded_napi_add - initialize a NAPI context
>>>> + *    @dev:  network device
>>>> + *    @napi: NAPI context
>>>> + *    @poll: polling function
>>>> + *    @weight: default weight
>>>> + *
>>>> + * This variant of netif_napi_add() should be used from drivers using NAPI
>>>> + * with CPU intensive poll functions.
>>>> + * This will schedule polling from a high priority workqueue that
>>>> + */
>>>> +static inline void netif_threaded_napi_add(struct net_device *dev,
>>>> +                       struct napi_struct *napi,
>>>> +                       int (*poll)(struct napi_struct *, int),
>>>> +                       int weight)
>>>> +{
>>>> +    set_bit(NAPI_STATE_THREADED, &napi->state);
>>>> +    netif_napi_add(dev, napi, poll, weight);
>>>> +}
>>>> +
>>>>    /**
>>>>     *    netif_tx_napi_add - initialize a NAPI context
>>>>     *    @dev:  network device
>>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>>> index 19f1abc26fcd..11b027f3a2b9 100644
>>>> --- a/net/core/dev.c
>>>> +++ b/net/core/dev.c
>>>> @@ -158,6 +158,7 @@ static DEFINE_SPINLOCK(offload_lock);
>>>>    struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
>>>>    struct list_head ptype_all __read_mostly;    /* Taps */
>>>>    static struct list_head offload_base __read_mostly;
>>>> +static struct workqueue_struct *napi_workq __read_mostly;
>>>>      static int netif_rx_internal(struct sk_buff *skb);
>>>>    static int call_netdevice_notifiers_info(unsigned long val,
>>>> @@ -6286,6 +6287,11 @@ void __napi_schedule(struct napi_struct *n)
>>>>    {
>>>>        unsigned long flags;
>>>>    +    if (test_bit(NAPI_STATE_THREADED, &n->state)) {
>>>> +        queue_work(napi_workq, &n->work);
>>>> +        return;
>>>> +    }
>>>> +
>>> Where is the corresponding cancel_work_sync() or flush_work() at device dismantle ?
>>>
>>> Just hoping the thread will eventually run seems optimistic to me.
>>>
>>>
>>> Quite frankly, I do believe this STATE_THREADED status should be a generic NAPI attribute
>>> that can be changed dynamically, at admin request, instead of having to change/recompile
>>> a driver.
>> thats not that easy. wifi devices do use dummy netdev devices. they are not visible to sysfs and other administrative options.
>> so changing it would just be possible if a special mac80211 based control would be implemented for these drivers.
>> for standard netdev devices it isnt a big thing to implement a administrative control by sysfs (if you are talking about such a feature)
> We do not want to add code in fast path only for one device. We need something truly generic.
>
> I am not saying only the admin can chose, it is fine if a driver does not give the choice
> and will simply call netif_threaded_napi_add()
what could make sense is if the feature can be disabled / enabled, but it 
will only affect drivers using the netif_threaded_napi_add call; it 
should not affect drivers
using the old api in any way, since not all drivers will work with this 
feature.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: add support for threaded NAPI polling
  2020-07-30 17:21       ` Sebastian Gottschall
@ 2020-07-31 16:36         ` Eric Dumazet
  2020-08-02 14:27           ` Sebastian Gottschall
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2020-07-31 16:36 UTC (permalink / raw)
  To: Sebastian Gottschall, Eric Dumazet, Felix Fietkau, netdev; +Cc: Hillf Danton



On 7/30/20 10:21 AM, Sebastian Gottschall wrote:
> 
> Am 30.07.2020 um 18:08 schrieb Eric Dumazet:
>>
>> On 7/30/20 7:30 AM, Sebastian Gottschall wrote:
>>> Am 29.07.2020 um 19:44 schrieb Eric Dumazet:
>>>> On 7/29/20 9:50 AM, Felix Fietkau wrote:
>>>>> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
>>>>> poll function does not perform well. Since NAPI poll is bound to the CPU it
>>>>> was scheduled from, we can easily end up with a few very busy CPUs spending
>>>>> most of their time in softirq/ksoftirqd and some idle ones.
>>>>>
>>>>> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
>>>>> same except for using netif_threaded_napi_add instead of netif_napi_add.
>>>>>
>>>>> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
>>>>> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
>>>>> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
>>>>> thread.
>>>>>
>>>>> With threaded NAPI, throughput seems stable and consistent (and higher than
>>>>> the best results I got without it).
>>>>>
>>>>> Based on a patch by Hillf Danton
>>>>>
>>>>> Cc: Hillf Danton <hdanton@sina.com>
>>>>> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>>>>> ---
>>>>> Changes since RFC v2:
>>>>> - fix unused but set variable reported by kbuild test robot
>>>>>
>>>>> Changes since RFC:
>>>>> - disable softirq around threaded poll functions
>>>>> - reuse most parts of napi_poll()
>>>>> - fix re-schedule condition
>>>>>
>>>>>    include/linux/netdevice.h |  23 ++++++
>>>>>    net/core/dev.c            | 162 ++++++++++++++++++++++++++------------
>>>>>    2 files changed, 133 insertions(+), 52 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>>> index ac2cd3f49aba..3a39211c7598 100644
>>>>> --- a/include/linux/netdevice.h
>>>>> +++ b/include/linux/netdevice.h
>>>>> @@ -347,6 +347,7 @@ struct napi_struct {
>>>>>        struct list_head    dev_list;
>>>>>        struct hlist_node    napi_hash_node;
>>>>>        unsigned int        napi_id;
>>>>> +    struct work_struct    work;
>>>>>    };
>>>>>      enum {
>>>>> @@ -357,6 +358,7 @@ enum {
>>>>>        NAPI_STATE_HASHED,    /* In NAPI hash (busy polling possible) */
>>>>>        NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
>>>>>        NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
>>>>> +    NAPI_STATE_THREADED,    /* Use threaded NAPI */
>>>>>    };
>>>>>      enum {
>>>>> @@ -367,6 +369,7 @@ enum {
>>>>>        NAPIF_STATE_HASHED     = BIT(NAPI_STATE_HASHED),
>>>>>        NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
>>>>>        NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
>>>>> +    NAPIF_STATE_THREADED     = BIT(NAPI_STATE_THREADED),
>>>>>    };
>>>>>      enum gro_result {
>>>>> @@ -2315,6 +2318,26 @@ static inline void *netdev_priv(const struct net_device *dev)
>>>>>    void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
>>>>>                int (*poll)(struct napi_struct *, int), int weight);
>>>>>    +/**
>>>>> + *    netif_threaded_napi_add - initialize a NAPI context
>>>>> + *    @dev:  network device
>>>>> + *    @napi: NAPI context
>>>>> + *    @poll: polling function
>>>>> + *    @weight: default weight
>>>>> + *
>>>>> + * This variant of netif_napi_add() should be used from drivers using NAPI
>>>>> + * with CPU intensive poll functions.
>>>>> + * This will schedule polling from a high priority workqueue that
>>>>> + */
>>>>> +static inline void netif_threaded_napi_add(struct net_device *dev,
>>>>> +                       struct napi_struct *napi,
>>>>> +                       int (*poll)(struct napi_struct *, int),
>>>>> +                       int weight)
>>>>> +{
>>>>> +    set_bit(NAPI_STATE_THREADED, &napi->state);
>>>>> +    netif_napi_add(dev, napi, poll, weight);
>>>>> +}
>>>>> +
>>>>>    /**
>>>>>     *    netif_tx_napi_add - initialize a NAPI context
>>>>>     *    @dev:  network device
>>>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>>>> index 19f1abc26fcd..11b027f3a2b9 100644
>>>>> --- a/net/core/dev.c
>>>>> +++ b/net/core/dev.c
>>>>> @@ -158,6 +158,7 @@ static DEFINE_SPINLOCK(offload_lock);
>>>>>    struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
>>>>>    struct list_head ptype_all __read_mostly;    /* Taps */
>>>>>    static struct list_head offload_base __read_mostly;
>>>>> +static struct workqueue_struct *napi_workq __read_mostly;
>>>>>      static int netif_rx_internal(struct sk_buff *skb);
>>>>>    static int call_netdevice_notifiers_info(unsigned long val,
>>>>> @@ -6286,6 +6287,11 @@ void __napi_schedule(struct napi_struct *n)
>>>>>    {
>>>>>        unsigned long flags;
>>>>>    +    if (test_bit(NAPI_STATE_THREADED, &n->state)) {
>>>>> +        queue_work(napi_workq, &n->work);
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>> Where is the corresponding cancel_work_sync() or flush_work() at device dismantle ?
>>>>
>>>> Just hoping the thread will eventually run seems optimistic to me.
>>>>
>>>>
>>>> Quite frankly, I do believe this STATE_THREADED status should be a generic NAPI attribute
>>>> that can be changed dynamically, at admin request, instead of having to change/recompile
>>>> a driver.
>>> thats not that easy. wifi devices do use dummy netdev devices. they are not visible to sysfs and other administrative options.
>>> so changing it would just be possible if a special mac80211 based control would be implemented for these drivers.
>>> for standard netdev devices it isnt a big thing to implement a administrative control by sysfs (if you are talking about such a feature)
>> We do not want to add code in fast path only for one device. We need something truly generic.
>>
>> I am not saying only the admin can chose, it is fine if a driver does not give the choice
>> and will simply call netif_threaded_napi_add()
> what could make sense if the feature can be disabled / enabled, but it will only affect drivers using the netif_threaded_napi_add call, but it should not affect drivers
> using the old api in any way since not all drivers will work with this feature.


If we provide something in core NAPI stack, we want to make sure we can test/use it with other drivers.

ethtool, or a /sys/class/net/ethXXX entry could be used.

The argument about not affecting other drivers is misleading, since the patch adds another conditional test in
standard NAPI layer.

Lets keep NAPI generic please.

Lets make sure syzbot will find bugs without having to attach a specific mac80211 hardware.

Another concern I have with this patch is that we can no longer ensure NIC processing is done
on a selected set of cpus (as commanded in /proc/irq/XXX/smp_affinity).
Or can we ?


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: add support for threaded NAPI polling
  2020-07-31 16:36         ` Eric Dumazet
@ 2020-08-02 14:27           ` Sebastian Gottschall
  2020-08-04 20:41             ` Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: Sebastian Gottschall @ 2020-08-02 14:27 UTC (permalink / raw)
  To: Eric Dumazet, Felix Fietkau, netdev; +Cc: Hillf Danton


Am 31.07.2020 um 18:36 schrieb Eric Dumazet:
>
> On 7/30/20 10:21 AM, Sebastian Gottschall wrote:
>> Am 30.07.2020 um 18:08 schrieb Eric Dumazet:
>>> On 7/30/20 7:30 AM, Sebastian Gottschall wrote:
>>>> Am 29.07.2020 um 19:44 schrieb Eric Dumazet:
>>>>> On 7/29/20 9:50 AM, Felix Fietkau wrote:
>>>>>> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
>>>>>> poll function does not perform well. Since NAPI poll is bound to the CPU it
>>>>>> was scheduled from, we can easily end up with a few very busy CPUs spending
>>>>>> most of their time in softirq/ksoftirqd and some idle ones.
>>>>>>
>>>>>> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
>>>>>> same except for using netif_threaded_napi_add instead of netif_napi_add.
>>>>>>
>>>>>> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
>>>>>> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
>>>>>> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
>>>>>> thread.
>>>>>>
>>>>>> With threaded NAPI, throughput seems stable and consistent (and higher than
>>>>>> the best results I got without it).
>>>>>>
>>>>>> Based on a patch by Hillf Danton
>>>>>>
>>>>>> Cc: Hillf Danton <hdanton@sina.com>
>>>>>> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>>>>>> ---
>>>>>> Changes since RFC v2:
>>>>>> - fix unused but set variable reported by kbuild test robot
>>>>>>
>>>>>> Changes since RFC:
>>>>>> - disable softirq around threaded poll functions
>>>>>> - reuse most parts of napi_poll()
>>>>>> - fix re-schedule condition
>>>>>>
>>>>>>     include/linux/netdevice.h |  23 ++++++
>>>>>>     net/core/dev.c            | 162 ++++++++++++++++++++++++++------------
>>>>>>     2 files changed, 133 insertions(+), 52 deletions(-)
>>>>>>
>>>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>>>> index ac2cd3f49aba..3a39211c7598 100644
>>>>>> --- a/include/linux/netdevice.h
>>>>>> +++ b/include/linux/netdevice.h
>>>>>> @@ -347,6 +347,7 @@ struct napi_struct {
>>>>>>         struct list_head    dev_list;
>>>>>>         struct hlist_node    napi_hash_node;
>>>>>>         unsigned int        napi_id;
>>>>>> +    struct work_struct    work;
>>>>>>     };
>>>>>>       enum {
>>>>>> @@ -357,6 +358,7 @@ enum {
>>>>>>         NAPI_STATE_HASHED,    /* In NAPI hash (busy polling possible) */
>>>>>>         NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
>>>>>>         NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
>>>>>> +    NAPI_STATE_THREADED,    /* Use threaded NAPI */
>>>>>>     };
>>>>>>       enum {
>>>>>> @@ -367,6 +369,7 @@ enum {
>>>>>>         NAPIF_STATE_HASHED     = BIT(NAPI_STATE_HASHED),
>>>>>>         NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
>>>>>>         NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
>>>>>> +    NAPIF_STATE_THREADED     = BIT(NAPI_STATE_THREADED),
>>>>>>     };
>>>>>>       enum gro_result {
>>>>>> @@ -2315,6 +2318,26 @@ static inline void *netdev_priv(const struct net_device *dev)
>>>>>>     void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
>>>>>>                 int (*poll)(struct napi_struct *, int), int weight);
>>>>>>     +/**
>>>>>> + *    netif_threaded_napi_add - initialize a NAPI context
>>>>>> + *    @dev:  network device
>>>>>> + *    @napi: NAPI context
>>>>>> + *    @poll: polling function
>>>>>> + *    @weight: default weight
>>>>>> + *
>>>>>> + * This variant of netif_napi_add() should be used from drivers using NAPI
>>>>>> + * with CPU intensive poll functions.
>>>>>> + * This will schedule polling from a high priority workqueue that
>>>>>> + */
>>>>>> +static inline void netif_threaded_napi_add(struct net_device *dev,
>>>>>> +                       struct napi_struct *napi,
>>>>>> +                       int (*poll)(struct napi_struct *, int),
>>>>>> +                       int weight)
>>>>>> +{
>>>>>> +    set_bit(NAPI_STATE_THREADED, &napi->state);
>>>>>> +    netif_napi_add(dev, napi, poll, weight);
>>>>>> +}
>>>>>> +
>>>>>>     /**
>>>>>>      *    netif_tx_napi_add - initialize a NAPI context
>>>>>>      *    @dev:  network device
>>>>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>>>>> index 19f1abc26fcd..11b027f3a2b9 100644
>>>>>> --- a/net/core/dev.c
>>>>>> +++ b/net/core/dev.c
>>>>>> @@ -158,6 +158,7 @@ static DEFINE_SPINLOCK(offload_lock);
>>>>>>     struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
>>>>>>     struct list_head ptype_all __read_mostly;    /* Taps */
>>>>>>     static struct list_head offload_base __read_mostly;
>>>>>> +static struct workqueue_struct *napi_workq __read_mostly;
>>>>>>       static int netif_rx_internal(struct sk_buff *skb);
>>>>>>     static int call_netdevice_notifiers_info(unsigned long val,
>>>>>> @@ -6286,6 +6287,11 @@ void __napi_schedule(struct napi_struct *n)
>>>>>>     {
>>>>>>         unsigned long flags;
>>>>>>     +    if (test_bit(NAPI_STATE_THREADED, &n->state)) {
>>>>>> +        queue_work(napi_workq, &n->work);
>>>>>> +        return;
>>>>>> +    }
>>>>>> +
>>>>> Where is the corresponding cancel_work_sync() or flush_work() at device dismantle ?
>>>>>
>>>>> Just hoping the thread will eventually run seems optimistic to me.
>>>>>
>>>>>
>>>>> Quite frankly, I do believe this STATE_THREADED status should be a generic NAPI attribute
>>>>> that can be changed dynamically, at admin request, instead of having to change/recompile
>>>>> a driver.
>>>> thats not that easy. wifi devices do use dummy netdev devices. they are not visible to sysfs and other administrative options.
>>>> so changing it would just be possible if a special mac80211 based control would be implemented for these drivers.
>>>> for standard netdev devices it isnt a big thing to implement a administrative control by sysfs (if you are talking about such a feature)
>>> We do not want to add code in fast path only for one device. We need something truly generic.
>>>
>>> I am not saying only the admin can chose, it is fine if a driver does not give the choice
>>> and will simply call netif_threaded_napi_add()
>> what could make sense if the feature can be disabled / enabled, but it will only affect drivers using the netif_threaded_napi_add call, but it should not affect drivers
>> using the old api in any way since not all drivers will work with this feature.
>
> If we provide something in core NAPI stack, we want to make sure we can test/use it with other drivers.
>
> ethtool, or a /sys/class/net/ethXXX entry could be used.
but this doesn't work for wifi drivers, since wifi drivers are using 
dummy netdev devices. we are running in circles here.
i mean, a sane way could also be that dummy netdev devices are present in 
sysfs too, which is not the case right now.
so change the api so the driver is forced to set a sane virtual dummy 
netdev name (like the driver name, for instance), so it can be accessed 
by sysfs.
>
> The argument about not affecting other drivers is misleading, since the patch adds another conditional test in
> standard NAPI layer.
>
> Lets keep NAPI generic please.
>
> Lets make sure syzbot will find bugs without having to attach a specific mac80211 hardware.
the patch is not mac80211 specific. i tested it already with network 
drivers. it is generic.
>
> Another concern I have with this patch is that we no longer can contain NIC processing is done
> on a selected set of cpus (as commanded in /proc/irq/XXX/smp_affinity).
> Or can we ?
i had this discussion already with felix in a phonecall last week: 
kthread vs. workq. his opinion is that workqueues work more effectively than 
kthreads,
since kthreads require application support for good balancing, like 
irqbalance. personally i have no real opinion here. the good point of 
kthreads is
that i'm able to track the system load per thread with simple process 
watching, and it's possible to force the thread onto a specific cpuset.

the good thing about workqueues is that they are simpler to implement and 
usually carry less risk, even if i haven't seen any problems with kthreads.
maybe felix should say something here about this. the background of this 
patch is simply performance, especially on embedded devices.
it started with an ath10k patch which introduced napi threading for 
a specific chipset, which led to some research on my side until i found
a historic napi kthread patch from 2016 which you had denied at that 
time. i tested that patch and saw a heavy performance boost for ath10k,
which led to this workq patch in the end

see also this discussion here. (you may remember this discussion since 
you where involved in it)
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1142611.html

Sebastian

>
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: add support for threaded NAPI polling
  2020-08-02 14:27           ` Sebastian Gottschall
@ 2020-08-04 20:41             ` Eric Dumazet
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2020-08-04 20:41 UTC (permalink / raw)
  To: Sebastian Gottschall, Eric Dumazet, Felix Fietkau, netdev
  Cc: Hillf Danton, Wei Wang



On 8/2/20 7:27 AM, Sebastian Gottschall wrote:
> 
> Am 31.07.2020 um 18:36 schrieb Eric Dumazet:
>>
>> On 7/30/20 10:21 AM, Sebastian Gottschall wrote:
>>> Am 30.07.2020 um 18:08 schrieb Eric Dumazet:
>>>> On 7/30/20 7:30 AM, Sebastian Gottschall wrote:
>>>>> Am 29.07.2020 um 19:44 schrieb Eric Dumazet:
>>>>>> On 7/29/20 9:50 AM, Felix Fietkau wrote:
>>>>>>> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
>>>>>>> poll function does not perform well. Since NAPI poll is bound to the CPU it
>>>>>>> was scheduled from, we can easily end up with a few very busy CPUs spending
>>>>>>> most of their time in softirq/ksoftirqd and some idle ones.
>>>>>>>
>>>>>>> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
>>>>>>> same except for using netif_threaded_napi_add instead of netif_napi_add.
>>>>>>>
>>>>>>> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
>>>>>>> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
>>>>>>> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
>>>>>>> thread.
>>>>>>>
>>>>>>> With threaded NAPI, throughput seems stable and consistent (and higher than
>>>>>>> the best results I got without it).
>>>>>>>
>>>>>>> Based on a patch by Hillf Danton
>>>>>>>
>>>>>>> Cc: Hillf Danton <hdanton@sina.com>
>>>>>>> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>>>>>>> ---
>>>>>>> Changes since RFC v2:
>>>>>>> - fix unused but set variable reported by kbuild test robot
>>>>>>>
>>>>>>> Changes since RFC:
>>>>>>> - disable softirq around threaded poll functions
>>>>>>> - reuse most parts of napi_poll()
>>>>>>> - fix re-schedule condition
>>>>>>>
>>>>>>>     include/linux/netdevice.h |  23 ++++++
>>>>>>>     net/core/dev.c            | 162 ++++++++++++++++++++++++++------------
>>>>>>>     2 files changed, 133 insertions(+), 52 deletions(-)
>>>>>>>
>>>>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>>>>> index ac2cd3f49aba..3a39211c7598 100644
>>>>>>> --- a/include/linux/netdevice.h
>>>>>>> +++ b/include/linux/netdevice.h
>>>>>>> @@ -347,6 +347,7 @@ struct napi_struct {
>>>>>>>         struct list_head    dev_list;
>>>>>>>         struct hlist_node    napi_hash_node;
>>>>>>>         unsigned int        napi_id;
>>>>>>> +    struct work_struct    work;
>>>>>>>     };
>>>>>>>       enum {
>>>>>>> @@ -357,6 +358,7 @@ enum {
>>>>>>>         NAPI_STATE_HASHED,    /* In NAPI hash (busy polling possible) */
>>>>>>>         NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
>>>>>>>         NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
>>>>>>> +    NAPI_STATE_THREADED,    /* Use threaded NAPI */
>>>>>>>     };
>>>>>>>       enum {
>>>>>>> @@ -367,6 +369,7 @@ enum {
>>>>>>>         NAPIF_STATE_HASHED     = BIT(NAPI_STATE_HASHED),
>>>>>>>         NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
>>>>>>>         NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
>>>>>>> +    NAPIF_STATE_THREADED     = BIT(NAPI_STATE_THREADED),
>>>>>>>     };
>>>>>>>       enum gro_result {
>>>>>>> @@ -2315,6 +2318,26 @@ static inline void *netdev_priv(const struct net_device *dev)
>>>>>>>     void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
>>>>>>>                 int (*poll)(struct napi_struct *, int), int weight);
>>>>>>>     +/**
>>>>>>> + *    netif_threaded_napi_add - initialize a NAPI context
>>>>>>> + *    @dev:  network device
>>>>>>> + *    @napi: NAPI context
>>>>>>> + *    @poll: polling function
>>>>>>> + *    @weight: default weight
>>>>>>> + *
>>>>>>> + * This variant of netif_napi_add() should be used from drivers using NAPI
>>>>>>> + * with CPU intensive poll functions.
>>>>>>> + * This will schedule polling from a high priority workqueue that
>>>>>>> + */
>>>>>>> +static inline void netif_threaded_napi_add(struct net_device *dev,
>>>>>>> +                       struct napi_struct *napi,
>>>>>>> +                       int (*poll)(struct napi_struct *, int),
>>>>>>> +                       int weight)
>>>>>>> +{
>>>>>>> +    set_bit(NAPI_STATE_THREADED, &napi->state);
>>>>>>> +    netif_napi_add(dev, napi, poll, weight);
>>>>>>> +}
>>>>>>> +
>>>>>>>     /**
>>>>>>>      *    netif_tx_napi_add - initialize a NAPI context
>>>>>>>      *    @dev:  network device
>>>>>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>>>>>> index 19f1abc26fcd..11b027f3a2b9 100644
>>>>>>> --- a/net/core/dev.c
>>>>>>> +++ b/net/core/dev.c
>>>>>>> @@ -158,6 +158,7 @@ static DEFINE_SPINLOCK(offload_lock);
>>>>>>>     struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
>>>>>>>     struct list_head ptype_all __read_mostly;    /* Taps */
>>>>>>>     static struct list_head offload_base __read_mostly;
>>>>>>> +static struct workqueue_struct *napi_workq __read_mostly;
>>>>>>>       static int netif_rx_internal(struct sk_buff *skb);
>>>>>>>     static int call_netdevice_notifiers_info(unsigned long val,
>>>>>>> @@ -6286,6 +6287,11 @@ void __napi_schedule(struct napi_struct *n)
>>>>>>>     {
>>>>>>>         unsigned long flags;
>>>>>>>     +    if (test_bit(NAPI_STATE_THREADED, &n->state)) {
>>>>>>> +        queue_work(napi_workq, &n->work);
>>>>>>> +        return;
>>>>>>> +    }
>>>>>>> +
>>>>>> Where is the corresponding cancel_work_sync() or flush_work() at device dismantle ?
>>>>>>
>>>>>> Just hoping the thread will eventually run seems optimistic to me.
>>>>>>
>>>>>>
>>>>>> Quite frankly, I do believe this STATE_THREADED status should be a generic NAPI attribute
>>>>>> that can be changed dynamically, at admin request, instead of having to change/recompile
>>>>>> a driver.
>>>>> thats not that easy. wifi devices do use dummy netdev devices. they are not visible to sysfs and other administrative options.
>>>>> so changing it would just be possible if a special mac80211 based control would be implemented for these drivers.
>>>>> for standard netdev devices it isnt a big thing to implement a administrative control by sysfs (if you are talking about such a feature)
>>>> We do not want to add code in fast path only for one device. We need something truly generic.
>>>>
>>>> I am not saying only the admin can chose, it is fine if a driver does not give the choice
>>>> and will simply call netif_threaded_napi_add()
>>> what could make sense if the feature can be disabled / enabled, but it will only affect drivers using the netif_threaded_napi_add call, but it should not affect drivers
>>> using the old api in any way since not all drivers will work with this feature.
>>
>> If we provide something in core NAPI stack, we want to make sure we can test/use it with other drivers.
>>
>> ethtool, or a /sys/class/net/ethXXX entry could be used.
> but this doesnt work for wifi drivers. since wifi drivers are using dummy netdev devices. we are running in circles here
> i mean a sane way could be also that dummy netdev devices are present in sysfs too which is not the case right now.
> so changing the api, so the driver is forced to set sane virtual dummy netdev name (like the driver name for instance). so it can be accessed by sysfs.
>>
>> The argument about not affecting other drivers is misleading, since the patch adds another conditional test in
>> standard NAPI layer.
>>
>> Lets keep NAPI generic please.
>>
>> Lets make sure syzbot will find bugs without having to attach a specific mac80211 hardware.
> the patch is not mac80211 specific. i  tested it already with network drivers. it is generic.

Not really generic as you have to compile a new driver/kernel, since a driver
will either call netif_threaded_napi_add() or netif_napi_add().

Just to be extra clear : We have reasons to like the proposal, and want to use it,
without manual changes in NAPI drivers.


>>
>> Another concern I have with this patch is that we no longer can contain NIC processing is done
>> on a selected set of cpus (as commanded in /proc/irq/XXX/smp_affinity).
>> Or can we ?
> i had this discussion already with felix in a phonecall last week. kthread vs. workq. his oppinion is that workq works more effective than kthread's
> since kthreads required application support for good balancing like irqbalance. personally i have no real oppinion here. the good point on kthreads is
> that i'm able to track the system load per thread with simple process watching. and its possible to force the thread on a specific cpuset.

In either case we want to be able to tune things from scripts without requiring a human to look at a process list :)

Hint : patch does not use WQ_SYSFS






^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2020-08-04 20:41 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-29 16:50 [PATCH] net: add support for threaded NAPI polling Felix Fietkau
2020-07-29 17:44 ` Eric Dumazet
2020-07-30 14:30   ` Sebastian Gottschall
2020-07-30 15:42     ` David Laight
2020-07-30 17:19       ` Sebastian Gottschall
2020-07-30 16:08     ` Eric Dumazet
2020-07-30 17:21       ` Sebastian Gottschall
2020-07-31 16:36         ` Eric Dumazet
2020-08-02 14:27           ` Sebastian Gottschall
2020-08-04 20:41             ` Eric Dumazet

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.