* [PATCH v2] net: add support for threaded NAPI polling
@ 2020-08-06  9:55 Felix Fietkau
  2020-08-06 11:51   ` kernel test robot
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Felix Fietkau @ 2020-08-06  9:55 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Hillf Danton

For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
poll function does not perform well. Since NAPI poll is bound to the CPU it
was scheduled from, we can easily end up with a few very busy CPUs spending
most of their time in softirq/ksoftirqd and some idle ones.

Introduce threaded NAPI for such drivers based on a workqueue. The API is the
same except for using netif_threaded_napi_add instead of netif_napi_add.
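
For illustration, converting a driver is a one-line change (the private struct
and poll callback names below are just placeholders):

	/* before: poll() runs from softirq context */
	netif_napi_add(dev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);

	/* after: poll() runs from the NAPI workqueue */
	netif_threaded_napi_add(dev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);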

In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
thread.

With threaded NAPI, throughput seems stable and consistent (and higher than
the best results I got without it).

Based on a patch by Hillf Danton

Cc: Hillf Danton <hdanton@sina.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
Changes since PATCH v1:
- use WQ_SYSFS to make workqueue configurable from user space
- cancel work in netif_napi_del
- add a sysfs file to enable/disable threaded NAPI for a netdev (usage sketch below)

Changes since RFC v2:
- fix unused but set variable reported by kbuild test robot

Changes since RFC:
- disable softirq around threaded poll functions
- reuse most parts of napi_poll()
- fix re-schedule condition
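
The new attribute shows up as /sys/class/net/<iface>/napi_threaded. Enabling it
from user space is just a one-byte write (e.g. echo 1 > /sys/class/net/eth0/napi_threaded).
A minimal C equivalent, as a usage sketch only (interface name and error handling
are illustrative):

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		/* write "1" to enable threaded NAPI, "0" to disable it */
		int fd = open("/sys/class/net/eth0/napi_threaded", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, "1", 1) != 1) {
			close(fd);
			return 1;
		}
		close(fd);
		return 0;
	}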

 include/linux/netdevice.h |  23 ++++++
 net/core/dev.c            | 163 ++++++++++++++++++++++++++------------
 net/core/net-sysfs.c      |  42 ++++++++++
 3 files changed, 176 insertions(+), 52 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ac2cd3f49aba..3a39211c7598 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -347,6 +347,7 @@ struct napi_struct {
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	unsigned int		napi_id;
+	struct work_struct	work;
 };
 
 enum {
@@ -357,6 +358,7 @@ enum {
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
 	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+	NAPI_STATE_THREADED,	/* Use threaded NAPI */
 };
 
 enum {
@@ -367,6 +369,7 @@ enum {
 	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
 	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
 	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED),
 };
 
 enum gro_result {
@@ -2315,6 +2318,26 @@ static inline void *netdev_priv(const struct net_device *dev)
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight);
 
+/**
+ *	netif_threaded_napi_add - initialize a NAPI context
+ *	@dev:  network device
+ *	@napi: NAPI context
+ *	@poll: polling function
+ *	@weight: default weight
+ *
+ * This variant of netif_napi_add() should be used from drivers using NAPI
+ * with CPU intensive poll functions.
+ * This will schedule polling from a high priority workqueue.
+ */
+static inline void netif_threaded_napi_add(struct net_device *dev,
+					   struct napi_struct *napi,
+					   int (*poll)(struct napi_struct *, int),
+					   int weight)
+{
+	set_bit(NAPI_STATE_THREADED, &napi->state);
+	netif_napi_add(dev, napi, poll, weight);
+}
+
 /**
  *	netif_tx_napi_add - initialize a NAPI context
  *	@dev:  network device
diff --git a/net/core/dev.c b/net/core/dev.c
index 19f1abc26fcd..4b0dbea68a09 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -158,6 +158,7 @@ static DEFINE_SPINLOCK(offload_lock);
 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 struct list_head ptype_all __read_mostly;	/* Taps */
 static struct list_head offload_base __read_mostly;
+static struct workqueue_struct *napi_workq __read_mostly;
 
 static int netif_rx_internal(struct sk_buff *skb);
 static int call_netdevice_notifiers_info(unsigned long val,
@@ -6286,6 +6287,11 @@ void __napi_schedule(struct napi_struct *n)
 {
 	unsigned long flags;
 
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+		queue_work(napi_workq, &n->work);
+		return;
+	}
+
 	local_irq_save(flags);
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 	local_irq_restore(flags);
@@ -6333,6 +6339,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
  */
 void __napi_schedule_irqoff(struct napi_struct *n)
 {
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+		queue_work(napi_workq, &n->work);
+		return;
+	}
+
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
@@ -6601,6 +6612,95 @@ static void init_gro_hash(struct napi_struct *napi)
 	napi->gro_bitmask = 0;
 }
 
+static int __napi_poll(struct napi_struct *n, bool *repoll)
+{
+	int work, weight;
+
+	weight = n->weight;
+
+	/* This NAPI_STATE_SCHED test is for avoiding a race
+	 * with netpoll's poll_napi().  Only the entity which
+	 * obtains the lock and sees NAPI_STATE_SCHED set will
+	 * actually make the ->poll() call.  Therefore we avoid
+	 * accidentally calling ->poll() when NAPI is not scheduled.
+	 */
+	work = 0;
+	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+		work = n->poll(n, weight);
+		trace_napi_poll(n, work, weight);
+	}
+
+	if (unlikely(work > weight))
+		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
+			    n->poll, work, weight);
+
+	if (likely(work < weight))
+		return work;
+
+	/* Drivers must not modify the NAPI state if they
+	 * consume the entire weight.  In such cases this code
+	 * still "owns" the NAPI instance and therefore can
+	 * move the instance around on the list at-will.
+	 */
+	if (unlikely(napi_disable_pending(n))) {
+		napi_complete(n);
+		return work;
+	}
+
+	if (n->gro_bitmask) {
+		/* flush too old packets
+		 * If HZ < 1000, flush all packets.
+		 */
+		napi_gro_flush(n, HZ >= 1000);
+	}
+
+	gro_normal_list(n);
+
+	/* Some drivers may have called napi_schedule
+	 * prior to exhausting their budget.
+	 */
+	if (unlikely(!list_empty(&n->poll_list))) {
+		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
+			     n->dev ? n->dev->name : "backlog");
+		return work;
+	}
+
+	*repoll = true;
+
+	return work;
+}
+
+static void napi_workfn(struct work_struct *work)
+{
+	struct napi_struct *n = container_of(work, struct napi_struct, work);
+	void *have;
+
+	for (;;) {
+		bool repoll = false;
+
+		local_bh_disable();
+
+		have = netpoll_poll_lock(n);
+		__napi_poll(n, &repoll);
+		netpoll_poll_unlock(have);
+
+		local_bh_enable();
+
+		if (!repoll)
+			return;
+
+		if (!need_resched())
+			continue;
+
+		/*
+		 * have to pay for the latency of task switch even if
+		 * napi is scheduled
+		 */
+		queue_work(napi_workq, work);
+		return;
+	}
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -6621,6 +6721,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 #ifdef CONFIG_NETPOLL
 	napi->poll_owner = -1;
 #endif
+	INIT_WORK(&napi->work, napi_workfn);
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	napi_hash_add(napi);
 }
@@ -6659,6 +6760,7 @@ static void flush_gro_hash(struct napi_struct *napi)
 void netif_napi_del(struct napi_struct *napi)
 {
 	might_sleep();
+	cancel_work_sync(&napi->work);
 	if (napi_hash_del(napi))
 		synchronize_net();
 	list_del_init(&napi->dev_list);
@@ -6671,65 +6773,18 @@ EXPORT_SYMBOL(netif_napi_del);
 
 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 {
+	bool do_repoll = false;
 	void *have;
-	int work, weight;
+	int work;
 
 	list_del_init(&n->poll_list);
 
 	have = netpoll_poll_lock(n);
 
-	weight = n->weight;
-
-	/* This NAPI_STATE_SCHED test is for avoiding a race
-	 * with netpoll's poll_napi().  Only the entity which
-	 * obtains the lock and sees NAPI_STATE_SCHED set will
-	 * actually make the ->poll() call.  Therefore we avoid
-	 * accidentally calling ->poll() when NAPI is not scheduled.
-	 */
-	work = 0;
-	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
-		work = n->poll(n, weight);
-		trace_napi_poll(n, work, weight);
-	}
-
-	if (unlikely(work > weight))
-		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
-			    n->poll, work, weight);
-
-	if (likely(work < weight))
-		goto out_unlock;
-
-	/* Drivers must not modify the NAPI state if they
-	 * consume the entire weight.  In such cases this code
-	 * still "owns" the NAPI instance and therefore can
-	 * move the instance around on the list at-will.
-	 */
-	if (unlikely(napi_disable_pending(n))) {
-		napi_complete(n);
-		goto out_unlock;
-	}
-
-	if (n->gro_bitmask) {
-		/* flush too old packets
-		 * If HZ < 1000, flush all packets.
-		 */
-		napi_gro_flush(n, HZ >= 1000);
-	}
-
-	gro_normal_list(n);
-
-	/* Some drivers may have called napi_schedule
-	 * prior to exhausting their budget.
-	 */
-	if (unlikely(!list_empty(&n->poll_list))) {
-		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
-			     n->dev ? n->dev->name : "backlog");
-		goto out_unlock;
-	}
-
-	list_add_tail(&n->poll_list, repoll);
+	work = __napi_poll(n, &do_repoll);
+	if (do_repoll)
+		list_add_tail(&n->poll_list, repoll);
 
-out_unlock:
 	netpoll_poll_unlock(have);
 
 	return work;
@@ -10676,6 +10731,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 	}
 
+	napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI | WQ_SYSFS,
+				     WQ_UNBOUND_MAX_ACTIVE);
+	BUG_ON(!napi_workq);
+
 	dev_boot_phase = 0;
 
 	/* The loopback device is special if any other network devices
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e353b822bb15..99233e86f4c5 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -471,6 +471,47 @@ static ssize_t proto_down_store(struct device *dev,
 }
 NETDEVICE_SHOW_RW(proto_down, fmt_dec);
 
+static int change_napi_threaded(struct net_device *dev, unsigned long val)
+{
+	struct napi_struct *napi;
+
+	if (list_empty(&dev->napi_list))
+		return -EOPNOTSUPP;
+
+	list_for_each_entry(napi, &dev->napi_list, dev_list) {
+		if (val)
+			set_bit(NAPI_STATE_THREADED, &napi->state);
+		else
+			clear_bit(NAPI_STATE_THREADED, &napi->state);
+	}
+
+	return 0;
+}
+
+static ssize_t napi_threaded_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, change_napi_threaded);
+}
+
+static ssize_t napi_threaded_show(struct device *dev,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	struct napi_struct *napi;
+	bool enabled = false;
+
+	list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+		if (test_bit(NAPI_STATE_THREADED, &napi->state))
+			enabled = true;
+	}
+
+	return sprintf(buf, fmt_dec, enabled);
+}
+DEVICE_ATTR_RW(napi_threaded);
+
 static ssize_t phys_port_id_show(struct device *dev,
 				 struct device_attribute *attr, char *buf)
 {
@@ -563,6 +604,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_tx_queue_len.attr,
 	&dev_attr_gro_flush_timeout.attr,
 	&dev_attr_napi_defer_hard_irqs.attr,
+	&dev_attr_napi_threaded.attr,
 	&dev_attr_phys_port_id.attr,
 	&dev_attr_phys_port_name.attr,
 	&dev_attr_phys_switch_id.attr,
-- 
2.28.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] net: add support for threaded NAPI polling
  2020-08-06  9:55 [PATCH v2] net: add support for threaded NAPI polling Felix Fietkau
@ 2020-08-06 11:51   ` kernel test robot
  2020-08-06 11:51   ` kernel test robot
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2020-08-06 11:51 UTC (permalink / raw)
  To: Felix Fietkau, netdev; +Cc: kbuild-all, Eric Dumazet, Hillf Danton

[-- Attachment #1: Type: text/plain, Size: 1402 bytes --]

Hi Felix,

I love your patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]
[also build test WARNING on linus/master next-20200806]
[cannot apply to net/master ipvs/master v5.8]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Felix-Fietkau/net-add-support-for-threaded-NAPI-polling/20200806-175752
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git c1055b76ad00aed0e8b79417080f212d736246b6
config: i386-randconfig-s001-20200805 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.2-117-g8c7aee71-dirty
        # save the attached .config to linux build tree
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>


sparse warnings: (new ones prefixed by >>)

>> net/core/net-sysfs.c:513:1: sparse: sparse: symbol 'dev_attr_napi_threaded' was not declared. Should it be static?

Please review and possibly fold the followup patch.

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 26068 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] net: add support for threaded NAPI polling
@ 2020-08-06 11:51   ` kernel test robot
  0 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2020-08-06 11:51 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 1438 bytes --]

Hi Felix,

I love your patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]
[also build test WARNING on linus/master next-20200806]
[cannot apply to net/master ipvs/master v5.8]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Felix-Fietkau/net-add-support-for-threaded-NAPI-polling/20200806-175752
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git c1055b76ad00aed0e8b79417080f212d736246b6
config: i386-randconfig-s001-20200805 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.2-117-g8c7aee71-dirty
        # save the attached .config to linux build tree
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>


sparse warnings: (new ones prefixed by >>)

>> net/core/net-sysfs.c:513:1: sparse: sparse: symbol 'dev_attr_napi_threaded' was not declared. Should it be static?

Please review and possibly fold the followup patch.

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 26068 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [RFC PATCH] net: dev_attr_napi_threaded can be static
  2020-08-06  9:55 [PATCH v2] net: add support for threaded NAPI polling Felix Fietkau
@ 2020-08-06 11:51   ` kernel test robot
  2020-08-06 11:51   ` kernel test robot
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2020-08-06 11:51 UTC (permalink / raw)
  To: Felix Fietkau, netdev; +Cc: kbuild-all, Eric Dumazet, Hillf Danton


Signed-off-by: kernel test robot <lkp@intel.com>
---
 net-sysfs.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 8765e075d7e94..2fbbf4b818df4 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -510,7 +510,7 @@ static ssize_t napi_threaded_show(struct device *dev,
 
 	return sprintf(buf, fmt_dec, enabled);
 }
-DEVICE_ATTR_RW(napi_threaded);
+static DEVICE_ATTR_RW(napi_threaded);
 
 static ssize_t phys_port_id_show(struct device *dev,
 				 struct device_attribute *attr, char *buf)

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC PATCH] net: dev_attr_napi_threaded can be static
@ 2020-08-06 11:51   ` kernel test robot
  0 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2020-08-06 11:51 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 590 bytes --]


Signed-off-by: kernel test robot <lkp@intel.com>
---
 net-sysfs.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 8765e075d7e94..2fbbf4b818df4 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -510,7 +510,7 @@ static ssize_t napi_threaded_show(struct device *dev,
 
 	return sprintf(buf, fmt_dec, enabled);
 }
-DEVICE_ATTR_RW(napi_threaded);
+static DEVICE_ATTR_RW(napi_threaded);
 
 static ssize_t phys_port_id_show(struct device *dev,
 				 struct device_attribute *attr, char *buf)

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] net: add support for threaded NAPI polling
  2020-08-06  9:55 [PATCH v2] net: add support for threaded NAPI polling Felix Fietkau
  2020-08-06 11:51   ` kernel test robot
  2020-08-06 11:51   ` kernel test robot
@ 2020-08-06 17:39 ` Eric Dumazet
  2020-08-06 18:55 ` Jakub Kicinski
  2020-08-06 22:48 ` Wei Wang
  4 siblings, 0 replies; 11+ messages in thread
From: Eric Dumazet @ 2020-08-06 17:39 UTC (permalink / raw)
  To: Felix Fietkau, netdev; +Cc: Eric Dumazet, Hillf Danton



On 8/6/20 2:55 AM, Felix Fietkau wrote:
> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
> poll function does not perform well. Since NAPI poll is bound to the CPU it
> was scheduled from, we can easily end up with a few very busy CPUs spending
> most of their time in softirq/ksoftirqd and some idle ones.
> 
> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
> same except for using netif_threaded_napi_add instead of netif_napi_add.
> 
> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
> thread.
> 
> With threaded NAPI, throughput seems stable and consistent (and higher than
> the best results I got without it).
> 
> Based on a patch by Hillf Danton
> 
> Cc: Hillf Danton <hdanton@sina.com>
> Signed-off-by: Felix Fietkau <nbd@nbd.name>

...

> index e353b822bb15..99233e86f4c5 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -471,6 +471,47 @@ static ssize_t proto_down_store(struct device *dev,
>  }
>  NETDEVICE_SHOW_RW(proto_down, fmt_dec);
>  


This belongs to a separate patch, with correct attribution.

> +static int change_napi_threaded(struct net_device *dev, unsigned long val)
> +{
> +	struct napi_struct *napi;
> +
> +	if (list_empty(&dev->napi_list))
> +		return -EOPNOTSUPP;
> +	list_for_each_entry(napi, &dev->napi_list, dev_list) {
> +		if (val)
> +			set_bit(NAPI_STATE_THREADED, &napi->state);
> +		else
> +			clear_bit(NAPI_STATE_THREADED, &napi->state);
> +	}
> +
> +	return 0;
> +}
> +
> +static ssize_t napi_threaded_store(struct device *dev,
> +				struct device_attribute *attr,
> +				const char *buf, size_t len)
> +{
> +	return netdev_store(dev, attr, buf, len, change_napi_threaded);
> +}
> +
> +static ssize_t napi_threaded_show(struct device *dev,
> +				  struct device_attribute *attr,
> +				  char *buf)
> +{
> +	struct net_device *netdev = to_net_dev(dev);
> +	struct napi_struct *napi;
> +	bool enabled = false;
> +


You probably want to use RTNL protection; the list could change under us otherwise.

The write side part is protected already in netdev_store()
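
For the read side, a sketch along the rtnl_trylock()/restart_syscall() pattern
already used elsewhere in net-sysfs.c could look like this:

static ssize_t napi_threaded_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct net_device *netdev = to_net_dev(dev);
	struct napi_struct *napi;
	bool enabled = false;

	if (!rtnl_trylock())
		return restart_syscall();

	list_for_each_entry(napi, &netdev->napi_list, dev_list) {
		if (test_bit(NAPI_STATE_THREADED, &napi->state))
			enabled = true;
	}

	rtnl_unlock();

	return sprintf(buf, fmt_dec, enabled);
}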


> +	list_for_each_entry(napi, &netdev->napi_list, dev_list) {
> +		if (test_bit(NAPI_STATE_THREADED, &napi->state))
> +			enabled = true;
> +	}
> +
> +	return sprintf(buf, fmt_dec, enabled);
> +}
> +DEVICE_ATTR_RW(napi_threaded);
> +
>  static ssize_t phys_port_id_show(struct device *dev,
>  				 struct device_attribute *attr, char *buf)
>  {
> @@ -563,6 +604,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
>  	&dev_attr_tx_queue_len.attr,
>  	&dev_attr_gro_flush_timeout.attr,
>  	&dev_attr_napi_defer_hard_irqs.attr,
> +	&dev_attr_napi_threaded.attr,
>  	&dev_attr_phys_port_id.attr,
>  	&dev_attr_phys_port_name.attr,
>  	&dev_attr_phys_switch_id.attr,
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] net: add support for threaded NAPI polling
  2020-08-06  9:55 [PATCH v2] net: add support for threaded NAPI polling Felix Fietkau
                   ` (2 preceding siblings ...)
  2020-08-06 17:39 ` [PATCH v2] net: add support for threaded NAPI polling Eric Dumazet
@ 2020-08-06 18:55 ` Jakub Kicinski
  2020-08-06 19:25   ` Eric Dumazet
  2020-08-06 22:48 ` Wei Wang
  4 siblings, 1 reply; 11+ messages in thread
From: Jakub Kicinski @ 2020-08-06 18:55 UTC (permalink / raw)
  To: Felix Fietkau; +Cc: netdev, Eric Dumazet, Hillf Danton

On Thu,  6 Aug 2020 11:55:58 +0200 Felix Fietkau wrote:
> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
> poll function does not perform well. Since NAPI poll is bound to the CPU it
> was scheduled from, we can easily end up with a few very busy CPUs spending
> most of their time in softirq/ksoftirqd and some idle ones.
> 
> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
> same except for using netif_threaded_napi_add instead of netif_napi_add.
> 
> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
> thread.
> 
> With threaded NAPI, throughput seems stable and consistent (and higher than
> the best results I got without it).

I'm still trying to wrap my head around this.

Am I understanding correctly that you have one IRQ and multiple NAPI
instances?

Are we not going to end up with pretty terrible cache locality here if
the scheduler starts to throw rx and tx completions around to random
CPUs?

I understand that implementing separate kthreads would be more LoC, but
we do have ksoftirqs already... maybe we should make the NAPI ->
ksoftirq mapping more flexible, and improve the logic which decides to
load ksoftirq rather than make $current() pay?

Sorry for being slow.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] net: add support for threaded NAPI polling
  2020-08-06 18:55 ` Jakub Kicinski
@ 2020-08-06 19:25   ` Eric Dumazet
  2020-08-06 19:57     ` Jakub Kicinski
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2020-08-06 19:25 UTC (permalink / raw)
  To: Jakub Kicinski, Felix Fietkau; +Cc: netdev, Eric Dumazet, Hillf Danton



On 8/6/20 11:55 AM, Jakub Kicinski wrote:
> On Thu,  6 Aug 2020 11:55:58 +0200 Felix Fietkau wrote:
>> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
>> poll function does not perform well. Since NAPI poll is bound to the CPU it
>> was scheduled from, we can easily end up with a few very busy CPUs spending
>> most of their time in softirq/ksoftirqd and some idle ones.
>>
>> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
>> same except for using netif_threaded_napi_add instead of netif_napi_add.
>>
>> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
>> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
>> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
>> thread.
>>
>> With threaded NAPI, throughput seems stable and consistent (and higher than
>> the best results I got without it).
> 
> I'm still trying to wrap my head around this.
> 
> Am I understanding correctly that you have one IRQ and multiple NAPI
> instances?
> 
> Are we not going to end up with pretty terrible cache locality here if
> the scheduler starts to throw rx and tx completions around to random
> CPUs?
> 
> I understand that implementing separate kthreads would be more LoC, but
> we do have ksoftirqs already... maybe we should make the NAPI ->
> ksoftirq mapping more flexible, and improve the logic which decides to
> load ksoftirq rather than make $current() pay?
> 
> Sorry for being slow.
> 


Issue with ksoftirqd is that
- it is bound to a cpu
- Its nice value is 0, meaning that user threads can sometimes compete too much with it.
- It handles all kinds of softirqs, so messing with it might hurt some other layer.

Note that the patch is using a dedicated work queue. It is not going to be practical
if you need to handle two different NICs and want separate pools for each of them.

Ideally, having one kthread per queue would be nice, but then there is more plumbing
work to make these kthreads visible in a convenient way (/sys/class/net/ethX/queues/..../kthread)
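
Purely as an illustration of that direction (not part of this patch; the helper
name is made up, and the schedule path would have to wake_up_process() the
thread instead of calling queue_work()), the poll loop of such a per-queue
kthread could look roughly like this, reusing the __napi_poll() helper from the
patch:

static int napi_kthread_fn(void *data)
{
	struct napi_struct *n = data;

	while (!kthread_should_stop()) {
		bool repoll = false;

		set_current_state(TASK_INTERRUPTIBLE);
		if (!test_bit(NAPI_STATE_SCHED, &n->state)) {
			schedule();	/* wait for the schedule path to wake us */
			continue;
		}
		__set_current_state(TASK_RUNNING);

		local_bh_disable();
		__napi_poll(n, &repoll);
		local_bh_enable();

		if (repoll)
			cond_resched();
	}
	return 0;
}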


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] net: add support for threaded NAPI polling
  2020-08-06 19:25   ` Eric Dumazet
@ 2020-08-06 19:57     ` Jakub Kicinski
  2020-08-06 21:18       ` Eric Dumazet
  0 siblings, 1 reply; 11+ messages in thread
From: Jakub Kicinski @ 2020-08-06 19:57 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Felix Fietkau, netdev, Hillf Danton

On Thu, 6 Aug 2020 12:25:08 -0700 Eric Dumazet wrote:
> On 8/6/20 11:55 AM, Jakub Kicinski wrote:
> > I'm still trying to wrap my head around this.
> > 
> > Am I understanding correctly that you have one IRQ and multiple NAPI
> > instances?
> > 
> > Are we not going to end up with pretty terrible cache locality here if
> > the scheduler starts to throw rx and tx completions around to random
> > CPUs?
> > 
> > I understand that implementing separate kthreads would be more LoC, but
> > we do have ksoftirqs already... maybe we should make the NAPI ->
> > ksoftirq mapping more flexible, and improve the logic which decides to
> > load ksoftirq rather than make $current() pay?
> > 
> > Sorry for being slow.
> 
> Issue with ksoftirqd is that
> - it is bound to a cpu

Do you envision the scheduler balancing or work stealing being
advantageous in some configurations?

I was guessing that for compute workloads having ksoftirq bound will
actually make things more predictable/stable.

For pure routers (where we expect multiple cores to reach 100% just
doing packet forwarding) as long as there is an API to re-balance NAPIs
to cores - a simple specialized user space daemon would probably do a
better job as it can consult packet drop metrics etc.

Obviously I have no data to back up these claims..

> - Its nice value is 0, meaning that user threads can sometimes compete too much with it.

True, I thought we could assume user level tuning.

> - It handles all kinds of softirqs, so messing with it might hurt some other layer.

Right, I have no data on how much this hurts in practice.

> Note that the patch is using a dedicated work queue. It is not going to be practical
> if you need to handle two different NICs and want separate pools for each of them.
> 
> Ideally, having one kthread per queue would be nice, but then there is more plumbing
> work to make these kthreads visible in a convenient way (/sys/class/net/ethX/queues/..../kthread)

Is context switching cost negligible?

ksoftirq-like thread replicates all the NAPI budget-level mixing we
already do today.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] net: add support for threaded NAPI polling
  2020-08-06 19:57     ` Jakub Kicinski
@ 2020-08-06 21:18       ` Eric Dumazet
  0 siblings, 0 replies; 11+ messages in thread
From: Eric Dumazet @ 2020-08-06 21:18 UTC (permalink / raw)
  To: Jakub Kicinski, Eric Dumazet; +Cc: Felix Fietkau, netdev, Hillf Danton



On 8/6/20 12:57 PM, Jakub Kicinski wrote:
> On Thu, 6 Aug 2020 12:25:08 -0700 Eric Dumazet wrote:
>> On 8/6/20 11:55 AM, Jakub Kicinski wrote:
>>> I'm still trying to wrap my head around this.
>>>
>>> Am I understanding correctly that you have one IRQ and multiple NAPI
>>> instances?
>>>
>>> Are we not going to end up with pretty terrible cache locality here if
>>> the scheduler starts to throw rx and tx completions around to random
>>> CPUs?
>>>
>>> I understand that implementing separate kthreads would be more LoC, but
>>> we do have ksoftirqs already... maybe we should make the NAPI ->
>>> ksoftirq mapping more flexible, and improve the logic which decides to
>>> load ksoftirq rather than make $current() pay?
>>>
>>> Sorry for being slow.
>>
>> Issue with ksoftirqd is that
>> - it is bound to a cpu
> 
> Do you envision the scheduler balancing or work stealing being
> advantageous in some configurations?

It seems that softirq stealing too many cycles has been a problem
for the process scheduler for a very long time. Maybe dealing with threads
will help it make decisions instead of having to deal with
interruptions.

> 
> I was guessing that for compute workloads having ksoftirq bound will
> actually make things more predictable/stable.
> 
> For pure routers (where we expect multiple cores to reach 100% just
> doing packet forwarding) as long as there is an API to re-balance NAPIs
> to cores - a simple specialized user space daemon would probably do a
> better job as it can consult packet drop metrics etc.
> 
> Obviously I have no data to back up these claims..
> 
>> - Its nice value is 0, meaning that user threads can sometimes compete too much with it.
> 
> True, I thought we could assume user level tuning.
> 
>> - It handles all kinds of softirqs, so messing with it might hurt some other layer.
> 
> Right, I have no data on how much this hurts in practice.
> 
>> Note that the patch is using a dedicated work queue. It is not going to be practical
>> if you need to handle two different NICs and want separate pools for each of them.
>>
>> Ideally, having one kthread per queue would be nice, but then there is more plumbing
>> work to make these kthreads visible in a convenient way (/sys/class/net/ethX/queues/..../kthread)
> 
> Is context switching cost negligible?

Context switch to a kernel thread is cheap (compared to an arbitrary context switch
from process A to process B), since no MMU games need to be played.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] net: add support for threaded NAPI polling
  2020-08-06  9:55 [PATCH v2] net: add support for threaded NAPI polling Felix Fietkau
                   ` (3 preceding siblings ...)
  2020-08-06 18:55 ` Jakub Kicinski
@ 2020-08-06 22:48 ` Wei Wang
  4 siblings, 0 replies; 11+ messages in thread
From: Wei Wang @ 2020-08-06 22:48 UTC (permalink / raw)
  To: Felix Fietkau; +Cc: Linux Kernel Network Developers, Eric Dumazet, Hillf Danton

Nice patch!
One question inline.

On Thu, Aug 6, 2020 at 2:58 AM Felix Fietkau <nbd@nbd.name> wrote:
>
> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
> poll function does not perform well. Since NAPI poll is bound to the CPU it
> was scheduled from, we can easily end up with a few very busy CPUs spending
> most of their time in softirq/ksoftirqd and some idle ones.
>
> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
> same except for using netif_threaded_napi_add instead of netif_napi_add.
>
> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
> thread.
>
> With threaded NAPI, throughput seems stable and consistent (and higher than
> the best results I got without it).
>
> Based on a patch by Hillf Danton
>
> Cc: Hillf Danton <hdanton@sina.com>
> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> ---
> Changes since PATCH v1:
> - use WQ_SYSFS to make workqueue configurable from user space
> - cancel work in netif_napi_del
> - add a sysfs file to enable/disable threaded NAPI for a netdev
>
> Changes since RFC v2:
> - fix unused but set variable reported by kbuild test robot
>
> Changes since RFC:
> - disable softirq around threaded poll functions
> - reuse most parts of napi_poll()
> - fix re-schedule condition
>
>  include/linux/netdevice.h |  23 ++++++
>  net/core/dev.c            | 163 ++++++++++++++++++++++++++------------
>  net/core/net-sysfs.c      |  42 ++++++++++
>  3 files changed, 176 insertions(+), 52 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index ac2cd3f49aba..3a39211c7598 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -347,6 +347,7 @@ struct napi_struct {
>         struct list_head        dev_list;
>         struct hlist_node       napi_hash_node;
>         unsigned int            napi_id;
> +       struct work_struct      work;
>  };
>
>  enum {
> @@ -357,6 +358,7 @@ enum {
>         NAPI_STATE_HASHED,      /* In NAPI hash (busy polling possible) */
>         NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
>         NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
> +       NAPI_STATE_THREADED,    /* Use threaded NAPI */
>  };
>
>  enum {
> @@ -367,6 +369,7 @@ enum {
>         NAPIF_STATE_HASHED       = BIT(NAPI_STATE_HASHED),
>         NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
>         NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
> +       NAPIF_STATE_THREADED     = BIT(NAPI_STATE_THREADED),
>  };
>
>  enum gro_result {
> @@ -2315,6 +2318,26 @@ static inline void *netdev_priv(const struct net_device *dev)
>  void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
>                     int (*poll)(struct napi_struct *, int), int weight);
>
> +/**
> + *     netif_threaded_napi_add - initialize a NAPI context
> + *     @dev:  network device
> + *     @napi: NAPI context
> + *     @poll: polling function
> + *     @weight: default weight
> + *
> + * This variant of netif_napi_add() should be used from drivers using NAPI
> + * with CPU intensive poll functions.
> + * This will schedule polling from a high priority workqueue.
> + */
> +static inline void netif_threaded_napi_add(struct net_device *dev,
> +                                          struct napi_struct *napi,
> +                                          int (*poll)(struct napi_struct *, int),
> +                                          int weight)
> +{
> +       set_bit(NAPI_STATE_THREADED, &napi->state);
> +       netif_napi_add(dev, napi, poll, weight);
> +}
> +
>  /**
>   *     netif_tx_napi_add - initialize a NAPI context
>   *     @dev:  network device
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 19f1abc26fcd..4b0dbea68a09 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -158,6 +158,7 @@ static DEFINE_SPINLOCK(offload_lock);
>  struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
>  struct list_head ptype_all __read_mostly;      /* Taps */
>  static struct list_head offload_base __read_mostly;
> +static struct workqueue_struct *napi_workq __read_mostly;
>
>  static int netif_rx_internal(struct sk_buff *skb);
>  static int call_netdevice_notifiers_info(unsigned long val,
> @@ -6286,6 +6287,11 @@ void __napi_schedule(struct napi_struct *n)
>  {
>         unsigned long flags;
>
> +       if (test_bit(NAPI_STATE_THREADED, &n->state)) {
> +               queue_work(napi_workq, &n->work);
> +               return;
> +       }
> +
>         local_irq_save(flags);
>         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
>         local_irq_restore(flags);
> @@ -6333,6 +6339,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
>   */
>  void __napi_schedule_irqoff(struct napi_struct *n)
>  {
> +       if (test_bit(NAPI_STATE_THREADED, &n->state)) {
> +               queue_work(napi_workq, &n->work);
> +               return;
> +       }
> +
>         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
>  }
>  EXPORT_SYMBOL(__napi_schedule_irqoff);
> @@ -6601,6 +6612,95 @@ static void init_gro_hash(struct napi_struct *napi)
>         napi->gro_bitmask = 0;
>  }
>
> +static int __napi_poll(struct napi_struct *n, bool *repoll)
> +{
> +       int work, weight;
> +
> +       weight = n->weight;
> +
> +       /* This NAPI_STATE_SCHED test is for avoiding a race
> +        * with netpoll's poll_napi().  Only the entity which
> +        * obtains the lock and sees NAPI_STATE_SCHED set will
> +        * actually make the ->poll() call.  Therefore we avoid
> +        * accidentally calling ->poll() when NAPI is not scheduled.
> +        */
> +       work = 0;
> +       if (test_bit(NAPI_STATE_SCHED, &n->state)) {
> +               work = n->poll(n, weight);
> +               trace_napi_poll(n, work, weight);
> +       }
> +
> +       if (unlikely(work > weight))
> +               pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
> +                           n->poll, work, weight);
> +
> +       if (likely(work < weight))
> +               return work;
> +
> +       /* Drivers must not modify the NAPI state if they
> +        * consume the entire weight.  In such cases this code
> +        * still "owns" the NAPI instance and therefore can
> +        * move the instance around on the list at-will.
> +        */
> +       if (unlikely(napi_disable_pending(n))) {
> +               napi_complete(n);
> +               return work;
> +       }
> +
> +       if (n->gro_bitmask) {
> +               /* flush too old packets
> +                * If HZ < 1000, flush all packets.
> +                */
> +               napi_gro_flush(n, HZ >= 1000);
> +       }
> +
> +       gro_normal_list(n);
> +
> +       /* Some drivers may have called napi_schedule
> +        * prior to exhausting their budget.
> +        */
> +       if (unlikely(!list_empty(&n->poll_list))) {
> +               pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
> +                            n->dev ? n->dev->name : "backlog");
> +               return work;
> +       }
> +
> +       *repoll = true;
> +
> +       return work;
> +}
> +
> +static void napi_workfn(struct work_struct *work)
> +{
> +       struct napi_struct *n = container_of(work, struct napi_struct, work);
> +       void *have;
> +
> +       for (;;) {
> +               bool repoll = false;
> +
> +               local_bh_disable();
> +
> +               have = netpoll_poll_lock(n);
> +               __napi_poll(n, &repoll);
> +               netpoll_poll_unlock(have);
> +
> +               local_bh_enable();
> +
> +               if (!repoll)
> +                       return;
> +
> +               if (!need_resched())
> +                       continue;
> +
> +               /*
> +                * have to pay for the latency of task switch even if
> +                * napi is scheduled
> +                */
> +               queue_work(napi_workq, work);
> +               return;
> +       }
> +}
> +
>  void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
>                     int (*poll)(struct napi_struct *, int), int weight)
>  {
> @@ -6621,6 +6721,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
>  #ifdef CONFIG_NETPOLL
>         napi->poll_owner = -1;
>  #endif
> +       INIT_WORK(&napi->work, napi_workfn);
>         set_bit(NAPI_STATE_SCHED, &napi->state);
>         napi_hash_add(napi);
>  }
> @@ -6659,6 +6760,7 @@ static void flush_gro_hash(struct napi_struct *napi)
>  void netif_napi_del(struct napi_struct *napi)
>  {
>         might_sleep();
> +       cancel_work_sync(&napi->work);
>         if (napi_hash_del(napi))
>                 synchronize_net();
>         list_del_init(&napi->dev_list);
> @@ -6671,65 +6773,18 @@ EXPORT_SYMBOL(netif_napi_del);
>
>  static int napi_poll(struct napi_struct *n, struct list_head *repoll)
>  {
> +       bool do_repoll = false;
>         void *have;
> -       int work, weight;
> +       int work;
>
>         list_del_init(&n->poll_list);
>
>         have = netpoll_poll_lock(n);
>
> -       weight = n->weight;
> -
> -       /* This NAPI_STATE_SCHED test is for avoiding a race
> -        * with netpoll's poll_napi().  Only the entity which
> -        * obtains the lock and sees NAPI_STATE_SCHED set will
> -        * actually make the ->poll() call.  Therefore we avoid
> -        * accidentally calling ->poll() when NAPI is not scheduled.
> -        */
> -       work = 0;
> -       if (test_bit(NAPI_STATE_SCHED, &n->state)) {
> -               work = n->poll(n, weight);
> -               trace_napi_poll(n, work, weight);
> -       }
> -
> -       if (unlikely(work > weight))
> -               pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
> -                           n->poll, work, weight);
> -
> -       if (likely(work < weight))
> -               goto out_unlock;
> -
> -       /* Drivers must not modify the NAPI state if they
> -        * consume the entire weight.  In such cases this code
> -        * still "owns" the NAPI instance and therefore can
> -        * move the instance around on the list at-will.
> -        */
> -       if (unlikely(napi_disable_pending(n))) {
> -               napi_complete(n);
> -               goto out_unlock;
> -       }
> -
> -       if (n->gro_bitmask) {
> -               /* flush too old packets
> -                * If HZ < 1000, flush all packets.
> -                */
> -               napi_gro_flush(n, HZ >= 1000);
> -       }
> -
> -       gro_normal_list(n);
> -
> -       /* Some drivers may have called napi_schedule
> -        * prior to exhausting their budget.
> -        */
> -       if (unlikely(!list_empty(&n->poll_list))) {
> -               pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
> -                            n->dev ? n->dev->name : "backlog");
> -               goto out_unlock;
> -       }
> -
> -       list_add_tail(&n->poll_list, repoll);
> +       work = __napi_poll(n, &do_repoll);
> +       if (do_repoll)
> +               list_add_tail(&n->poll_list, repoll);
>
> -out_unlock:
>         netpoll_poll_unlock(have);
>
>         return work;
> @@ -10676,6 +10731,10 @@ static int __init net_dev_init(void)
>                 sd->backlog.weight = weight_p;
>         }
>
> +       napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI | WQ_SYSFS,
> +                                    WQ_UNBOUND_MAX_ACTIVE);
> +       BUG_ON(!napi_workq);
> +
>         dev_boot_phase = 0;
>
>         /* The loopback device is special if any other network devices
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index e353b822bb15..99233e86f4c5 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -471,6 +471,47 @@ static ssize_t proto_down_store(struct device *dev,
>  }
>  NETDEVICE_SHOW_RW(proto_down, fmt_dec);
>
> +static int change_napi_threaded(struct net_device *dev, unsigned long val)
> +{
> +       struct napi_struct *napi;
> +
> +       if (list_empty(&dev->napi_list))
> +               return -EOPNOTSUPP;
> +
> +       list_for_each_entry(napi, &dev->napi_list, dev_list) {
> +               if (val)
> +                       set_bit(NAPI_STATE_THREADED, &napi->state);
> +               else
> +                       clear_bit(NAPI_STATE_THREADED, &napi->state);
> +       }
> +
> +       return 0;
> +}

I am wondering if there will be issues with modifying NAPI_STATE_THREADED
while the napi is currently queued on sd->poll_list. I think that
could happen, right?
In this case, this napi will be processed in softirq context later.
But in the meantime, it is possible that this napi also gets queued
to the workqueue. Will there be any issues with that?
Thinking about it more: __napi_poll() checks the NAPI_STATE_SCHED bit
before calling the poll() function, and the workqueue handler disables
softirqs, so it should be OK?

> +
> +static ssize_t napi_threaded_store(struct device *dev,
> +                               struct device_attribute *attr,
> +                               const char *buf, size_t len)
> +{
> +       return netdev_store(dev, attr, buf, len, change_napi_threaded);
> +}
> +
> +static ssize_t napi_threaded_show(struct device *dev,
> +                                 struct device_attribute *attr,
> +                                 char *buf)
> +{
> +       struct net_device *netdev = to_net_dev(dev);
> +       struct napi_struct *napi;
> +       bool enabled = false;
> +
> +       list_for_each_entry(napi, &netdev->napi_list, dev_list) {
> +               if (test_bit(NAPI_STATE_THREADED, &napi->state))
> +                       enabled = true;
> +       }
> +
> +       return sprintf(buf, fmt_dec, enabled);
> +}
> +DEVICE_ATTR_RW(napi_threaded);
> +
>  static ssize_t phys_port_id_show(struct device *dev,
>                                  struct device_attribute *attr, char *buf)
>  {
> @@ -563,6 +604,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
>         &dev_attr_tx_queue_len.attr,
>         &dev_attr_gro_flush_timeout.attr,
>         &dev_attr_napi_defer_hard_irqs.attr,
> +       &dev_attr_napi_threaded.attr,
>         &dev_attr_phys_port_id.attr,
>         &dev_attr_phys_port_name.attr,
>         &dev_attr_phys_switch_id.attr,
> --
> 2.28.0
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2020-08-06 22:48 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-06  9:55 [PATCH v2] net: add support for threaded NAPI polling Felix Fietkau
2020-08-06 11:51 ` kernel test robot
2020-08-06 11:51   ` kernel test robot
2020-08-06 11:51 ` [RFC PATCH] net: dev_attr_napi_threaded can be static kernel test robot
2020-08-06 11:51   ` kernel test robot
2020-08-06 17:39 ` [PATCH v2] net: add support for threaded NAPI polling Eric Dumazet
2020-08-06 18:55 ` Jakub Kicinski
2020-08-06 19:25   ` Eric Dumazet
2020-08-06 19:57     ` Jakub Kicinski
2020-08-06 21:18       ` Eric Dumazet
2020-08-06 22:48 ` Wei Wang
