* [PATCH net-next v10 1/3] net: extract napi poll functionality to __napi_poll()
2021-02-04 21:31 [PATCH net-next v10 0/3] implement kthread based napi poll Wei Wang
@ 2021-02-04 21:31 ` Wei Wang
2021-02-04 21:31 ` [PATCH net-next v10 2/3] net: implement threaded-able napi poll loop support Wei Wang
2021-02-04 21:31 ` [PATCH net-next v10 3/3] net: add sysfs attribute to control napi threaded mode Wei Wang
2 siblings, 0 replies; 7+ messages in thread
From: Wei Wang @ 2021-02-04 21:31 UTC (permalink / raw)
To: David Miller, netdev, Jakub Kicinski
Cc: Paolo Abeni, Hannes Frederic Sowa, Eric Dumazet, Felix Fietkau,
Alexander Duyck
From: Felix Fietkau <nbd@nbd.name>
This commit introduces a new function __napi_poll() which does the main
logic of the existing napi_poll() function, and will be called by other
functions in later commits.
This idea and implementation is done by Felix Fietkau <nbd@nbd.name> and
is proposed as part of the patch to move napi work to work_queue
context.
This commit by itself is a code restructure.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
---
net/core/dev.c | 36 ++++++++++++++++++++++++------------
1 file changed, 24 insertions(+), 12 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index aae116d059da..0fd40b9847c3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6781,15 +6781,10 @@ void __netif_napi_del(struct napi_struct *napi)
}
EXPORT_SYMBOL(__netif_napi_del);
-static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+static int __napi_poll(struct napi_struct *n, bool *repoll)
{
- void *have;
int work, weight;
- list_del_init(&n->poll_list);
-
- have = netpoll_poll_lock(n);
-
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
@@ -6809,7 +6804,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
n->poll, work, weight);
if (likely(work < weight))
- goto out_unlock;
+ return work;
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
@@ -6818,7 +6813,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
*/
if (unlikely(napi_disable_pending(n))) {
napi_complete(n);
- goto out_unlock;
+ return work;
}
/* The NAPI context has more processing work, but busy-polling
@@ -6831,7 +6826,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
*/
napi_schedule(n);
}
- goto out_unlock;
+ return work;
}
if (n->gro_bitmask) {
@@ -6849,12 +6844,29 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
if (unlikely(!list_empty(&n->poll_list))) {
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
n->dev ? n->dev->name : "backlog");
- goto out_unlock;
+ return work;
}
- list_add_tail(&n->poll_list, repoll);
+ *repoll = true;
+
+ return work;
+}
+
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+{
+ bool do_repoll = false;
+ void *have;
+ int work;
+
+ list_del_init(&n->poll_list);
+
+ have = netpoll_poll_lock(n);
+
+ work = __napi_poll(n, &do_repoll);
+
+ if (do_repoll)
+ list_add_tail(&n->poll_list, repoll);
-out_unlock:
netpoll_poll_unlock(have);
return work;
--
2.30.0.365.g02bc693789-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH net-next v10 2/3] net: implement threaded-able napi poll loop support
2021-02-04 21:31 [PATCH net-next v10 0/3] implement kthread based napi poll Wei Wang
2021-02-04 21:31 ` [PATCH net-next v10 1/3] net: extract napi poll functionality to __napi_poll() Wei Wang
@ 2021-02-04 21:31 ` Wei Wang
2021-02-04 21:37 ` Alexander Duyck
2021-02-04 21:31 ` [PATCH net-next v10 3/3] net: add sysfs attribute to control napi threaded mode Wei Wang
2 siblings, 1 reply; 7+ messages in thread
From: Wei Wang @ 2021-02-04 21:31 UTC (permalink / raw)
To: David Miller, netdev, Jakub Kicinski
Cc: Paolo Abeni, Hannes Frederic Sowa, Eric Dumazet, Felix Fietkau,
Alexander Duyck
This patch allows running each napi poll loop inside its own
kernel thread.
The kthread is created during netif_napi_add() if dev->threaded
is set. And threaded mode is enabled in napi_enable(). We will
provide a way to set dev->threaded and enable threaded mode
without a device up/down in the following patch.
Once that threaded mode is enabled and the kthread is
started, napi_schedule() will wake-up such thread instead
of scheduling the softirq.
The threaded poll loop behaves quite likely the net_rx_action,
but it does not have to manipulate local irqs and uses
an explicit scheduling point based on netdev_budget.
Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Co-developed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Wei Wang <weiwan@google.com>
---
include/linux/netdevice.h | 21 +++----
net/core/dev.c | 112 ++++++++++++++++++++++++++++++++++++++
2 files changed, 119 insertions(+), 14 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e9e7ada07ea1..99fb4ec9573e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -347,6 +347,7 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
unsigned int napi_id;
+ struct task_struct *thread;
};
enum {
@@ -358,6 +359,7 @@ enum {
NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */
NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/
+ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/
};
enum {
@@ -369,6 +371,7 @@ enum {
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL),
+ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
};
enum gro_result {
@@ -503,20 +506,7 @@ static inline bool napi_complete(struct napi_struct *n)
*/
void napi_disable(struct napi_struct *n);
-/**
- * napi_enable - enable NAPI scheduling
- * @n: NAPI context
- *
- * Resume NAPI from being scheduled on this context.
- * Must be paired with napi_disable.
- */
-static inline void napi_enable(struct napi_struct *n)
-{
- BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
- smp_mb__before_atomic();
- clear_bit(NAPI_STATE_SCHED, &n->state);
- clear_bit(NAPI_STATE_NPSVC, &n->state);
-}
+void napi_enable(struct napi_struct *n);
/**
* napi_synchronize - wait until NAPI is not running
@@ -1827,6 +1817,8 @@ enum netdev_priv_flags {
*
* @wol_enabled: Wake-on-LAN is enabled
*
+ * @threaded: napi threaded mode is enabled
+ *
* @net_notifier_list: List of per-net netdev notifier block
* that follow this device when it is moved
* to another network namespace.
@@ -2145,6 +2137,7 @@ struct net_device {
struct lock_class_key *qdisc_running_key;
bool proto_down;
unsigned wol_enabled:1;
+ unsigned threaded:1;
struct list_head net_notifier_list;
diff --git a/net/core/dev.c b/net/core/dev.c
index 0fd40b9847c3..a8c5eca17074 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -91,6 +91,7 @@
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/skbuff.h>
+#include <linux/kthread.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
@@ -1493,6 +1494,27 @@ void netdev_notify_peers(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_notify_peers);
+static int napi_threaded_poll(void *data);
+
+static int napi_kthread_create(struct napi_struct *n)
+{
+ int err = 0;
+
+ /* Create and wake up the kthread once to put it in
+ * TASK_INTERRUPTIBLE mode to avoid the blocked task
+ * warning and work with loadavg.
+ */
+ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
+ n->dev->name, n->napi_id);
+ if (IS_ERR(n->thread)) {
+ err = PTR_ERR(n->thread);
+ pr_err("kthread_run failed with err %d\n", err);
+ n->thread = NULL;
+ }
+
+ return err;
+}
+
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -4264,6 +4286,21 @@ int gro_normal_batch __read_mostly = 8;
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
+ struct task_struct *thread;
+
+ if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
+ /* Paired with smp_mb__before_atomic() in
+ * napi_enable(). Use READ_ONCE() to guarantee
+ * a complete read on napi->thread. Only call
+ * wake_up_process() when it's not NULL.
+ */
+ thread = READ_ONCE(napi->thread);
+ if (thread) {
+ wake_up_process(thread);
+ return;
+ }
+ }
+
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
@@ -6733,6 +6770,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
napi_hash_add(napi);
+ /* Create kthread for this napi if dev->threaded is set.
+ * Clear dev->threaded if kthread creation failed so that
+ * threaded mode will not be enabled in napi_enable().
+ */
+ if (dev->threaded && napi_kthread_create(napi))
+ dev->threaded = 0;
}
EXPORT_SYMBOL(netif_napi_add);
@@ -6750,9 +6793,28 @@ void napi_disable(struct napi_struct *n)
clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state);
+ clear_bit(NAPI_STATE_THREADED, &n->state);
}
EXPORT_SYMBOL(napi_disable);
+/**
+ * napi_enable - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Resume NAPI from being scheduled on this context.
+ * Must be paired with napi_disable.
+ */
+void napi_enable(struct napi_struct *n)
+{
+ BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+ smp_mb__before_atomic();
+ clear_bit(NAPI_STATE_SCHED, &n->state);
+ clear_bit(NAPI_STATE_NPSVC, &n->state);
+ if (n->dev->threaded && n->thread)
+ set_bit(NAPI_STATE_THREADED, &n->state);
+}
+EXPORT_SYMBOL(napi_enable);
+
static void flush_gro_hash(struct napi_struct *napi)
{
int i;
@@ -6778,6 +6840,11 @@ void __netif_napi_del(struct napi_struct *napi)
flush_gro_hash(napi);
napi->gro_bitmask = 0;
+
+ if (napi->thread) {
+ kthread_stop(napi->thread);
+ napi->thread = NULL;
+ }
}
EXPORT_SYMBOL(__netif_napi_del);
@@ -6872,6 +6939,51 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
return work;
}
+static int napi_thread_wait(struct napi_struct *napi)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop() && !napi_disable_pending(napi)) {
+ if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+ WARN_ON(!list_empty(&napi->poll_list));
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return -1;
+}
+
+static int napi_threaded_poll(void *data)
+{
+ struct napi_struct *napi = data;
+ void *have;
+
+ while (!napi_thread_wait(napi)) {
+ for (;;) {
+ bool repoll = false;
+
+ local_bh_disable();
+
+ have = netpoll_poll_lock(napi);
+ __napi_poll(napi, &repoll);
+ netpoll_poll_unlock(have);
+
+ __kfree_skb_flush();
+ local_bh_enable();
+
+ if (!repoll)
+ break;
+
+ cond_resched();
+ }
+ }
+ return 0;
+}
+
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
--
2.30.0.365.g02bc693789-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH net-next v10 2/3] net: implement threaded-able napi poll loop support
2021-02-04 21:31 ` [PATCH net-next v10 2/3] net: implement threaded-able napi poll loop support Wei Wang
@ 2021-02-04 21:37 ` Alexander Duyck
0 siblings, 0 replies; 7+ messages in thread
From: Alexander Duyck @ 2021-02-04 21:37 UTC (permalink / raw)
To: Wei Wang
Cc: David Miller, Netdev, Jakub Kicinski, Paolo Abeni,
Hannes Frederic Sowa, Eric Dumazet, Felix Fietkau,
Alexander Duyck
On Thu, Feb 4, 2021 at 1:34 PM Wei Wang <weiwan@google.com> wrote:
>
> This patch allows running each napi poll loop inside its own
> kernel thread.
> The kthread is created during netif_napi_add() if dev->threaded
> is set. And threaded mode is enabled in napi_enable(). We will
> provide a way to set dev->threaded and enable threaded mode
> without a device up/down in the following patch.
>
> Once that threaded mode is enabled and the kthread is
> started, napi_schedule() will wake-up such thread instead
> of scheduling the softirq.
>
> The threaded poll loop behaves quite likely the net_rx_action,
> but it does not have to manipulate local irqs and uses
> an explicit scheduling point based on netdev_budget.
>
> Co-developed-by: Paolo Abeni <pabeni@redhat.com>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Co-developed-by: Jakub Kicinski <kuba@kernel.org>
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> Signed-off-by: Wei Wang <weiwan@google.com>
> ---
> include/linux/netdevice.h | 21 +++----
> net/core/dev.c | 112 ++++++++++++++++++++++++++++++++++++++
> 2 files changed, 119 insertions(+), 14 deletions(-)
>
Looks good.
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH net-next v10 3/3] net: add sysfs attribute to control napi threaded mode
2021-02-04 21:31 [PATCH net-next v10 0/3] implement kthread based napi poll Wei Wang
2021-02-04 21:31 ` [PATCH net-next v10 1/3] net: extract napi poll functionality to __napi_poll() Wei Wang
2021-02-04 21:31 ` [PATCH net-next v10 2/3] net: implement threaded-able napi poll loop support Wei Wang
@ 2021-02-04 21:31 ` Wei Wang
2021-02-04 22:16 ` Alexander Duyck
2 siblings, 1 reply; 7+ messages in thread
From: Wei Wang @ 2021-02-04 21:31 UTC (permalink / raw)
To: David Miller, netdev, Jakub Kicinski
Cc: Paolo Abeni, Hannes Frederic Sowa, Eric Dumazet, Felix Fietkau,
Alexander Duyck
This patch adds a new sysfs attribute to the network device class.
Said attribute provides a per-device control to enable/disable the
threaded mode for all the napi instances of the given network device,
without the need for a device up/down.
User sets it to 1 or 0 to enable or disable threaded mode.
Note: when switching between threaded and the current softirq based mode
for a napi instance, it will not immediately take effect if the napi is
currently being polled. The mode switch will happen for the next time
napi_schedule() is called.
Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Co-developed-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Wei Wang <weiwan@google.com>
---
Documentation/ABI/testing/sysfs-class-net | 15 +++++
include/linux/netdevice.h | 2 +
net/core/dev.c | 67 ++++++++++++++++++++++-
net/core/net-sysfs.c | 45 +++++++++++++++
4 files changed, 127 insertions(+), 2 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net
index 1f2002df5ba2..1419103d11f9 100644
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -337,3 +337,18 @@ Contact: netdev@vger.kernel.org
Description:
32-bit unsigned integer counting the number of times the link has
been down
+
+What: /sys/class/net/<iface>/threaded
+Date: Jan 2021
+KernelVersion: 5.12
+Contact: netdev@vger.kernel.org
+Description:
+ Boolean value to control the threaded mode per device. User could
+ set this value to enable/disable threaded mode for all napi
+ belonging to this device, without the need to do device up/down.
+
+ Possible values:
+ == ==================================
+ 0 threaded mode disabled for this dev
+ 1 threaded mode enabled for this dev
+ == ==================================
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 99fb4ec9573e..1340327f7abf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -497,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n)
return napi_complete_done(n, 0);
}
+int dev_set_threaded(struct net_device *dev, bool threaded);
+
/**
* napi_disable - prevent NAPI from scheduling
* @n: NAPI context
diff --git a/net/core/dev.c b/net/core/dev.c
index a8c5eca17074..9cc9b245419e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4290,8 +4290,9 @@ static inline void ____napi_schedule(struct softnet_data *sd,
if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
/* Paired with smp_mb__before_atomic() in
- * napi_enable(). Use READ_ONCE() to guarantee
- * a complete read on napi->thread. Only call
+ * napi_enable()/napi_set_threaded().
+ * Use READ_ONCE() to guarantee a complete
+ * read on napi->thread. Only call
* wake_up_process() when it's not NULL.
*/
thread = READ_ONCE(napi->thread);
@@ -6743,6 +6744,68 @@ static void init_gro_hash(struct napi_struct *napi)
napi->gro_bitmask = 0;
}
+/* Setting/unsetting threaded mode on a napi might not immediately
+ * take effect, if the current napi instance is actively being
+ * polled. In this case, the switch between threaded mode and
+ * softirq mode will happen in the next round of napi_schedule().
+ * This should not cause hiccups/stalls to the live traffic.
+ */
+static int napi_set_threaded(struct napi_struct *n, bool threaded)
+{
+ int err = 0;
+
+ if (threaded == !!test_bit(NAPI_STATE_THREADED, &n->state))
+ return 0;
+
+ if (!threaded) {
+ clear_bit(NAPI_STATE_THREADED, &n->state);
+ return 0;
+ }
+
+ if (!n->thread) {
+ err = napi_kthread_create(n);
+ if (err)
+ return err;
+ }
+
+ /* Make sure kthread is created before THREADED bit
+ * is set.
+ */
+ smp_mb__before_atomic();
+ set_bit(NAPI_STATE_THREADED, &n->state);
+
+ return 0;
+}
+
+static void dev_disable_threaded_all(struct net_device *dev)
+{
+ struct napi_struct *napi;
+
+ list_for_each_entry(napi, &dev->napi_list, dev_list)
+ napi_set_threaded(napi, false);
+ dev->threaded = 0;
+}
+
+int dev_set_threaded(struct net_device *dev, bool threaded)
+{
+ struct napi_struct *napi;
+ int ret;
+
+ dev->threaded = threaded;
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ ret = napi_set_threaded(napi, threaded);
+ if (ret) {
+ /* Error occurred on one of the napi,
+ * reset threaded mode on all napi.
+ */
+ dev_disable_threaded_all(dev);
+ break;
+ }
+ }
+
+ return ret;
+}
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index daf502c13d6d..969743567257 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -538,6 +538,50 @@ static ssize_t phys_switch_id_show(struct device *dev,
}
static DEVICE_ATTR_RO(phys_switch_id);
+static ssize_t threaded_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ int ret;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (!dev_isalive(netdev)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ ret = sprintf(buf, fmt_dec, netdev->threaded);
+
+unlock:
+ rtnl_unlock();
+ return ret;
+}
+
+static int modify_napi_threaded(struct net_device *dev, unsigned long val)
+{
+ int ret;
+
+ if (list_empty(&dev->napi_list))
+ return -EOPNOTSUPP;
+
+ if (val != 0 && val != 1)
+ return -EOPNOTSUPP;
+
+ ret = dev_set_threaded(dev, val);
+
+ return ret;
+}
+
+static ssize_t threaded_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, modify_napi_threaded);
+}
+static DEVICE_ATTR_RW(threaded);
+
static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_netdev_group.attr,
&dev_attr_type.attr,
@@ -570,6 +614,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_proto_down.attr,
&dev_attr_carrier_up_count.attr,
&dev_attr_carrier_down_count.attr,
+ &dev_attr_threaded.attr,
NULL,
};
ATTRIBUTE_GROUPS(net_class);
--
2.30.0.365.g02bc693789-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH net-next v10 3/3] net: add sysfs attribute to control napi threaded mode
2021-02-04 21:31 ` [PATCH net-next v10 3/3] net: add sysfs attribute to control napi threaded mode Wei Wang
@ 2021-02-04 22:16 ` Alexander Duyck
2021-02-05 0:42 ` Wei Wang
0 siblings, 1 reply; 7+ messages in thread
From: Alexander Duyck @ 2021-02-04 22:16 UTC (permalink / raw)
To: Wei Wang
Cc: David Miller, Netdev, Jakub Kicinski, Paolo Abeni,
Hannes Frederic Sowa, Eric Dumazet, Felix Fietkau,
Alexander Duyck
On Thu, Feb 4, 2021 at 1:35 PM Wei Wang <weiwan@google.com> wrote:
>
> This patch adds a new sysfs attribute to the network device class.
> Said attribute provides a per-device control to enable/disable the
> threaded mode for all the napi instances of the given network device,
> without the need for a device up/down.
> User sets it to 1 or 0 to enable or disable threaded mode.
> Note: when switching between threaded and the current softirq based mode
> for a napi instance, it will not immediately take effect if the napi is
> currently being polled. The mode switch will happen for the next time
> napi_schedule() is called.
>
> Co-developed-by: Paolo Abeni <pabeni@redhat.com>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Co-developed-by: Felix Fietkau <nbd@nbd.name>
> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> Signed-off-by: Wei Wang <weiwan@google.com>
> ---
> Documentation/ABI/testing/sysfs-class-net | 15 +++++
> include/linux/netdevice.h | 2 +
> net/core/dev.c | 67 ++++++++++++++++++++++-
> net/core/net-sysfs.c | 45 +++++++++++++++
> 4 files changed, 127 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net
> index 1f2002df5ba2..1419103d11f9 100644
> --- a/Documentation/ABI/testing/sysfs-class-net
> +++ b/Documentation/ABI/testing/sysfs-class-net
> @@ -337,3 +337,18 @@ Contact: netdev@vger.kernel.org
> Description:
> 32-bit unsigned integer counting the number of times the link has
> been down
> +
> +What: /sys/class/net/<iface>/threaded
> +Date: Jan 2021
> +KernelVersion: 5.12
> +Contact: netdev@vger.kernel.org
> +Description:
> + Boolean value to control the threaded mode per device. User could
> + set this value to enable/disable threaded mode for all napi
> + belonging to this device, without the need to do device up/down.
> +
> + Possible values:
> + == ==================================
> + 0 threaded mode disabled for this dev
> + 1 threaded mode enabled for this dev
> + == ==================================
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 99fb4ec9573e..1340327f7abf 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -497,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n)
> return napi_complete_done(n, 0);
> }
>
> +int dev_set_threaded(struct net_device *dev, bool threaded);
> +
> /**
> * napi_disable - prevent NAPI from scheduling
> * @n: NAPI context
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a8c5eca17074..9cc9b245419e 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4290,8 +4290,9 @@ static inline void ____napi_schedule(struct softnet_data *sd,
>
> if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
> /* Paired with smp_mb__before_atomic() in
> - * napi_enable(). Use READ_ONCE() to guarantee
> - * a complete read on napi->thread. Only call
> + * napi_enable()/napi_set_threaded().
> + * Use READ_ONCE() to guarantee a complete
> + * read on napi->thread. Only call
> * wake_up_process() when it's not NULL.
> */
> thread = READ_ONCE(napi->thread);
> @@ -6743,6 +6744,68 @@ static void init_gro_hash(struct napi_struct *napi)
> napi->gro_bitmask = 0;
> }
>
> +/* Setting/unsetting threaded mode on a napi might not immediately
> + * take effect, if the current napi instance is actively being
> + * polled. In this case, the switch between threaded mode and
> + * softirq mode will happen in the next round of napi_schedule().
> + * This should not cause hiccups/stalls to the live traffic.
> + */
> +static int napi_set_threaded(struct napi_struct *n, bool threaded)
> +{
> + int err = 0;
> +
> + if (threaded == !!test_bit(NAPI_STATE_THREADED, &n->state))
> + return 0;
> +
> + if (!threaded) {
> + clear_bit(NAPI_STATE_THREADED, &n->state);
> + return 0;
> + }
> +
> + if (!n->thread) {
> + err = napi_kthread_create(n);
> + if (err)
> + return err;
> + }
This piece needs to be broken out similar to what we did for the
napi_add and napi enable. In the case where we are enabling the
threaded NAPI you should first go through and allocate all the
threads. Then once all the threads are allocated you then enable them
by setting the NAPI_STATE_THREADED bit.
I would pull this section out and place it in a loop in
dev_set_threaded to handle creating the threads before you set
dev->threaded and then set the threaded flags in the napi instances.
> +
> + /* Make sure kthread is created before THREADED bit
> + * is set.
> + */
> + smp_mb__before_atomic();
If we split off the allocation like I mentioned earlier we can
probably pull out the barrier and place it after the allocation of the
kthreads.
> + set_bit(NAPI_STATE_THREADED, &n->state);
> +
> + return 0;
> +}
> +
With the change I suggested above you could also drop the return type
from this because it should always succeed.
> +static void dev_disable_threaded_all(struct net_device *dev)
> +{
> + struct napi_struct *napi;
> +
> + list_for_each_entry(napi, &dev->napi_list, dev_list)
> + napi_set_threaded(napi, false);
> + dev->threaded = 0;
> +}
> +
You might consider renaming this to dev_set_threaded_all and passing a
boolean "threaded" argument. Then you could reuse this code for both
enabling and disabling of the threaded setup. Another thing I might
change would be how we go about toggling the value. Maybe something
more like:
dev->threaded = 0;
list_for_each_entry(napi, &dev->napi_list, dev_list)
napi_set_threaded(napi, threaded);
dev->threaded = threaded;
The advantage of doing it this way is that we will be
enabling/disabling all instances at the same time since we are
overwriting the dev->threaded first and then taking care of the
individual flags. If you are enabling then after we have set the bits
we set dev->threaded which will switch them all on, and when we clear
dev->threaded it will switch them all off so they should stop checking
the threaded flags in the napi structs.
> +int dev_set_threaded(struct net_device *dev, bool threaded)
> +{
> + struct napi_struct *napi;
> + int ret;
> +
There are probably a couple changes that need to be made. First I
would initialize ret to 0 here. More on that below.
Second we need to add a check for if we are actually changing
anything. If not then we shouldn't be bothering. So something like the
following at the start should take care of that:
if (!dev->threaded == !threaded)
return 0;
I would add a list_for_each_entry loop that does the thread allocation
I called out above and will break if an error is encountered. That way
you aren't toggling dev->threaded unless you know all of the instances
can succeed. Something like:
if (!dev->threaded) {
list_for_each_entry(napi, &dev->napi_list, dev_list) {
ret = napi_kthread_create(napi);
if (ret)
break;
}
}
/* Make sure kthread is created before THREADED bit is set. */
smp_mb__before_atomic();
> + dev->threaded = threaded;
> + list_for_each_entry(napi, &dev->napi_list, dev_list) {
> + ret = napi_set_threaded(napi, threaded);
> + if (ret) {
> + /* Error occurred on one of the napi,
> + * reset threaded mode on all napi.
> + */
> + dev_disable_threaded_all(dev);
> + break;
> + }
> + }
With my suggestion above you could look at just making the
dev_disable_threaded_all accept a boolean argument for enable/disable
and then you could just do something like:
dev_set_threaded_all(struct net_device *dev, !ret && threaded);
> +
> + return ret;
> +}
> +
> void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
> int (*poll)(struct napi_struct *, int), int weight)
> {
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index daf502c13d6d..969743567257 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -538,6 +538,50 @@ static ssize_t phys_switch_id_show(struct device *dev,
> }
> static DEVICE_ATTR_RO(phys_switch_id);
>
> +static ssize_t threaded_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct net_device *netdev = to_net_dev(dev);
> + int ret;
> +
> + if (!rtnl_trylock())
> + return restart_syscall();
> +
> + if (!dev_isalive(netdev)) {
> + ret = -EINVAL;
> + goto unlock;
> + }
> +
> + ret = sprintf(buf, fmt_dec, netdev->threaded);
> +
It seems like the more standard pattern is to initialize ret to
-EVINAL and just call the sprintf if dev_isalive is true. That would
allow you to drop the unlock jump label you had to add below.
> +unlock:
> + rtnl_unlock();
> + return ret;
> +}
> +
> +static int modify_napi_threaded(struct net_device *dev, unsigned long val)
> +{
> + int ret;
> +
> + if (list_empty(&dev->napi_list))
> + return -EOPNOTSUPP;
> +
> + if (val != 0 && val != 1)
> + return -EOPNOTSUPP;
> +
> + ret = dev_set_threaded(dev, val);
> +
> + return ret;
> +}
> +
> +static ssize_t threaded_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + return netdev_store(dev, attr, buf, len, modify_napi_threaded);
> +}
> +static DEVICE_ATTR_RW(threaded);
> +
> static struct attribute *net_class_attrs[] __ro_after_init = {
> &dev_attr_netdev_group.attr,
> &dev_attr_type.attr,
> @@ -570,6 +614,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
> &dev_attr_proto_down.attr,
> &dev_attr_carrier_up_count.attr,
> &dev_attr_carrier_down_count.attr,
> + &dev_attr_threaded.attr,
> NULL,
> };
> ATTRIBUTE_GROUPS(net_class);
> --
> 2.30.0.365.g02bc693789-goog
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH net-next v10 3/3] net: add sysfs attribute to control napi threaded mode
2021-02-04 22:16 ` Alexander Duyck
@ 2021-02-05 0:42 ` Wei Wang
0 siblings, 0 replies; 7+ messages in thread
From: Wei Wang @ 2021-02-05 0:42 UTC (permalink / raw)
To: Alexander Duyck
Cc: David Miller, Netdev, Jakub Kicinski, Paolo Abeni,
Hannes Frederic Sowa, Eric Dumazet, Felix Fietkau,
Alexander Duyck
On Thu, Feb 4, 2021 at 2:17 PM Alexander Duyck
<alexander.duyck@gmail.com> wrote:
>
> On Thu, Feb 4, 2021 at 1:35 PM Wei Wang <weiwan@google.com> wrote:
> >
> > This patch adds a new sysfs attribute to the network device class.
> > Said attribute provides a per-device control to enable/disable the
> > threaded mode for all the napi instances of the given network device,
> > without the need for a device up/down.
> > User sets it to 1 or 0 to enable or disable threaded mode.
> > Note: when switching between threaded and the current softirq based mode
> > for a napi instance, it will not immediately take effect if the napi is
> > currently being polled. The mode switch will happen for the next time
> > napi_schedule() is called.
> >
> > Co-developed-by: Paolo Abeni <pabeni@redhat.com>
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> > Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> > Co-developed-by: Felix Fietkau <nbd@nbd.name>
> > Signed-off-by: Felix Fietkau <nbd@nbd.name>
> > Signed-off-by: Wei Wang <weiwan@google.com>
> > ---
> > Documentation/ABI/testing/sysfs-class-net | 15 +++++
> > include/linux/netdevice.h | 2 +
> > net/core/dev.c | 67 ++++++++++++++++++++++-
> > net/core/net-sysfs.c | 45 +++++++++++++++
> > 4 files changed, 127 insertions(+), 2 deletions(-)
> >
> > diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net
> > index 1f2002df5ba2..1419103d11f9 100644
> > --- a/Documentation/ABI/testing/sysfs-class-net
> > +++ b/Documentation/ABI/testing/sysfs-class-net
> > @@ -337,3 +337,18 @@ Contact: netdev@vger.kernel.org
> > Description:
> > 32-bit unsigned integer counting the number of times the link has
> > been down
> > +
> > +What: /sys/class/net/<iface>/threaded
> > +Date: Jan 2021
> > +KernelVersion: 5.12
> > +Contact: netdev@vger.kernel.org
> > +Description:
> > + Boolean value to control the threaded mode per device. User could
> > + set this value to enable/disable threaded mode for all napi
> > + belonging to this device, without the need to do device up/down.
> > +
> > + Possible values:
> > + == ==================================
> > + 0 threaded mode disabled for this dev
> > + 1 threaded mode enabled for this dev
> > + == ==================================
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 99fb4ec9573e..1340327f7abf 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -497,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n)
> > return napi_complete_done(n, 0);
> > }
> >
> > +int dev_set_threaded(struct net_device *dev, bool threaded);
> > +
> > /**
> > * napi_disable - prevent NAPI from scheduling
> > * @n: NAPI context
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index a8c5eca17074..9cc9b245419e 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -4290,8 +4290,9 @@ static inline void ____napi_schedule(struct softnet_data *sd,
> >
> > if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
> > /* Paired with smp_mb__before_atomic() in
> > - * napi_enable(). Use READ_ONCE() to guarantee
> > - * a complete read on napi->thread. Only call
> > + * napi_enable()/napi_set_threaded().
> > + * Use READ_ONCE() to guarantee a complete
> > + * read on napi->thread. Only call
> > * wake_up_process() when it's not NULL.
> > */
> > thread = READ_ONCE(napi->thread);
> > @@ -6743,6 +6744,68 @@ static void init_gro_hash(struct napi_struct *napi)
> > napi->gro_bitmask = 0;
> > }
> >
> > +/* Setting/unsetting threaded mode on a napi might not immediately
> > + * take effect, if the current napi instance is actively being
> > + * polled. In this case, the switch between threaded mode and
> > + * softirq mode will happen in the next round of napi_schedule().
> > + * This should not cause hiccups/stalls to the live traffic.
> > + */
> > +static int napi_set_threaded(struct napi_struct *n, bool threaded)
> > +{
> > + int err = 0;
> > +
> > + if (threaded == !!test_bit(NAPI_STATE_THREADED, &n->state))
> > + return 0;
> > +
> > + if (!threaded) {
> > + clear_bit(NAPI_STATE_THREADED, &n->state);
> > + return 0;
> > + }
>
>
> > +
> > + if (!n->thread) {
> > + err = napi_kthread_create(n);
> > + if (err)
> > + return err;
> > + }
>
> This piece needs to be broken out similar to what we did for the
> napi_add and napi enable. In the case where we are enabling the
> threaded NAPI you should first go through and allocate all the
> threads. Then once all the threads are allocated you then enable them
> by setting the NAPI_STATE_THREADED bit.
>
> I would pull this section out and place it in a loop in
> dev_set_threaded to handle creating the threads before you set
> dev->threaded and then set the threaded flags in the napi instances.
>
> > +
> > + /* Make sure kthread is created before THREADED bit
> > + * is set.
> > + */
> > + smp_mb__before_atomic();
>
> If we split off the allocation like I mentioned earlier we can
> probably pull out the barrier and place it after the allocation of the
> kthreads.
>
> > + set_bit(NAPI_STATE_THREADED, &n->state);
> > +
> > + return 0;
> > +}
> > +
>
> With the change I suggested above you could also drop the return type
> from this because it should always succeed.
>
> > +static void dev_disable_threaded_all(struct net_device *dev)
> > +{
> > + struct napi_struct *napi;
> > +
> > + list_for_each_entry(napi, &dev->napi_list, dev_list)
> > + napi_set_threaded(napi, false);
> > + dev->threaded = 0;
> > +}
> > +
>
> You might consider renaming this to dev_set_threaded_all and passing a
> boolean "threaded" argument. Then you could reuse this code for both
> enabling and disabling of the threaded setup. Another thing I might
> change would be how we go about toggling the value. Maybe something
> more like:
>
> dev->threaded = 0;
> list_for_each_entry(napi, &dev->napi_list, dev_list)
> napi_set_threaded(napi, threaded);
> dev->threaded = threaded;
>
> The advantage of doing it this way is that we will be
> enabling/disabling all instances at the same time since we are
> overwriting the dev->threaded first and then taking care of the
> individual flags. If you are enabling then after we have set the bits
> we set dev->threaded which will switch them all on, and when we clear
> dev->threaded it will switch them all off so they should stop checking
> the threaded flags in the napi structs.
>
> > +int dev_set_threaded(struct net_device *dev, bool threaded)
> > +{
> > + struct napi_struct *napi;
> > + int ret;
> > +
>
> There are probably a couple changes that need to be made. First I
> would initialize ret to 0 here. More on that below.
>
> Second we need to add a check for if we are actually changing
> anything. If not then we shouldn't be bothering. So something like the
> following at the start should take care of that:
>
> if (!dev->threaded == !threaded)
> return 0;
>
> I would add a list_for_each_entry loop that does the thread allocation
> I called out above and will break if an error is encountered. That way
> you aren't toggling dev->threaded unless you know all of the instances
> can succeed. Something like:
>
> if (!dev->threaded) {
> list_for_each_entry(napi, &dev->napi_list, dev_list) {
> ret = napi_kthread_create(napi);
> if (ret)
> break;
> }
> }
>
> /* Make sure kthread is created before THREADED bit is set. */
> smp_mb__before_atomic();
>
> > + dev->threaded = threaded;
> > + list_for_each_entry(napi, &dev->napi_list, dev_list) {
> > + ret = napi_set_threaded(napi, threaded);
> > + if (ret) {
> > + /* Error occurred on one of the napi,
> > + * reset threaded mode on all napi.
> > + */
> > + dev_disable_threaded_all(dev);
> > + break;
> > + }
> > + }
>
> With my suggestion above you could look at just making the
> dev_disable_threaded_all accept a boolean argument for enable/disable
> and then you could just do something like:
>
> dev_set_threaded_all(struct net_device *dev, !ret && threaded);
>
Ack for the above suggestions. I think I used a combined function
napi_set_threaded() to create kthread + set threaded bit here because
I have the knowledge of all the napi instances belonging to dev here.
And if anyone fails, I would just disable them all afterwards. But I
think that might introduce an inconsistent stage where some napi are
in threaded mode, and some are in softirq mode.
I agree it is cleaner to first create kthreads for all napi, then set
the threaded bit for all.
> > +
> > + return ret;
> > +}
> > +
> > void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
> > int (*poll)(struct napi_struct *, int), int weight)
> > {
> > diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> > index daf502c13d6d..969743567257 100644
> > --- a/net/core/net-sysfs.c
> > +++ b/net/core/net-sysfs.c
> > @@ -538,6 +538,50 @@ static ssize_t phys_switch_id_show(struct device *dev,
> > }
> > static DEVICE_ATTR_RO(phys_switch_id);
> >
> > +static ssize_t threaded_show(struct device *dev,
> > + struct device_attribute *attr, char *buf)
> > +{
> > + struct net_device *netdev = to_net_dev(dev);
> > + int ret;
> > +
> > + if (!rtnl_trylock())
> > + return restart_syscall();
> > +
> > + if (!dev_isalive(netdev)) {
> > + ret = -EINVAL;
> > + goto unlock;
> > + }
> > +
> > + ret = sprintf(buf, fmt_dec, netdev->threaded);
> > +
>
> It seems like the more standard pattern is to initialize ret to
> -EVINAL and just call the sprintf if dev_isalive is true. That would
> allow you to drop the unlock jump label you had to add below.
>
Ack.
> > +unlock:
> > + rtnl_unlock();
> > + return ret;
> > +}
> > +
> > +static int modify_napi_threaded(struct net_device *dev, unsigned long val)
> > +{
> > + int ret;
> > +
> > + if (list_empty(&dev->napi_list))
> > + return -EOPNOTSUPP;
> > +
> > + if (val != 0 && val != 1)
> > + return -EOPNOTSUPP;
> > +
> > + ret = dev_set_threaded(dev, val);
> > +
> > + return ret;
> > +}
> > +
> > +static ssize_t threaded_store(struct device *dev,
> > + struct device_attribute *attr,
> > + const char *buf, size_t len)
> > +{
> > + return netdev_store(dev, attr, buf, len, modify_napi_threaded);
> > +}
> > +static DEVICE_ATTR_RW(threaded);
> > +
> > static struct attribute *net_class_attrs[] __ro_after_init = {
> > &dev_attr_netdev_group.attr,
> > &dev_attr_type.attr,
> > @@ -570,6 +614,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
> > &dev_attr_proto_down.attr,
> > &dev_attr_carrier_up_count.attr,
> > &dev_attr_carrier_down_count.attr,
> > + &dev_attr_threaded.attr,
> > NULL,
> > };
> > ATTRIBUTE_GROUPS(net_class);
> > --
> > 2.30.0.365.g02bc693789-goog
> >
^ permalink raw reply [flat|nested] 7+ messages in thread