From: Tom Herbert <tom@herbertland.com>
To: netdev@vger.kernel.org
Cc: Tom Herbert <tom@herbertland.com>
Subject: [RFC PATCH 07/11] net: Introduce global queues
Date: Wed, 24 Jun 2020 10:17:46 -0700
Message-ID: <20200624171749.11927-8-tom@herbertland.com>
In-Reply-To: <20200624171749.11927-1-tom@herbertland.com>

Global queues, or gqids, are an abstract representation of NIC
device queues. They are global in the sense that each gqid can be
mapped to a queue in each device; that is, if there are multiple
devices in the system, a gqid can map to a different device queue,
or dqid, in each device, forming a one-to-many mapping. For
example, gqid 3 might map to dqid 5 on one device and dqid 2 on
another. gqids are used to configure packet steering, on both send
and receive, in a generic way that is not bound to a particular
device.

Each transmit or receive device queue may be reverse mapped to at
most one gqid. Each device maintains a table mapping gqids to
local device queues; these tables are used in the data path to
convert the gqid of a receive or transmit queue into a device
queue relative to the sending or receiving device.

Changes in the patch:
	- Add a simple index to netdev_queue and netdev_rx_queue.
	  This serves as the dqid (it is just the index into the
	  receive or transmit queue array of the device)
	- Add gqid to netdev_queue and netdev_rx_queue. This is the
	  mapping of a device queue to a gqid; if gqid is NO_QUEUE
	  then the queue is unmapped
	- Maintain the per device gqid to dqid maps in
	  netdev_queue_map structures in a net_device, one for
	  transmit and one for receive
	- Add functions that return a dqid given a gqid and a
	  net_device (see the lookup sketch above)
	- Add sysfs to set device queue mappings via a
	  global_queue_mapping attribute in the sysfs rx- and tx-
	  queue directories (see the usage sketch below)
	- Create the per device gqid to dqid maps in the sysfs
	  store function
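
For illustration, a minimal userspace sketch of driving the new
rx- attribute (the attribute name and the sysfs path layout come
from this patch; the interface name, queue number, and helper
function are assumptions):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Map a global queue id onto an rx device queue by writing
	 * the gqid to the queue's global_queue_mapping attribute.
	 * Requires CAP_NET_ADMIN.
	 */
	static int map_rx_queue_to_gqid(const char *ifname, int rxq,
					unsigned int gqid)
	{
		char path[256], val[16];
		int fd, len, err;

		snprintf(path, sizeof(path),
			 "/sys/class/net/%s/queues/rx-%d/global_queue_mapping",
			 ifname, rxq);
		len = snprintf(val, sizeof(val), "%u", gqid);

		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;
		err = (write(fd, val, len) == len) ? 0 : -1;
		close(fd);

		return err;
	}

	/* e.g. map_rx_queue_to_gqid("eth0", 0, 3); reading the
	 * attribute back then returns "3", or "none" while unmapped.
	 */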
---
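A note on map sizing (a worked example, not part of the code): the
per device map grows in blocks of QUEUE_MAP_ALLOC_BLOCK (128)
entries so that mapping one more gqid rarely reallocates:

	/* QUEUE_MAP_ALLOC_NUMBER rounds a requested entry count up
	 * to the next multiple of 128:
	 *   QUEUE_MAP_ALLOC_NUMBER(1)   == ((0 / 128) + 1) * 128 == 128
	 *   QUEUE_MAP_ALLOC_NUMBER(128) == ((127 / 128) + 1) * 128 == 128
	 *   QUEUE_MAP_ALLOC_NUMBER(129) == ((128 / 128) + 1) * 128 == 256
	 * QUEUE_MAP_ALLOC_SIZE(n) then adds the struct header:
	 *   sizeof(struct netdev_queue_map) + n * sizeof(u16)
	 */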
 include/linux/netdevice.h |  75 ++++++++++++++
 net/core/dev.c            |  20 +++-
 net/core/net-sysfs.c      | 198 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 289 insertions(+), 4 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 48ba1c1fc644..ca163925211a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -606,6 +606,10 @@ struct netdev_queue {
 #endif
 #if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
 	int			numa_node;
+#endif
+#ifdef CONFIG_RPS
+	u16			index;
+	u16			gqid;
 #endif
 	unsigned long		tx_maxrate;
 	/*
@@ -823,6 +827,8 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
 /* This structure contains an instance of an RX queue. */
 struct netdev_rx_queue {
 #ifdef CONFIG_RPS
+	u16			index;
+	u16			gqid;
 	struct rps_map __rcu		*rps_map;
 	struct rps_dev_flow_table __rcu	*rps_flow_table;
 #endif
@@ -875,6 +881,25 @@ struct xps_dev_maps {
 
 #endif /* CONFIG_XPS */
 
+#ifdef CONFIG_RPS
+/* Structure to map a global queue to a device queue */
+struct netdev_queue_map {
+	struct rcu_head rcu;
+	unsigned int max_ents;
+	unsigned int set_count;
+	u16 map[0];
+};
+
+/* Allocate queue map in blocks to avoid thrashing */
+#define QUEUE_MAP_ALLOC_BLOCK 128
+
+#define QUEUE_MAP_ALLOC_NUMBER(_num)					\
+	(((((_num) - 1) / QUEUE_MAP_ALLOC_BLOCK) + 1) * QUEUE_MAP_ALLOC_BLOCK)
+
+#define QUEUE_MAP_ALLOC_SIZE(_num) (sizeof(struct netdev_queue_map) +	\
+	(_num) * sizeof(u16))
+#endif /* CONFIG_RPS */
+
 #define TC_MAX_QUEUE	16
 #define TC_BITMASK	15
 /* HW offloaded queuing disciplines txq count and offset maps */
@@ -2092,6 +2117,10 @@ struct net_device {
 	rx_handler_func_t __rcu	*rx_handler;
 	void __rcu		*rx_handler_data;
 
+#ifdef CONFIG_RPS
+	struct netdev_queue_map __rcu *rx_gqueue_map;
+#endif
+
 #ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc __rcu	*miniq_ingress;
 #endif
@@ -2122,6 +2151,9 @@ struct net_device {
 	struct xps_dev_maps __rcu *xps_cpus_map;
 	struct xps_dev_maps __rcu *xps_rxqs_map;
 #endif
+#ifdef CONFIG_RPS
+	struct netdev_queue_map __rcu *tx_gqueue_map;
+#endif
 #ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc __rcu	*miniq_egress;
 #endif
@@ -2218,6 +2250,36 @@ struct net_device {
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
+#ifdef CONFIG_RPS
+static inline u16 netdev_gqid_to_dqid(const struct netdev_queue_map *map,
+				      u16 gqid)
+{
+	return (map && gqid < map->max_ents) ? map->map[gqid] : NO_QUEUE;
+}
+
+static inline u16 netdev_tx_gqid_to_dqid(const struct net_device *dev, u16 gqid)
+{
+	u16 dqid;
+
+	rcu_read_lock();
+	dqid = netdev_gqid_to_dqid(rcu_dereference(dev->tx_gqueue_map), gqid);
+	rcu_read_unlock();
+
+	return dqid;
+}
+
+static inline u16 netdev_rx_gqid_to_dqid(const struct net_device *dev, u16 gqid)
+{
+	u16 dqid;
+
+	rcu_read_lock();
+	dqid = netdev_gqid_to_dqid(rcu_dereference(dev->rx_gqueue_map), gqid);
+	rcu_read_unlock();
+
+	return dqid;
+}
+#endif
+
 static inline bool netif_elide_gro(const struct net_device *dev)
 {
 	if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
@@ -2290,6 +2352,19 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 		f(dev, &dev->_tx[i], arg);
 }
 
+static inline void netdev_for_each_tx_queue_index(struct net_device *dev,
+						  void (*f)(struct net_device *,
+							    struct netdev_queue *,
+							    unsigned int index,
+							    void *),
+						  void *arg)
+{
+	unsigned int i;
+
+	for (i = 0; i < dev->num_tx_queues; i++)
+		f(dev, &dev->_tx[i], i, arg);
+}
+
 #define netdev_lockdep_set_classes(dev)				\
 {								\
 	static struct lock_class_key qdisc_tx_busylock_key;	\
diff --git a/net/core/dev.c b/net/core/dev.c
index 946940bdd583..f64bf6608775 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9331,6 +9331,10 @@ static int netif_alloc_rx_queues(struct net_device *dev)
 
 	for (i = 0; i < count; i++) {
 		rx[i].dev = dev;
+#ifdef CONFIG_RPS
+		rx[i].index = i;
+		rx[i].gqid = NO_QUEUE;
+#endif
 
 		/* XDP RX-queue setup */
 		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
@@ -9363,7 +9367,8 @@ static void netif_free_rx_queues(struct net_device *dev)
 }
 
 static void netdev_init_one_queue(struct net_device *dev,
-				  struct netdev_queue *queue, void *_unused)
+				  struct netdev_queue *queue,
+				  unsigned int index, void *_unused)
 {
 	/* Initialize queue lock */
 	spin_lock_init(&queue->_xmit_lock);
@@ -9371,6 +9376,10 @@ static void netdev_init_one_queue(struct net_device *dev,
 	queue->xmit_lock_owner = -1;
 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
 	queue->dev = dev;
+#ifdef CONFIG_RPS
+	queue->index = index;
+	queue->gqid = NO_QUEUE;
+#endif
 #ifdef CONFIG_BQL
 	dql_init(&queue->dql, HZ);
 #endif
@@ -9396,7 +9405,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
 
 	dev->_tx = tx;
 
-	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+	netdev_for_each_tx_queue_index(dev, netdev_init_one_queue, NULL);
 	spin_lock_init(&dev->tx_global_lock);
 
 	return 0;
@@ -9884,7 +9893,7 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
 	if (!queue)
 		return NULL;
-	netdev_init_one_queue(dev, queue, NULL);
+	netdev_init_one_queue(dev, queue, 0, NULL);
 	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
 	queue->qdisc_sleeping = &noop_qdisc;
 	rcu_assign_pointer(dev->ingress_queue, queue);
@@ -10041,6 +10050,11 @@ void free_netdev(struct net_device *dev)
 {
 	struct napi_struct *p, *n;
 
+#ifdef CONFIG_RPS
+	WARN_ON(rcu_dereference_protected(dev->tx_gqueue_map, 1));
+	WARN_ON(rcu_dereference_protected(dev->rx_gqueue_map, 1));
+#endif
+
 	might_sleep();
 	netif_free_tx_queues(dev);
 	netif_free_rx_queues(dev);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 56d27463d466..3a9d3d9ee8e0 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -875,18 +875,166 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
 	return len;
 }
 
+static void queue_map_release(struct rcu_head *rcu)
+{
+	struct netdev_queue_map *q_map = container_of(rcu,
+	    struct netdev_queue_map, rcu);
+	vfree(q_map);
+}
+
+static int set_device_queue_mapping(struct netdev_queue_map **pmap,
+				    u16 gqid, u16 dqid, u16 *p_gqid)
+{
+	static DEFINE_MUTEX(global_mapping_table);
+	struct netdev_queue_map *gq_map, *old_gq_map;
+	u16 old_gqid;
+	int ret = 0;
+
+	mutex_lock(&global_mapping_table);
+
+	old_gqid = *p_gqid;
+	if (old_gqid == gqid) {
+		/* Nothing changing */
+		goto out;
+	}
+
+	gq_map = rcu_dereference_protected(*pmap,
+					   lockdep_is_held(&global_mapping_table));
+	old_gq_map = gq_map;
+
+	if (gqid == NO_QUEUE) {
+		/* Remove any old mapping (we know that old_gqid cannot be
+		 * NO_QUEUE from above)
+		 */
+		if (!WARN_ON(!gq_map || old_gqid >= gq_map->max_ents ||
+			     gq_map->map[old_gqid] != dqid)) {
+			/* Unset old mapping */
+			gq_map->map[old_gqid] = NO_QUEUE;
+			if (--gq_map->set_count == 0) {
+				/* Done with map so free */
+				rcu_assign_pointer(*pmap, NULL);
+				call_rcu(&gq_map->rcu, queue_map_release);
+			}
+		}
+		*p_gqid = NO_QUEUE;
+
+		goto out;
+	}
+
+	if (!gq_map || gqid >= gq_map->max_ents) {
+		unsigned int max_queues;
+		int i = 0;
+
+		/* Need to create or expand queue map */
+
+		max_queues = QUEUE_MAP_ALLOC_NUMBER(gqid + 1);
+
+		gq_map = vmalloc(QUEUE_MAP_ALLOC_SIZE(max_queues));
+		if (!gq_map) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		gq_map->max_ents = max_queues;
+
+		if (old_gq_map) {
+			/* Copy old map entries */
+
+			memcpy(gq_map->map, old_gq_map->map,
+			       old_gq_map->max_ents * sizeof(gq_map->map[0]));
+			gq_map->set_count = old_gq_map->set_count;
+			i = old_gq_map->max_ents;
+		} else {
+			gq_map->set_count = 0;
+		}
+
+		/* Initialize entries not copied from old map */
+		for (; i < max_queues; i++)
+			gq_map->map[i] = NO_QUEUE;
+	} else if (gq_map->map[gqid] != NO_QUEUE) {
+		/* The global qid is already mapped to another device qid */
+		ret = -EBUSY;
+		goto out;
+	}
+
+	/* Set map entry */
+	gq_map->map[gqid] = dqid;
+	gq_map->set_count++;
+
+	if (old_gqid != NO_QUEUE) {
+		/* We know old_gqid is not equal to gqid */
+		if (!WARN_ON(!old_gq_map ||
+			     old_gqid >= old_gq_map->max_ents ||
+			     old_gq_map->map[old_gqid] != dqid)) {
+			/* Unset old mapping in (new) table */
+			gq_map->map[old_gqid] = NO_QUEUE;
+			gq_map->set_count--;
+		}
+	}
+
+	if (gq_map != old_gq_map) {
+		rcu_assign_pointer(*pmap, gq_map);
+		if (old_gq_map)
+			call_rcu(&old_gq_map->rcu, queue_map_release);
+	}
+
+	/* Save for caller */
+	*p_gqid = gqid;
+
+out:
+	mutex_unlock(&global_mapping_table);
+
+	return ret;
+}
+
+static ssize_t show_rx_queue_global_mapping(struct netdev_rx_queue *queue,
+					    char *buf)
+{
+	u16 gqid = queue->gqid;
+
+	if (gqid == NO_QUEUE)
+		return sprintf(buf, "none\n");
+	else
+		return sprintf(buf, "%u\n", gqid);
+}
+
+static ssize_t store_rx_queue_global_mapping(struct netdev_rx_queue *queue,
+					     const char *buf, size_t len)
+{
+	unsigned long gqid;
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	ret = kstrtoul(buf, 0, &gqid);
+	if (ret < 0)
+		return ret;
+
+	if (gqid > RPS_MAX_QID || WARN_ON(queue->index > RPS_MAX_QID))
+		return -EINVAL;
+
+	ret = set_device_queue_mapping(&queue->dev->rx_gqueue_map,
+				       gqid, queue->index, &queue->gqid);
+	return ret ? : len;
+}
+
 static struct rx_queue_attribute rps_cpus_attribute __ro_after_init
 	= __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map);
 
 static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init
 	= __ATTR(rps_flow_cnt, 0644,
 		 show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+static struct rx_queue_attribute rx_queue_global_mapping_attribute __ro_after_init =
+	__ATTR(global_queue_mapping, 0644,
+	       show_rx_queue_global_mapping, store_rx_queue_global_mapping);
 #endif /* CONFIG_RPS */
 
 static struct attribute *rx_queue_default_attrs[] __ro_after_init = {
 #ifdef CONFIG_RPS
 	&rps_cpus_attribute.attr,
 	&rps_dev_flow_table_cnt_attribute.attr,
+	&rx_queue_global_mapping_attribute.attr,
 #endif
 	NULL
 };
@@ -896,8 +1044,11 @@ static void rx_queue_release(struct kobject *kobj)
 {
 	struct netdev_rx_queue *queue = to_rx_queue(kobj);
 #ifdef CONFIG_RPS
-	struct rps_map *map;
 	struct rps_dev_flow_table *flow_table;
+	struct rps_map *map;
+
+	set_device_queue_mapping(&queue->dev->rx_gqueue_map, NO_QUEUE,
+				 queue->index, &queue->gqid);
 
 	map = rcu_dereference_protected(queue->rps_map, 1);
 	if (map) {
@@ -1152,6 +1303,45 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
 				 sprintf(buf, "%u\n", tc);
 }
 
+#ifdef CONFIG_RPS
+static ssize_t show_queue_global_queue_mapping(struct netdev_queue *queue,
+					       char *buf)
+{
+	u16 gqid = queue->gqid;
+
+	if (gqid == NO_QUEUE)
+		return sprintf(buf, "none\n");
+	else
+		return sprintf(buf, "%u\n", gqid);
+}
+
+static ssize_t store_queue_global_queue_mapping(struct netdev_queue *queue,
+						const char *buf, size_t len)
+{
+	unsigned long gqid;
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	ret = kstrtoul(buf, 0, &gqid);
+	if (ret < 0)
+		return ret;
+
+	if (gqid > RPS_MAX_QID || WARN_ON(queue->index > RPS_MAX_QID))
+		return -EINVAL;
+
+	ret = set_device_queue_mapping(&queue->dev->tx_gqueue_map,
+				       gqid, queue->index, &queue->gqid);
+	return ret ? : len;
+}
+
+static struct netdev_queue_attribute global_queue_mapping_attribute __ro_after_init =
+	__ATTR(global_queue_mapping, 0644,
+	       show_queue_global_queue_mapping,
+	       store_queue_global_queue_mapping);
+#endif
+
 #ifdef CONFIG_XPS
 static ssize_t tx_maxrate_show(struct netdev_queue *queue,
 			       char *buf)
@@ -1483,6 +1673,9 @@ static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
 static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
 	&queue_trans_timeout.attr,
 	&queue_traffic_class.attr,
+#ifdef CONFIG_RPS
+	&global_queue_mapping_attribute.attr,
+#endif
 #ifdef CONFIG_XPS
 	&xps_cpus_attribute.attr,
 	&xps_rxqs_attribute.attr,
@@ -1496,6 +1689,9 @@ static void netdev_queue_release(struct kobject *kobj)
 {
 	struct netdev_queue *queue = to_netdev_queue(kobj);
 
+	set_device_queue_mapping(&queue->dev->tx_gqueue_map, NO_QUEUE,
+				 queue->index, &queue->gqid);
+
 	memset(kobj, 0, sizeof(*kobj));
 	dev_put(queue->dev);
 }
-- 
2.25.1


Thread overview: 24+ messages
2020-06-24 17:17 [RFC PATCH 00/11] ptq: Per Thread Queues Tom Herbert
2020-06-24 17:17 ` [RFC PATCH 01/11] cgroup: Export cgroup_{procs,threads}_start and cgroup_procs_next Tom Herbert
2020-06-24 17:17 ` [RFC PATCH 02/11] net: Create netqueue.h and define NO_QUEUE Tom Herbert
2020-06-24 17:17 ` [RFC PATCH 03/11] arfs: Create set_arfs_queue Tom Herbert
2020-06-24 17:17 ` [RFC PATCH 04/11] net-sysfs: Create rps_create_sock_flow_table Tom Herbert
2020-06-24 17:17 ` [RFC PATCH 05/11] net: Infrastructure for per queue aRFS Tom Herbert
2020-06-28  8:55   ` kernel test robot
2020-06-24 17:17 ` [RFC PATCH 06/11] net: Function to check against maximum number for RPS queues Tom Herbert
2020-06-24 17:17 ` Tom Herbert [this message]
2020-06-24 23:00   ` [RFC PATCH 07/11] net: Introduce global queues kernel test robot
2020-06-24 23:58   ` kernel test robot
2020-06-25  0:23   ` kernel test robot
2020-06-30 21:06   ` Jonathan Lemon
2020-06-24 17:17 ` [RFC PATCH 08/11] ptq: Per Thread Queues Tom Herbert
2020-06-24 21:20   ` kernel test robot
2020-06-25  1:50   ` [RFC PATCH] ptq: null_pcdesc can be static kernel test robot
2020-06-25  7:26   ` [RFC PATCH 08/11] ptq: Per Thread Queues kernel test robot
2020-06-24 17:17 ` [RFC PATCH 09/11] ptq: Hook up transmit side of Per Queue Threads Tom Herbert
2020-06-24 17:17 ` [RFC PATCH 10/11] ptq: Hook up receive " Tom Herbert
2020-06-24 17:17 ` [RFC PATCH 11/11] doc: Documentation for Per Thread Queues Tom Herbert
2020-06-25  2:20   ` kernel test robot
2020-06-25 23:00   ` Jacob Keller
2020-06-29  6:28   ` Saeed Mahameed
2020-06-29 15:10     ` Tom Herbert
