* [PATCH 2/2 v7] xps: Transmit Packet Steering
From: Tom Herbert @ 2010-11-21 23:17 UTC
  To: davem, netdev; +Cc: eric.dumazet

This patch implements transmit packet steering (XPS) for multiqueue
devices.  XPS selects a transmit queue during packet transmission based
on configuration.  This is done by mapping the CPU transmitting the
packet to a queue.  This is the transmit-side analogue to RPS: where
RPS selects a CPU based on the receive queue, XPS selects a queue
based on the CPU (previously there was an XPS patch from Eric
Dumazet, but that might more appropriately be called transmit completion
steering).

Each transmit queue can be associated with a number of CPUs which will
use the queue to send packets.  This is configured as a CPU mask on a
per-queue basis in:

/sys/class/net/eth<n>/queues/tx-<n>/xps_cpus
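
For example, on a hypothetical eight-CPU machine where CPUs {0,1},
{2,3}, {4,5} and {6,7} each share a cache, the nominal configuration
described below could be set up like this (the device name and masks
are illustrative only; the values are hex cpumasks):

echo 03 > /sys/class/net/eth0/queues/tx-0/xps_cpus
echo 0c > /sys/class/net/eth0/queues/tx-1/xps_cpus
echo 30 > /sys/class/net/eth0/queues/tx-2/xps_cpus
echo c0 > /sys/class/net/eth0/queues/tx-3/xps_cpus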

The mappings are stored per device in an inverted data structure that
maps CPUs to queues.  In the netdevice structure this is an array of
num_possible_cpus() entries, where each entry holds an array of queue
indexes for the queues that CPU can use.

The benefits of XPS are improved locality in the per-queue data
structures.  Also, transmit completions are more likely to be done
nearer to the sending thread, so this should promote locality back
to the socket on free (e.g. UDP).  The benefits of XPS are dependent on
cache hierarchy, application load, and other factors.  XPS would
nominally be configured so that a queue is only shared by CPUs which
share a cache; the degenerate configuration would be that each CPU has
its own queue.

Below are some benchmark results which show the potential benefit of
this patch.  The netperf test runs 500 instances of the netperf TCP_RR
test with 1-byte requests and responses.

bnx2x on 16 core AMD
   XPS (16 queues, 1 TX queue per CPU)  1234K at 100% CPU
   No XPS (16 queues)                   996K at 100% CPU
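
Each instance corresponds to an invocation along these lines (the peer
host is a placeholder; netperf's -r option sets the one-byte request
and response sizes):

netperf -H <peer> -t TCP_RR -- -r 1,1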

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/linux/netdevice.h |   30 ++++
 net/core/dev.c            |   53 ++++++-
 net/core/net-sysfs.c      |  369 ++++++++++++++++++++++++++++++++++++++++++++-
 net/core/net-sysfs.h      |    3 +
 4 files changed, 447 insertions(+), 8 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b45c1b8..badf928 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -503,6 +503,10 @@ struct netdev_queue {
 	struct Qdisc		*qdisc;
 	unsigned long		state;
 	struct Qdisc		*qdisc_sleeping;
+#ifdef CONFIG_RPS
+	struct kobject		kobj;
+#endif
+
 /*
  * write mostly part
  */
@@ -530,6 +534,30 @@ struct rps_map {
 #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
 
 /*
+ * This structure holds an XPS map which can be of variable length.  The
+ * map is an array of queues.
+ */
+struct xps_map {
+	unsigned int len;
+	unsigned int alloc_len;
+	struct rcu_head rcu;
+	u16 queues[0];
+};
+#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + (_num * sizeof(u16)))
+#define XPS_MIN_MAP_ALLOC ((L1_CACHE_BYTES - sizeof(struct xps_map))	\
+    / sizeof(u16))
+
+/*
+ * This structure holds all XPS maps for device.  Maps are indexed by CPU.
+ */
+struct xps_dev_maps {
+	struct rcu_head rcu;
+	struct xps_map *cpu_map[0];
+};
+#define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) +		\
+    (nr_cpu_ids * sizeof(struct xps_map *)))
+
+/*
  * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
  * tail pointer for that CPU's input queue at the time of last enqueue.
  */
@@ -1016,6 +1044,8 @@ struct net_device {
 	unsigned long		tx_queue_len;	/* Max frames per queue allowed */
 	spinlock_t		tx_global_lock;
 
+	struct xps_dev_maps	*xps_maps;
+
 	/* These may be needed for future network-power-down code. */
 
 	/*
diff --git a/net/core/dev.c b/net/core/dev.c
index 7b17674..c852f00 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1557,12 +1557,19 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
  */
 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 {
+	int rc;
+
 	if (txq < 1 || txq > dev->num_tx_queues)
 		return -EINVAL;
 
 	if (dev->reg_state == NETREG_REGISTERED) {
 		ASSERT_RTNL();
 
+		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
+						  txq);
+		if (rc)
+			return rc;
+
 		if (txq < dev->real_num_tx_queues)
 			qdisc_reset_all_tx_gt(dev, txq);
 	}
@@ -2142,6 +2146,44 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
 	return queue_index;
 }
 
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_RPS
+	struct xps_dev_maps *dev_maps;
+	struct xps_map *map;
+	int queue_index = -1;
+
+	rcu_read_lock();
+	dev_maps = rcu_dereference(dev->xps_maps);
+	if (dev_maps) {
+		map = rcu_dereference(
+		    dev_maps->cpu_map[raw_smp_processor_id()]);
+		if (map) {
+			if (map->len == 1)
+				queue_index = map->queues[0];
+			else {
+				u32 hash;
+				if (skb->sk && skb->sk->sk_hash)
+					hash = skb->sk->sk_hash;
+				else
+					hash = (__force u16) skb->protocol ^
+					    skb->rxhash;
+				hash = jhash_1word(hash, hashrnd);
+				queue_index = map->queues[
+				    ((u64)hash * map->len) >> 32];
+			}
+			if (unlikely(queue_index >= dev->real_num_tx_queues))
+				queue_index = -1;
+		}
+	}
+	rcu_read_unlock();
+
+	return queue_index;
+#else
+	return -1;
+#endif
+}
+
 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 					struct sk_buff *skb)
 {
@@ -2161,7 +2203,9 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 		    queue_index >= dev->real_num_tx_queues) {
 			int old_index = queue_index;
 
-			queue_index = skb_tx_hash(dev, skb);
+			queue_index = get_xps_queue(dev, skb);
+			if (queue_index < 0)
+				queue_index = skb_tx_hash(dev, skb);
 
 			if (queue_index != old_index && sk) {
 				struct dst_entry *dst =
@@ -5066,6 +5110,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
 {
 	unsigned int count = dev->num_tx_queues;
 	struct netdev_queue *tx;
+	int i;
 
 	BUG_ON(count < 1);
 
@@ -5076,6 +5121,10 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
 		return -ENOMEM;
 	}
 	dev->_tx = tx;
+
+	for (i = 0; i < count; i++)
+		tx[i].dev = dev;
+
 	return 0;
 }
 
@@ -5083,8 +5132,6 @@ static void netdev_init_one_queue(struct net_device *dev,
 				  struct netdev_queue *queue,
 				  void *_unused)
 {
-	queue->dev = dev;
-
 	/* Initialize queue lock */
 	spin_lock_init(&queue->_xmit_lock);
 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7abeb7c..68dbbfd 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -772,18 +772,377 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 	return error;
 }
 
-static int rx_queue_register_kobjects(struct net_device *net)
+/*
+ * netdev_queue sysfs structures and functions.
+ */
+struct netdev_queue_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct netdev_queue *queue,
+	    struct netdev_queue_attribute *attr, char *buf);
+	ssize_t (*store)(struct netdev_queue *queue,
+	    struct netdev_queue_attribute *attr, const char *buf, size_t len);
+};
+#define to_netdev_queue_attr(_attr) container_of(_attr,		\
+    struct netdev_queue_attribute, attr)
+
+#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
+
+static ssize_t netdev_queue_attr_show(struct kobject *kobj,
+				      struct attribute *attr, char *buf)
+{
+	struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+	struct netdev_queue *queue = to_netdev_queue(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	return attribute->show(queue, attribute, buf);
+}
+
+static ssize_t netdev_queue_attr_store(struct kobject *kobj,
+				       struct attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+	struct netdev_queue *queue = to_netdev_queue(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	return attribute->store(queue, attribute, buf, count);
+}
+
+static const struct sysfs_ops netdev_queue_sysfs_ops = {
+	.show = netdev_queue_attr_show,
+	.store = netdev_queue_attr_store,
+};
+
+static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
 {
+	struct net_device *dev = queue->dev;
+	int i;
+
+	for (i = 0; i < dev->num_tx_queues; i++)
+		if (queue == &dev->_tx[i])
+			break;
+
+	BUG_ON(i >= dev->num_tx_queues);
+
+	return i;
+}
+
+
+static ssize_t show_xps_map(struct netdev_queue *queue,
+			    struct netdev_queue_attribute *attribute, char *buf)
+{
+	struct net_device *dev = queue->dev;
+	struct xps_dev_maps *dev_maps;
+	cpumask_var_t mask;
+	unsigned long index;
+	size_t len = 0;
+	int i;
+
+	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	index = get_netdev_queue_index(queue);
+
+	rcu_read_lock();
+	dev_maps = rcu_dereference(dev->xps_maps);
+	if (dev_maps) {
+		for_each_possible_cpu(i) {
+			struct xps_map *map =
+			    rcu_dereference(dev_maps->cpu_map[i]);
+			if (map) {
+				int j;
+				for (j = 0; j < map->len; j++) {
+					if (map->queues[j] == index) {
+						cpumask_set_cpu(i, mask);
+						break;
+					}
+				}
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
+	if (PAGE_SIZE - len < 3) {
+		free_cpumask_var(mask);
+		return -EINVAL;
+	}
+
+	free_cpumask_var(mask);
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+
+static void xps_map_release(struct rcu_head *rcu)
+{
+	struct xps_map *map = container_of(rcu, struct xps_map, rcu);
+
+	kfree(map);
+}
+
+static void xps_dev_maps_release(struct rcu_head *rcu)
+{
+	struct xps_dev_maps *dev_maps =
+	    container_of(rcu, struct xps_dev_maps, rcu);
+
+	kfree(dev_maps);
+}
+
+static DEFINE_MUTEX(xps_map_mutex);
+
+static ssize_t store_xps_map(struct netdev_queue *queue,
+		      struct netdev_queue_attribute *attribute,
+		      const char *buf, size_t len)
+{
+	struct net_device *dev = queue->dev;
+	cpumask_var_t mask;
+	int err, i, cpu, pos, map_len, alloc_len, need_set;
+	unsigned long index;
+	struct xps_map *map, *new_map;
+	struct xps_dev_maps *dev_maps, *new_dev_maps;
+	int nonempty = 0;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	index = get_netdev_queue_index(queue);
+
+	err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+	if (err) {
+		free_cpumask_var(mask);
+		return err;
+	}
+
+	new_dev_maps = kzalloc(max_t(unsigned,
+	    XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL);
+	if (!new_dev_maps) {
+		free_cpumask_var(mask);
+		return -ENOMEM;
+	}
+
+	mutex_lock(&xps_map_mutex);
+
+	dev_maps = dev->xps_maps;
+
+	for_each_possible_cpu(cpu) {
+		new_map = map = dev_maps ? dev_maps->cpu_map[cpu] : NULL;
+
+		if (map) {
+			for (pos = 0; pos < map->len; pos++)
+				if (map->queues[pos] == index)
+					break;
+			map_len = map->len;
+			alloc_len = map->alloc_len;
+		} else
+			pos = map_len = alloc_len = 0;
+
+		need_set = cpu_isset(cpu, *mask) && cpu_online(cpu);
+
+		if (need_set && pos >= map_len) {
+			/* Need to add queue to this CPU's map */
+			if (map_len >= alloc_len) {
+				alloc_len = alloc_len ?
+				    2 * alloc_len : XPS_MIN_MAP_ALLOC;
+				new_map = kzalloc(XPS_MAP_SIZE(alloc_len),
+				    GFP_KERNEL);
+				if (!new_map)
+					goto error;
+				new_map->alloc_len = alloc_len;
+				for (i = 0; i < map_len; i++)
+					new_map->queues[i] = map->queues[i];
+				new_map->len = map_len;
+			}
+			new_map->queues[new_map->len++] = index;
+		} else if (!need_set && pos < map_len) {
+			/* Need to remove queue from this CPU's map */
+			if (map_len > 1)
+				new_map->queues[pos] =
+				    new_map->queues[--new_map->len];
+			else
+				new_map = NULL;
+		}
+		new_dev_maps->cpu_map[cpu] = new_map;
+	}
+
+	/* Cleanup old maps */
+	for_each_possible_cpu(cpu) {
+		map = dev_maps ? dev_maps->cpu_map[cpu] : NULL;
+		if (map && new_dev_maps->cpu_map[cpu] != map)
+			call_rcu(&map->rcu, xps_map_release);
+		if (new_dev_maps->cpu_map[cpu])
+			nonempty = 1;
+	}
+
+	if (nonempty)
+		rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+	else {
+		kfree(new_dev_maps);
+		rcu_assign_pointer(dev->xps_maps, NULL);
+	}
+
+	if (dev_maps)
+		call_rcu(&dev_maps->rcu, xps_dev_maps_release);
+
+	mutex_unlock(&xps_map_mutex);
+
+	free_cpumask_var(mask);
+	return len;
+
+error:
+	mutex_unlock(&xps_map_mutex);
+
+	if (new_dev_maps)
+		for_each_possible_cpu(i)
+			kfree(new_dev_maps->cpu_map[i]);
+	kfree(new_dev_maps);
+	free_cpumask_var(mask);
+	return -ENOMEM;
+}
+
+static struct netdev_queue_attribute xps_cpus_attribute =
+    __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+
+static struct attribute *netdev_queue_default_attrs[] = {
+	&xps_cpus_attribute.attr,
+	NULL
+};
+
+static void netdev_queue_release(struct kobject *kobj)
+{
+	struct netdev_queue *queue = to_netdev_queue(kobj);
+	struct net_device *dev = queue->dev;
+	struct xps_dev_maps *dev_maps;
+	struct xps_map *map;
+	unsigned long index;
+	int i, pos, nonempty = 0;
+
+	index = get_netdev_queue_index(queue);
+
+	mutex_lock(&xps_map_mutex);
+	dev_maps = dev->xps_maps;
+
+	if (dev_maps) {
+		for_each_possible_cpu(i) {
+			map  = dev_maps->cpu_map[i];
+			if (!map)
+				continue;
+
+			for (pos = 0; pos < map->len; pos++)
+				if (map->queues[pos] == index)
+					break;
+
+			if (pos < map->len) {
+				if (map->len > 1)
+					map->queues[pos] =
+					    map->queues[--map->len];
+				else {
+					RCU_INIT_POINTER(dev_maps->cpu_map[i],
+					    NULL);
+					call_rcu(&map->rcu, xps_map_release);
+					map = NULL;
+				}
+			}
+			if (map)
+				nonempty = 1;
+		}
+
+		if (!nonempty) {
+			RCU_INIT_POINTER(dev->xps_maps, NULL);
+			call_rcu(&dev_maps->rcu, xps_dev_maps_release);
+		}
+	}
+
+	mutex_unlock(&xps_map_mutex);
+
+	memset(kobj, 0, sizeof(*kobj));
+	dev_put(queue->dev);
+}
+
+static struct kobj_type netdev_queue_ktype = {
+	.sysfs_ops = &netdev_queue_sysfs_ops,
+	.release = netdev_queue_release,
+	.default_attrs = netdev_queue_default_attrs,
+};
+
+static int netdev_queue_add_kobject(struct net_device *net, int index)
+{
+	struct netdev_queue *queue = net->_tx + index;
+	struct kobject *kobj = &queue->kobj;
+	int error = 0;
+
+	kobj->kset = net->queues_kset;
+	error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
+	    "tx-%u", index);
+	if (error) {
+		kobject_put(kobj);
+		return error;
+	}
+
+	kobject_uevent(kobj, KOBJ_ADD);
+	dev_hold(queue->dev);
+
+	return error;
+}
+
+int
+netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
+{
+	int i;
+	int error = 0;
+
+	for (i = old_num; i < new_num; i++) {
+		error = netdev_queue_add_kobject(net, i);
+		if (error) {
+			new_num = old_num;
+			break;
+		}
+	}
+
+	while (--i >= new_num)
+		kobject_put(&net->_tx[i].kobj);
+
+	return error;
+}
+
+static int register_queue_kobjects(struct net_device *net)
+{
+	int error = 0, txq = 0, rxq = 0;
+
 	net->queues_kset = kset_create_and_add("queues",
 	    NULL, &net->dev.kobj);
 	if (!net->queues_kset)
 		return -ENOMEM;
-	return net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues);
+
+	error = net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues);
+	if (error)
+		goto error;
+	rxq = net->real_num_rx_queues;
+
+	error = netdev_queue_update_kobjects(net, 0,
+					     net->real_num_tx_queues);
+	if (error)
+		goto error;
+	txq = net->real_num_tx_queues;
+
+	return 0;
+
+error:
+	netdev_queue_update_kobjects(net, txq, 0);
+	net_rx_queue_update_kobjects(net, rxq, 0);
+	return error;
 }
 
-static void rx_queue_remove_kobjects(struct net_device *net)
+static void remove_queue_kobjects(struct net_device *net)
 {
 	net_rx_queue_update_kobjects(net, net->real_num_rx_queues, 0);
+	netdev_queue_update_kobjects(net, net->real_num_tx_queues, 0);
 	kset_unregister(net->queues_kset);
 }
 #endif /* CONFIG_RPS */
@@ -886,7 +1245,7 @@ void netdev_unregister_kobject(struct net_device * net)
 	kobject_get(&dev->kobj);
 
 #ifdef CONFIG_RPS
-	rx_queue_remove_kobjects(net);
+	remove_queue_kobjects(net);
 #endif
 
 	device_del(dev);
@@ -927,7 +1286,7 @@ int netdev_register_kobject(struct net_device *net)
 		return error;
 
 #ifdef CONFIG_RPS
-	error = rx_queue_register_kobjects(net);
+	error = register_queue_kobjects(net);
 	if (error) {
 		device_del(dev);
 		return error;
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index 778e157..25ec2ee 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -6,6 +6,9 @@ int netdev_register_kobject(struct net_device *);
 void netdev_unregister_kobject(struct net_device *);
 #ifdef CONFIG_RPS
 int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
+int netdev_queue_update_kobjects(struct net_device *net,
+				 int old_num, int new_num);
+
 #endif
 
 #endif
-- 
1.7.3.1



* Re: [PATCH 2/2 v7] xps: Transmit Packet Steering
From: Changli Gao @ 2010-11-22 11:42 UTC
  To: Tom Herbert; +Cc: davem, netdev, eric.dumazet

On Mon, Nov 22, 2010 at 7:17 AM, Tom Herbert <therbert@google.com> wrote:
[...]
> @@ -503,6 +503,10 @@ struct netdev_queue {
>        struct Qdisc            *qdisc;
>        unsigned long           state;
>        struct Qdisc            *qdisc_sleeping;
> +#ifdef CONFIG_RPS
> +       struct kobject          kobj;
> +#endif
> +

Why do you reuse CONFIG_RPS? I think it is confusing, as the enclosed
code is for XPS, not RPS.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH 2/2 v7] xps: Transmit Packet Steering
From: Eric Dumazet @ 2010-11-22 13:33 UTC
  To: Tom Herbert; +Cc: davem, netdev

On Sunday, 21 November 2010 at 15:17 -0800, Tom Herbert wrote:

...

> +
> +static DEFINE_MUTEX(xps_map_mutex);
> +
> +static ssize_t store_xps_map(struct netdev_queue *queue,
> +		      struct netdev_queue_attribute *attribute,
> +		      const char *buf, size_t len)
> +{
> +	struct net_device *dev = queue->dev;
> +	cpumask_var_t mask;
> +	int err, i, cpu, pos, map_len, alloc_len, need_set;
> +	unsigned long index;
> +	struct xps_map *map, *new_map;
> +	struct xps_dev_maps *dev_maps, *new_dev_maps;
> +	int nonempty = 0;
> +
> +	if (!capable(CAP_NET_ADMIN))
> +		return -EPERM;
> +
> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	index = get_netdev_queue_index(queue);
> +
> +	err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
> +	if (err) {
> +		free_cpumask_var(mask);
> +		return err;
> +	}
> +
> +	new_dev_maps = kzalloc(max_t(unsigned,
> +	    XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL);
> +	if (!new_dev_maps) {
> +		free_cpumask_var(mask);
> +		return -ENOMEM;
> +	}
> +
> +	mutex_lock(&xps_map_mutex);
> +
> +	dev_maps = dev->xps_maps;
> +
> +	for_each_possible_cpu(cpu) {
> +		new_map = map = dev_maps ? dev_maps->cpu_map[cpu] : NULL;
> +
> +		if (map) {
> +			for (pos = 0; pos < map->len; pos++)
> +				if (map->queues[pos] == index)
> +					break;
> +			map_len = map->len;
> +			alloc_len = map->alloc_len;
> +		} else
> +			pos = map_len = alloc_len = 0;
> +
> +		need_set = cpu_isset(cpu, *mask) && cpu_online(cpu);
> +
> +		if (need_set && pos >= map_len) {
> +			/* Need to add queue to this CPU's map */
> +			if (map_len >= alloc_len) {
> +				alloc_len = alloc_len ?
> +				    2 * alloc_len : XPS_MIN_MAP_ALLOC;
> +				new_map = kzalloc(XPS_MAP_SIZE(alloc_len),
> +				    GFP_KERNEL);

		kzalloc_node(size, GFP_KERNEL, cpu_to_node(cpu))

to get memory close to the cpu?

> [...]



* Re: [PATCH 2/2 v7] xps: Transmit Packet Steering
From: David Miller @ 2010-11-24 19:45 UTC
  To: therbert; +Cc: netdev, eric.dumazet

From: Tom Herbert <therbert@google.com>
Date: Sun, 21 Nov 2010 15:17:27 -0800 (PST)

> [...]
> Signed-off-by: Tom Herbert <therbert@google.com>

Applied, please consider Eric's feedback about map NUMA node placement.


* Re: [PATCH 2/2 v7] xps: Transmit Packet Steering
From: Ben Hutchings @ 2010-11-25 17:12 UTC
  To: Tom Herbert; +Cc: davem, netdev, eric.dumazet

On Sun, 2010-11-21 at 15:17 -0800, Tom Herbert wrote:
[...]
> Each transmit queue can be associated with a number of CPUs which will
> use the queue to send packets.  This is configured as a CPU mask on a
> per-queue basis in:
> 
> /sys/class/net/eth<n>/queues/tx-<n>/xps_cpus
> 
> The mappings are stored per device in an inverted data structure that
> maps CPUs to queues.  In the netdevice structure this is an array of
> num_possible_cpus() entries, where each entry holds an array of queue
> indexes for the queues that CPU can use.
[...]

I was thinking we could also use CPU reverse-mapping from IRQ affinity
to set this up automatically - maybe leaving an option for manual
override?
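
Something like the following userspace sketch of that idea, for
instance (the per-queue IRQ naming "eth0-tx-<n>" is driver-specific
and only an assumption here):

for q in /sys/class/net/eth0/queues/tx-*; do
	n=${q##*tx-}
	irq=$(awk -v name="eth0-tx-$n" '$NF == name { sub(":", "", $1); print $1 }' /proc/interrupts)
	[ -n "$irq" ] && cat /proc/irq/$irq/smp_affinity > "$q/xps_cpus"
done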

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.



* Re: [PATCH 2/2 v7] xps: Transmit Packet Steering
From: Tom Herbert @ 2010-11-26 17:13 UTC
  To: David Miller; +Cc: netdev, eric.dumazet

> Applied, please consider Eric's feedback about map NUMA node placement.

Yes, I intended to do that.  I was also planning to add Changli's
idea of adding CONFIG_XPS.  Also, I believe the case where CONFIG_RPS
is not enabled breaks the build; I'll send a fix as soon as possible.

Tom


* Re: [PATCH 2/2 v7] xps: Transmit Packet Steering
From: Eric Dumazet @ 2010-11-26 17:17 UTC
  To: Tom Herbert; +Cc: David Miller, netdev

On Friday, 26 November 2010 at 09:13 -0800, Tom Herbert wrote:
> > Applied, please consider Eric's feedback about map NUMA node placement.
> 
> Yes, I intended to do that.  I was also planning to add Changli's
> idea of adding CONFIG_XPS.  Also, I believe the case where CONFIG_RPS
> is not enabled breaks the build; I'll send a fix as soon as possible.
> 
> Tom

I'll provide a patch as soon as David pushes net-next-2.6, don't worry ;)

Thanks !




* [PATCH net-next-2.6] xps: NUMA allocations for per cpu data
From: Eric Dumazet @ 2010-11-28 15:43 UTC
  To: David Miller; +Cc: netdev, Tom Herbert

store_xps_map() allocates maps that are used by a single cpu, so it
makes sense to use NUMA allocations.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Tom Herbert <therbert@google.com>
---
 net/core/net-sysfs.c |    5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 68dbbfd..35f28b1 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -950,8 +950,9 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
 			if (map_len >= alloc_len) {
 				alloc_len = alloc_len ?
 				    2 * alloc_len : XPS_MIN_MAP_ALLOC;
-				new_map = kzalloc(XPS_MAP_SIZE(alloc_len),
-				    GFP_KERNEL);
+				new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len),
+						       GFP_KERNEL,
+						       cpu_to_node(cpu));
 				if (!new_map)
 					goto error;
 				new_map->alloc_len = alloc_len;




* Re: [PATCH net-next-2.6] xps: NUMA allocations for per cpu data
From: David Miller @ 2010-11-29 17:43 UTC
  To: eric.dumazet; +Cc: netdev, therbert

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 28 Nov 2010 16:43:24 +0100

> store_xps_map() allocates maps that are used by a single cpu, so it
> makes sense to use NUMA allocations.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Tom Herbert <therbert@google.com>

Applied.


* [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
From: Eric Dumazet @ 2010-11-29 18:14 UTC
  To: David Miller; +Cc: Tom Herbert, netdev, Ben Hutchings

I was thinking of using XPS tx_queue->cpu mapping to eventually allocate
memory with correct NUMA affinities, for qdisc/class stuff for example.

Here is a first patch to allocate qdiscs with proper NUMA affinity.

Tested on my 16-cpu machine

echo 0001 >/sys/class/net/eth1/queues/tx-0/xps_cpus
echo 0002 >/sys/class/net/eth1/queues/tx-1/xps_cpus
echo 0004 >/sys/class/net/eth1/queues/tx-2/xps_cpus
echo 0008 >/sys/class/net/eth1/queues/tx-3/xps_cpus
echo 0010 >/sys/class/net/eth1/queues/tx-4/xps_cpus
echo 0020 >/sys/class/net/eth1/queues/tx-5/xps_cpus
echo 0040 >/sys/class/net/eth1/queues/tx-6/xps_cpus
echo 0080 >/sys/class/net/eth1/queues/tx-7/xps_cpus
echo 0100 >/sys/class/net/eth1/queues/tx-8/xps_cpus
echo 0200 >/sys/class/net/eth1/queues/tx-9/xps_cpus
echo 0400 >/sys/class/net/eth1/queues/tx-10/xps_cpus
echo 0800 >/sys/class/net/eth1/queues/tx-11/xps_cpus
echo 1000 >/sys/class/net/eth1/queues/tx-12/xps_cpus
echo 2000 >/sys/class/net/eth1/queues/tx-13/xps_cpus
echo 4000 >/sys/class/net/eth1/queues/tx-14/xps_cpus
echo 8000 >/sys/class/net/eth1/queues/tx-15/xps_cpus

tc qdisc del dev eth1 root
tc qdisc add dev eth1 root mq

Thanks

[PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity

Allocate qdisc memory according to the NUMA properties of the cpus
included in the xps map.

To be effective, the qdisc should be (re)setup after changes
of /sys/class/net/eth<n>/queues/tx-<n>/xps_cpus

I added a numa_node field in struct netdev_queue, containing the NUMA
node if all cpus included in xps_cpus share the same node, else -1.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Tom Herbert <therbert@google.com>
---
 include/linux/netdevice.h |   20 +++++++++++++++++++-
 net/core/dev.c            |    5 +++--
 net/core/net-sysfs.c      |   12 +++++++++++-
 net/sched/sch_generic.c   |    4 +++-
 4 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9ae4544..f912de7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -508,7 +508,9 @@ struct netdev_queue {
 #ifdef CONFIG_RPS
 	struct kobject		kobj;
 #endif
-
+#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
+	int			numa_node;
+#endif
 /*
  * write mostly part
  */
@@ -523,6 +525,22 @@ struct netdev_queue {
 	u64			tx_dropped;
 } ____cacheline_aligned_in_smp;
 
+static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
+{
+#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
+	return q->numa_node;
+#else
+	return -1;
+#endif
+}
+
+static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node)
+{
+#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
+	q->numa_node = node;
+#endif
+}
+
 #ifdef CONFIG_RPS
 /*
  * This structure holds an RPS map which can be of variable length.  The
diff --git a/net/core/dev.c b/net/core/dev.c
index 3259d2c..cd24374 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5125,9 +5125,10 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
 	}
 	dev->_tx = tx;
 
-	for (i = 0; i < count; i++)
+	for (i = 0; i < count; i++) {
+		netdev_queue_numa_node_write(&tx[i], -1);
 		tx[i].dev = dev;
-
+	}
 	return 0;
 }
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 99c1129..149dde0 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -911,6 +911,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
 	struct xps_map *map, *new_map;
 	struct xps_dev_maps *dev_maps, *new_dev_maps;
 	int nonempty = 0;
+	int numa_node = -2;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -950,7 +951,14 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
 			pos = map_len = alloc_len = 0;
 
 		need_set = cpu_isset(cpu, *mask) && cpu_online(cpu);
-
+#ifdef CONFIG_NUMA
+		if (need_set) {
+			if (numa_node == -2)
+				numa_node = cpu_to_node(cpu);
+			else if (numa_node != cpu_to_node(cpu))
+				numa_node = -1;
+		}
+#endif
 		if (need_set && pos >= map_len) {
 			/* Need to add queue to this CPU's map */
 			if (map_len >= alloc_len) {
@@ -996,6 +1004,8 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
 	if (dev_maps)
 		call_rcu(&dev_maps->rcu, xps_dev_maps_release);
 
+	netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : -1);
+
 	mutex_unlock(&xps_map_mutex);
 
 	free_cpumask_var(mask);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7f0bd89..0918834 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -553,7 +553,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	size = QDISC_ALIGN(sizeof(*sch));
 	size += ops->priv_size + (QDISC_ALIGNTO - 1);
 
-	p = kzalloc(size, GFP_KERNEL);
+	p = kzalloc_node(size, GFP_KERNEL,
+			 netdev_queue_numa_node_read(dev_queue));
+
 	if (!p)
 		goto errout;
 	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);




* Re: [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
From: Tom Herbert @ 2010-11-30 18:31 UTC
  To: Eric Dumazet; +Cc: David Miller, netdev, Ben Hutchings

On Mon, Nov 29, 2010 at 10:14 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> I was thinking of using XPS tx_queue->cpu mapping to eventually allocate
> memory with correct NUMA affinities, for qdisc/class stuff for example.
>

An interesting idea, but the real question is whether this can be used
for all queue-related allocations.  This includes those that drivers
allocate, which are probably done at initialization.

Tom


* Re: [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
From: Eric Dumazet @ 2010-11-30 18:39 UTC
  To: Tom Herbert; +Cc: David Miller, netdev, Ben Hutchings

On Tuesday, 30 November 2010 at 10:31 -0800, Tom Herbert wrote:
> On Mon, Nov 29, 2010 at 10:14 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > I was thinking of using XPS tx_queue->cpu mapping to eventually allocate
> > memory with correct NUMA affinities, for qdisc/class stuff for example.
> >
> 
> An interesting idea, but the real question is whether this can be used
> for all queue-related allocations.  This includes those that drivers
> allocate, which are probably done at initialization.
> 

This would need a callback to the device, to eventually re-allocate its
ring buffer (or whatever data structures it allocated).  Probably worth
it, considering the size of txbd on some NICs (up to one cache line per
entry).

Right now, they tend to allocate their memory on a single NUMA node, so
it is a problem.





* Re: [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
From: Ben Hutchings @ 2010-11-30 18:46 UTC
  To: Eric Dumazet; +Cc: Tom Herbert, David Miller, netdev

On Tue, 2010-11-30 at 19:39 +0100, Eric Dumazet wrote:
> [...]
> This would need a callback to the device, to eventually re-allocate its
> ring buffer (or whatever data structures it allocated).  Probably worth
> it, considering the size of txbd on some NICs (up to one cache line per
> entry).
> 
> Right now, they tend to allocate their memory on a single NUMA node, so
> it is a problem.

Yes, that's why I proposed an ethtool interface for reconfiguring this.
Although to be honest I haven't yet constructed a case where it made a
difference.  I think the most important objects to be allocated on the
right node are RX buffers, and as long as refill is scheduled on the
same CPU as the IRQ this already happens.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.



* Re: [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
From: David Miller @ 2010-11-30 18:48 UTC
  To: therbert; +Cc: eric.dumazet, netdev, bhutchings

From: Tom Herbert <therbert@google.com>
Date: Tue, 30 Nov 2010 10:31:48 -0800

> On Mon, Nov 29, 2010 at 10:14 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> I was thinking of using XPS tx_queue->cpu mapping to eventually allocate
>> memory with correct NUMA affinities, for qdisc/class stuff for example.
>>
> 
> An interesting idea, but the real question is whether this can be used
> for all queue-related allocations.  This includes those that drivers
> allocate, which are probably done at initialization.

Most drivers do, and all drivers ought to, allocate DMA queues and
whatnot when the interface is brought up.

That solves this particular issue.

For example, drivers/net/niu.c does this by calling
niu_alloc_channels() via niu_open().

The only thing we really can't handle currently is the netdev
itself (and the associated driver private).  Jesse Brandeburg
has been reminding me about this over and over :-)

There might be some things we can even do about that part.  For
example, we can put all of the things the driver touches in the
RX and TX fast paths via indirect pointers and therefore be able
to allocate and reallocate those portions as we want long after
device registry.

Doing the core netdev struct itself is too hard because it sits
in so many tables.


* Re: [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
From: Eric Dumazet @ 2010-11-30 18:52 UTC
  To: Ben Hutchings; +Cc: Tom Herbert, David Miller, netdev

On Tuesday, 30 November 2010 at 18:46 +0000, Ben Hutchings wrote:

> Yes, that's why I proposed an ethtool interface for reconfiguring this.
> Although to be honest I haven't yet constructed a case where it made a
> difference.  I think the most important objects to be allocated on the
> right node are RX buffers, and as long as refill is scheduled on the
> same CPU as the IRQ this already happens.
> 

Hmm, right now RX skbs are allocated on the right node, since they are
allocated on the node of the cpu handling the {soft}irq.

commit 564824b0c52c346

net: allocate skbs on local node

    commit b30973f877 (node-aware skb allocation) spread a wrong habit of
    allocating net driver skbs on a given memory node: the one closest to
    the NIC hardware.  This is wrong because as soon as we try to scale the
    network stack, we need to use many cpus to handle traffic and hit
    slub/slab management on cross-node allocations/frees when these cpus
    have to alloc/free skbs bound to a central node.

    skbs allocated in the RX path are ephemeral, they have a very short
    lifetime: the extra cost to maintain NUMA affinity is too expensive.
    What appeared as a nice idea four years ago is in fact a bad one.

    In 2010, NIC hardware is multiqueue, or we use RPS to spread the load,
    and two 10Gb NICs might deliver more than 28 million packets per second,
    needing all the available cpus.

    The cost of cross-node handling in the network and vm stacks outweighs
    the small benefit hardware had when doing its DMA transfer in its
    'local' memory node at RX time.  Even trying to differentiate the two
    allocations done for one skb (the sk_buff on the local node, the data
    part on the NIC hardware node) is not enough to bring good performance.




* Re: [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
From: Eric Dumazet @ 2010-11-30 19:07 UTC
  To: David Miller; +Cc: therbert, netdev, bhutchings

On Tuesday, 30 November 2010 at 10:48 -0800, David Miller wrote:

> Most drivers do, and all drivers ought to, allocate DMA queues and
> whatnot when the interface is brought up.
> 
> That solves this particular issue.
> 
> For example, drivers/net/niu.c does this by calling
> niu_alloc_channels() via niu_open().
> 
> The only thing we really can't handle currently is the netdev
> itself (and the associated driver private).  Jesse Brandeburg
> has been reminding me about this over and over :-)
> 
> There might be some things we can even do about that part.  For
> example, we can put all of the things the driver touches in the
> RX and TX fast paths via indirect pointers and therefore be able
> to allocate and reallocate those portions as we want long after
> device registry.
> 
> Doing the core netdev struct itself is too hard because it sits
> in so many tables.

netdev struct itself is shared by all cpus, so there is no real choice,
unless you know one netdev will be used by a restricted set of
cpus/nodes... Probably very unlikely in practice.

This can probably be done right now with 
numactl .... modprobe ...


We could change (only on NUMA setups maybe)

struct netdev_queue *_tx;

to a

struct netdev_queue **_tx;

and allocate each "struct netdev_queue" on the appropriate node, but
adding one indirection level might be overkill...

For very hot, small structures (one or two cache lines), I am not sure
it's worth the pain.
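
As a minimal sketch of that indirection (hypothetical function; the
queue_to_cpu() helper and _ptx field are assumptions, not existing
kernel code):

/* Sketch only: allocate each netdev_queue on the memory node of the
 * cpu expected to use it, via one extra level of indirection.
 */
static int netif_alloc_netdev_queues_pernode(struct net_device *dev)
{
	unsigned int i, count = dev->num_tx_queues;
	struct netdev_queue **tx;

	tx = kcalloc(count, sizeof(*tx), GFP_KERNEL);
	if (!tx)
		return -ENOMEM;

	for (i = 0; i < count; i++) {
		int node = cpu_to_node(queue_to_cpu(dev, i)); /* hypothetical */

		tx[i] = kzalloc_node(sizeof(**tx), GFP_KERNEL, node);
		if (!tx[i])
			goto err;
		tx[i]->dev = dev;
	}
	dev->_ptx = tx;	/* hypothetical field standing in for _tx */
	return 0;
err:
	while (i--)
		kfree(tx[i]);
	kfree(tx);
	return -ENOMEM;
}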


Ben, could you remind us what your ethtool interface was?

Something to set up a NUMA map for RX queues and TX queues?

I can probably play with bnx2x and a custom module param to test how it
can help raw performance...





* Re: [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
From: Ben Hutchings @ 2010-11-30 19:19 UTC
  To: Eric Dumazet; +Cc: David Miller, therbert, netdev

On Tue, 2010-11-30 at 20:07 +0100, Eric Dumazet wrote:
[...]
> Ben, could you remind us what your ethtool interface was?
> 
> Something to set up a NUMA map for RX queues and TX queues?
> 
> I can probably play with bnx2x and a custom module param to test how it
> can help raw performance...

See these patches:

<http://patchwork.ozlabs.org/patch/65201/>
<http://patchwork.ozlabs.org/patch/65202/>

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.



* Re: [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
From: David Miller @ 2010-11-30 19:21 UTC
  To: eric.dumazet; +Cc: therbert, netdev, bhutchings, jesse.brandeburg

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 30 Nov 2010 20:07:27 +0100

[ Jesse CC:'d ]

> netdev struct itself is shared by all cpus, so there is no real choice,
> unless you know one netdev will be used by a restricted set of
> cpus/nodes... Probably very unlikely in practice.

Unfortunately Jesse has found non-trivial gains by NUMA-localizing the
netdev struct during routing tests in some configurations.

> We could change (only on NUMA setups maybe)
> 
> struct netdev_queue *_tx;
> 
> to a
> 
> struct netdev_queue **_tx;
> 
> and allocate each "struct netdev_queue" on the appropriate node, but
> adding one indirection level might be overkill...
> 
> For very hot, small structures (one or two cache lines), I am not sure
> it's worth the pain.

Jesse, do you think this would help the case you were testing?


* Re: [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
From: Brandeburg, Jesse @ 2010-11-30 20:01 UTC
  To: David Miller; +Cc: eric.dumazet, therbert, netdev, bhutchings



On Tue, 30 Nov 2010, David Miller wrote:

> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 30 Nov 2010 20:07:27 +0100
> 
> [ Jesse CC:'d ]
> 
> > netdev struct itself is shared by all cpus, so there is no real choice,
> > unless you know one netdev will be used by a restricted set of
> > cpus/nodes... Probably very unlikely in practice.
> 
> Unfortunately Jesse has found non-trivial gains by NUMA-localizing the
> netdev struct during routing tests in some configurations.

Thanks Dave, given the results I can see with current upstream (at least 
igb) I don't think that netdev access is hurting performance unless the 
driver is unwisely accessing netdev structs for write on multiple cpus 
simultaneously.

I think the trick is to have drivers that are concerned with this kind of
thing use a "hot path struct" at runtime.  Since the cache on NUMA
systems will still cache remote-node memory for reads, as long as it is
not written to, the read data will be housed in each cpu's L3.
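
A rough sketch of that kind of split (illustrative struct and field
names, not taken from any real driver):

/* Keep fields the fast path only reads apart from fields dirtied per
 * packet, so remote cpus can cache the read-mostly line cleanly.
 */
struct drv_tx_hot {
	/* read-mostly: safely cached on every node */
	void __iomem	*doorbell;
	u16		ring_size;

	/* write-hot: dirtied per transmitted packet, keep cpu-local */
	u16		next_to_use ____cacheline_aligned_in_smp;
	u64		packets;
};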
 
> > We could change (only on NUMA setups maybe)
> > 
> > struct netdev_queue *_tx;
> > 
> > to a
> > 
> > struct netdev_queue **_tx;
> > 
> > and allocate each "struct netdev_queue" on the appropriate node, but
> > adding one indirection level might be overkill...

I agree, probably overkill.

> > For very hot, small structures (one or two cache lines), I am not sure
> > it's worth the pain.
> 
> Jesse, do you think this would help the case you were testing?

I would be glad to test, but I am currently seeing pretty good results 
with upstream igb.  I'll retest with latest kernel and with 
# taskset 1 insmod igb.ko
echo 2 > /proc/irq/<igb irqs>/smp_affinity

(1 and 2 are different sockets on my machine)


* Re: [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
From: David Miller @ 2010-12-01 20:49 UTC
  To: eric.dumazet; +Cc: therbert, netdev, bhutchings

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 29 Nov 2010 19:14:37 +0100

> [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
> 
> Allocate qdisc memory according to the NUMA properties of the cpus
> included in the xps map.
> 
> To be effective, the qdisc should be (re)setup after changes
> of /sys/class/net/eth<n>/queues/tx-<n>/xps_cpus
> 
> I added a numa_node field in struct netdev_queue, containing the NUMA
> node if all cpus included in xps_cpus share the same node, else -1.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

I think this is good enough to cook in net-next-2.6

Please use "net sched: " instead of just plain "sched: " in your
commit header lines so that people don't confuse such changes
with process scheduler ones :-)


* Re: [PATCH net-next-2.6] sched: use xps information for qdisc NUMA affinity
From: Eric Dumazet @ 2010-12-01 20:55 UTC
  To: David Miller; +Cc: therbert, netdev, bhutchings

On Wednesday, 1 December 2010 at 12:49 -0800, David Miller wrote:

> I think this is good enough to cook in net-next-2.6
> 
> Please use "net sched: " instead of just plain "sched: " in your
> commit header lines so that people don't confuse such changes
> with process scheduler ones :-)

Yes, I'll try to remember next time, thanks!



