* [net-next rfc 0/3] increase the limit of tuntap queues
@ 2013-06-19  5:40 Jason Wang
  2013-06-19  5:40 ` [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array Jason Wang
                   ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Jason Wang @ 2013-06-19  5:40 UTC (permalink / raw)
  To: davem, edumazet, hkchu, mst, netdev, linux-kernel; +Cc: Jason Wang

Hi all:

This series tries to increase the limit of tuntap queues. Historically there
have been two reasons preventing us from doing this:

- We store the hash buckets in tun_struct, which makes tun_struct very large;
  this high order memory allocation fails easily when memory is fragmented.
- The netdev_queue and netdev_rx_queue arrays in the netdevice are allocated
  through kmalloc, which may also cause a high order memory allocation when we
  have several queues. E.g. sizeof(netdev_queue) is 320, which means a high
  order allocation will happen when the device has more than 12 queues.

So this series tries to address those issues by switching to flex array. All
entries are preallocated, and since flex array always does order-0 allocations,
we can safely increase the limit afterwards.
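
For reference, a rough sketch of the arithmetic behind the 12-queue figure
(assuming 4K pages and the 320-byte sizeof(netdev_queue) quoted above; exact
sizes vary with config options):

    12 * 320 = 3840 bytes <= 4096 (one page)  -> order-0 allocation
    13 * 320 = 4160 bytes >  4096             -> order-1 allocation

A flex array instead keeps an array of order-0 page pointers and
flex_array_get() indexes into the right page per element, so no single
allocation ever exceeds order 0.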

Only compile tested; comments and reviews are more than welcome.

Jason Wang (3):
  net: avoid high order memory allocation for queues by using flex
    array
  tuntap: reduce the size of tun_struct by using flex array
  tuntap: increase the max queues to 16

 drivers/net/tun.c         |   59 ++++++++++++++++++++++++++++++++------------
 include/linux/netdevice.h |   13 ++++++----
 net/core/dev.c            |   57 +++++++++++++++++++++++++++++++------------
 net/core/net-sysfs.c      |   15 +++++++----
 net/openvswitch/flow.c    |    2 +-
 5 files changed, 102 insertions(+), 44 deletions(-)


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-19  5:40 [net-next rfc 0/3] increase the limit of tuntap queues Jason Wang
@ 2013-06-19  5:40 ` Jason Wang
  2013-06-19  6:31   ` Eric Dumazet
  2013-06-19  5:40 ` [net-next rfc 2/3] tuntap: reduce the size of tun_struct " Jason Wang
  2013-06-19  5:40 ` [net-next rfc 3/3] tuntap: increase the max queues to 16 Jason Wang
  2 siblings, 1 reply; 26+ messages in thread
From: Jason Wang @ 2013-06-19  5:40 UTC (permalink / raw)
  To: davem, edumazet, hkchu, mst, netdev, linux-kernel; +Cc: Jason Wang

Currently, we use kcalloc to allocate the rx/tx queues for a net device, which
can easily lead to a high order memory allocation request when initializing a
multiqueue net device. We can simply avoid this by switching to flex array,
which always allocates at order zero.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/netdevice.h |   13 ++++++----
 net/core/dev.c            |   57 ++++++++++++++++++++++++++++++++------------
 net/core/net-sysfs.c      |   15 +++++++----
 3 files changed, 58 insertions(+), 27 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 09b4188..c0b5d04 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -32,6 +32,7 @@
 #include <linux/atomic.h>
 #include <asm/cache.h>
 #include <asm/byteorder.h>
+#include <linux/flex_array.h>
 
 #include <linux/percpu.h>
 #include <linux/rculist.h>
@@ -1230,7 +1231,7 @@ struct net_device {
 
 
 #ifdef CONFIG_RPS
-	struct netdev_rx_queue	*_rx;
+	struct flex_array	*_rx;
 
 	/* Number of RX queues allocated at register_netdev() time */
 	unsigned int		num_rx_queues;
@@ -1250,7 +1251,7 @@ struct net_device {
 /*
  * Cache lines mostly used on transmit path
  */
-	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
+	struct flex_array	*_tx ____cacheline_aligned_in_smp;
 
 	/* Number of TX queues allocated at alloc_netdev_mq() time  */
 	unsigned int		num_tx_queues;
@@ -1434,7 +1435,7 @@ static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 					 unsigned int index)
 {
-	return &dev->_tx[index];
+	return flex_array_get(dev->_tx, index);
 }
 
 static inline void netdev_for_each_tx_queue(struct net_device *dev,
@@ -1445,8 +1446,10 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 {
 	unsigned int i;
 
-	for (i = 0; i < dev->num_tx_queues; i++)
-		f(dev, &dev->_tx[i], arg);
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct netdev_queue *txq = flex_array_get(dev->_tx, i);
+		f(dev, txq, arg);
+	}
 }
 
 extern struct netdev_queue *netdev_pick_tx(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index fa007db..3a4ecb1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,7 @@
 #include <linux/cpu_rmap.h>
 #include <linux/static_key.h>
 #include <linux/hashtable.h>
+#include <linux/flex_array.h>
 
 #include "net-sysfs.h"
 
@@ -2902,7 +2903,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		if (rxq_index == skb_get_rx_queue(skb))
 			goto out;
 
-		rxqueue = dev->_rx + rxq_index;
+		rxqueue = flex_array_get(dev->_rx, rxq_index);
 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
 		if (!flow_table)
 			goto out;
@@ -2950,9 +2951,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 				  dev->name, index, dev->real_num_rx_queues);
 			goto done;
 		}
-		rxqueue = dev->_rx + index;
+		rxqueue = flex_array_get(dev->_rx, index);
 	} else
-		rxqueue = dev->_rx;
+		rxqueue = flex_array_get(dev->_rx, 0);
 
 	map = rcu_dereference(rxqueue->rps_map);
 	if (map) {
@@ -3038,7 +3039,7 @@ done:
 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 			 u32 flow_id, u16 filter_id)
 {
-	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+	struct netdev_rx_queue *rxqueue = flex_array_get(dev->_rx, rxq_index);
 	struct rps_dev_flow_table *flow_table;
 	struct rps_dev_flow *rflow;
 	bool expire = true;
@@ -5223,18 +5224,31 @@ EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 static int netif_alloc_rx_queues(struct net_device *dev)
 {
 	unsigned int i, count = dev->num_rx_queues;
-	struct netdev_rx_queue *rx;
+	struct flex_array *rx;
+	int err;
 
 	BUG_ON(count < 1);
 
-	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
-	if (!rx)
+	rx = flex_array_alloc(sizeof(struct netdev_rx_queue), count,
+			      GFP_KERNEL | __GFP_ZERO);
+	if (!rx) {
+		pr_err("netdev: Unable to allocate flex array for rx queues\n");
 		return -ENOMEM;
+	}
+
+	err = flex_array_prealloc(rx, 0, count, GFP_KERNEL | __GFP_ZERO);
+	if (err) {
+		pr_err("netdev: Unable to prealloc %u rx queues\n", count);
+		flex_array_free(rx);
+		return err;
+	}
 
 	dev->_rx = rx;
 
-	for (i = 0; i < count; i++)
-		rx[i].dev = dev;
+	for (i = 0; i < count; i++) {
+		struct netdev_rx_queue *rxq = flex_array_get(rx, i);
+		rxq->dev = dev;
+	}
 	return 0;
 }
 #endif
@@ -5256,13 +5270,24 @@ static void netdev_init_one_queue(struct net_device *dev,
 static int netif_alloc_netdev_queues(struct net_device *dev)
 {
 	unsigned int count = dev->num_tx_queues;
-	struct netdev_queue *tx;
+	struct flex_array *tx;
+	int err;
 
 	BUG_ON(count < 1);
 
-	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
-	if (!tx)
+	tx = flex_array_alloc(sizeof(struct netdev_queue), count,
+			      GFP_KERNEL | __GFP_ZERO);
+	if (!tx) {
+		pr_err("netdev: Unable to allocate flex array for tx queues\n");
 		return -ENOMEM;
+	}
+
+	err = flex_array_prealloc(tx, 0, count, GFP_KERNEL | __GFP_ZERO);
+	if (err) {
+		pr_err("netdev: Unable to prealloc %u tx queues\n", count);
+		flex_array_free(tx);
+		return err;
+	}
 
 	dev->_tx = tx;
 
@@ -5811,9 +5836,9 @@ free_all:
 
 free_pcpu:
 	free_percpu(dev->pcpu_refcnt);
-	kfree(dev->_tx);
+	flex_array_free(dev->_tx);
 #ifdef CONFIG_RPS
-	kfree(dev->_rx);
+	flex_array_free(dev->_rx);
 #endif
 
 free_p:
@@ -5836,9 +5861,9 @@ void free_netdev(struct net_device *dev)
 
 	release_net(dev_net(dev));
 
-	kfree(dev->_tx);
+	flex_array_free(dev->_tx);
 #ifdef CONFIG_RPS
-	kfree(dev->_rx);
+	flex_array_free(dev->_rx);
 #endif
 
 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 981fed3..4328c3a 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -22,6 +22,7 @@
 #include <linux/export.h>
 #include <linux/jiffies.h>
 #include <linux/pm_runtime.h>
+#include <linux/flex_array.h>
 
 #include "net-sysfs.h"
 
@@ -717,7 +718,7 @@ static struct kobj_type rx_queue_ktype = {
 
 static int rx_queue_add_kobject(struct net_device *net, int index)
 {
-	struct netdev_rx_queue *queue = net->_rx + index;
+	struct netdev_rx_queue *queue = flex_array_get(net->_rx, index);
 	struct kobject *kobj = &queue->kobj;
 	int error = 0;
 
@@ -751,8 +752,10 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 		}
 	}
 
-	while (--i >= new_num)
-		kobject_put(&net->_rx[i].kobj);
+	while (--i >= new_num) {
+		struct netdev_rx_queue *rxq = flex_array_get(net->_rx, i);
+		kobject_put(&rxq->kobj);
+	}
 
 	return error;
 #else
@@ -939,7 +942,7 @@ static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
 	int i;
 
 	for (i = 0; i < dev->num_tx_queues; i++)
-		if (queue == &dev->_tx[i])
+		if (queue == (struct netdev_queue *)flex_array_get(dev->_tx, i))
 			break;
 
 	BUG_ON(i >= dev->num_tx_queues);
@@ -1051,7 +1054,7 @@ static struct kobj_type netdev_queue_ktype = {
 
 static int netdev_queue_add_kobject(struct net_device *net, int index)
 {
-	struct netdev_queue *queue = net->_tx + index;
+	struct netdev_queue *queue = flex_array_get(net->_tx, index);
 	struct kobject *kobj = &queue->kobj;
 	int error = 0;
 
@@ -1093,7 +1096,7 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 	}
 
 	while (--i >= new_num) {
-		struct netdev_queue *queue = net->_tx + i;
+		struct netdev_queue *queue = flex_array_get(net->_tx, i);
 
 #ifdef CONFIG_BQL
 		sysfs_remove_group(&queue->kobj, &dql_group);
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [net-next rfc 2/3] tuntap: reduce the size of tun_struct by using flex array
  2013-06-19  5:40 [net-next rfc 0/3] increase the limit of tuntap queues Jason Wang
  2013-06-19  5:40 ` [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array Jason Wang
@ 2013-06-19  5:40 ` Jason Wang
  2013-06-19  5:40 ` [net-next rfc 3/3] tuntap: increase the max queues to 16 Jason Wang
  2 siblings, 0 replies; 26+ messages in thread
From: Jason Wang @ 2013-06-19  5:40 UTC (permalink / raw)
  To: davem, edumazet, hkchu, mst, netdev, linux-kernel; +Cc: Jason Wang

This patch switches to using a flex array to implement the flow caches, which
brings several advantages:

- it reduces the size of the tun_struct structure, which allows us to increase
  the upper limit of queues in the future.
- it avoids a higher order memory allocation that could otherwise be needed
  when switching to pure hashing in the flow cache, which may demand a larger
  array in the future.

After this patch, the size of tun_struct on x86_64 is reduced from 8512 to
328 bytes.
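
(For reference, a rough reconstruction of where the savings come from, assuming
TUN_NUM_FLOW_ENTRIES is 1024 and hlist_head is 8 bytes on x86_64: the embedded
bucket array costs 1024 * 8 = 8192 bytes, and replacing it with a single
flex_array pointer saves 8192 - 8 = 8184 bytes, i.e. 8512 - 8184 = 328.)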

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c      |   54 +++++++++++++++++++++++++++++++++++++----------
 net/openvswitch/flow.c |    2 +-
 2 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a344270..8c5c124 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -64,6 +64,7 @@
 #include <linux/nsproxy.h>
 #include <linux/virtio_net.h>
 #include <linux/rcupdate.h>
+#include <linux/flex_array.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
@@ -180,7 +181,7 @@ struct tun_struct {
 	int debug;
 #endif
 	spinlock_t lock;
-	struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
+	struct flex_array *flows;
 	struct timer_list flow_gc_timer;
 	unsigned long ageing_time;
 	unsigned int numdisabled;
@@ -239,10 +240,11 @@ static void tun_flow_flush(struct tun_struct *tun)
 
 	spin_lock_bh(&tun->lock);
 	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+		struct hlist_head *h = flex_array_get(tun->flows, i);
 		struct tun_flow_entry *e;
 		struct hlist_node *n;
 
-		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
+		hlist_for_each_entry_safe(e, n, h, hash_link)
 			tun_flow_delete(tun, e);
 	}
 	spin_unlock_bh(&tun->lock);
@@ -254,10 +256,11 @@ static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
 
 	spin_lock_bh(&tun->lock);
 	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+		struct hlist_head *h = flex_array_get(tun->flows, i);
 		struct tun_flow_entry *e;
 		struct hlist_node *n;
 
-		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
+		hlist_for_each_entry_safe(e, n, h, hash_link) {
 			if (e->queue_index == queue_index)
 				tun_flow_delete(tun, e);
 		}
@@ -277,10 +280,11 @@ static void tun_flow_cleanup(unsigned long data)
 
 	spin_lock_bh(&tun->lock);
 	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+		struct hlist_head *h = flex_array_get(tun->flows, i);
 		struct tun_flow_entry *e;
 		struct hlist_node *n;
 
-		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
+		hlist_for_each_entry_safe(e, n, h, hash_link) {
 			unsigned long this_timer;
 			count++;
 			this_timer = e->updated + delay;
@@ -307,7 +311,7 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
 	if (!rxhash)
 		return;
 	else
-		head = &tun->flows[tun_hashfn(rxhash)];
+		head = flex_array_get(tun->flows, tun_hashfn(rxhash));
 
 	rcu_read_lock();
 
@@ -356,7 +360,8 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
 
 	txq = skb_get_rxhash(skb);
 	if (txq) {
-		e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
+		e = tun_flow_find(flex_array_get(tun->flows, tun_hashfn(txq)),
+				  txq);
 		if (e)
 			txq = e->queue_index;
 		else
@@ -841,23 +846,45 @@ static const struct net_device_ops tap_netdev_ops = {
 #endif
 };
 
-static void tun_flow_init(struct tun_struct *tun)
+static int tun_flow_init(struct tun_struct *tun, bool mq)
 {
-	int i;
+	struct flex_array *buckets;
+	int i, err;
+
+	if (!mq)
+		return 0;
+
+	buckets = flex_array_alloc(sizeof(struct hlist_head),
+				TUN_NUM_FLOW_ENTRIES, GFP_KERNEL);
+	if (!buckets)
+		return -ENOMEM;
 
+	err = flex_array_prealloc(buckets, 0, TUN_NUM_FLOW_ENTRIES, GFP_KERNEL);
+	if (err) {
+		flex_array_free(buckets);
+		return -ENOMEM;
+	}
+
+	tun->flows = buckets;
 	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
-		INIT_HLIST_HEAD(&tun->flows[i]);
+		INIT_HLIST_HEAD((struct hlist_head *)
+				flex_array_get(buckets, i));
 
 	tun->ageing_time = TUN_FLOW_EXPIRE;
 	setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);
 	mod_timer(&tun->flow_gc_timer,
 		  round_jiffies_up(jiffies + tun->ageing_time));
+
+	return 0;
 }
 
 static void tun_flow_uninit(struct tun_struct *tun)
 {
-	del_timer_sync(&tun->flow_gc_timer);
-	tun_flow_flush(tun);
+	if (tun->flags & TUN_TAP_MQ) {
+		del_timer_sync(&tun->flow_gc_timer);
+		tun_flow_flush(tun);
+		flex_array_free(tun->flows);
+	}
 }
 
 /* Initialize net device. */
@@ -1660,7 +1687,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 			goto err_free_dev;
 
 		tun_net_init(dev);
-		tun_flow_init(tun);
+
+		err = tun_flow_init(tun, queues > 1);
+		if (err < 0)
+			goto err_free_dev;
 
 		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
 			TUN_USER_FEATURES;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 093c191..5787acc 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -241,7 +241,7 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
 	struct flex_array *buckets;
 	int i, err;
 
-	buckets = flex_array_alloc(sizeof(struct hlist_head *),
+	buckets = flex_array_alloc(sizeof(struct hlist_head),
 				   n_buckets, GFP_KERNEL);
 	if (!buckets)
 		return NULL;
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [net-next rfc 3/3] tuntap: increase the max queues to 16
  2013-06-19  5:40 [net-next rfc 0/3] increase the limit of tuntap queues Jason Wang
  2013-06-19  5:40 ` [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array Jason Wang
  2013-06-19  5:40 ` [net-next rfc 2/3] tuntap: reduce the size of tun_struct " Jason Wang
@ 2013-06-19  5:40 ` Jason Wang
  2013-06-19  6:34   ` Eric Dumazet
  2 siblings, 1 reply; 26+ messages in thread
From: Jason Wang @ 2013-06-19  5:40 UTC (permalink / raw)
  To: davem, edumazet, hkchu, mst, netdev, linux-kernel; +Cc: Jason Wang

Since we've reduced the size of tun_struct and use a flex array to allocate the
netdev queues, it's safe for us to increase the limit of queues in tuntap.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c |    5 +----
 1 files changed, 1 insertions(+), 4 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8c5c124..205f6aa 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -110,10 +110,7 @@ struct tap_filter {
 	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
 };
 
-/* DEFAULT_MAX_NUM_RSS_QUEUES were choosed to let the rx/tx queues allocated for
- * the netdevice to be fit in one page. So we can make sure the success of
- * memory allocation. TODO: increase the limit. */
-#define MAX_TAP_QUEUES DEFAULT_MAX_NUM_RSS_QUEUES
+#define MAX_TAP_QUEUES 16
 #define MAX_TAP_FLOWS  4096
 
 #define TUN_FLOW_EXPIRE (3 * HZ)
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-19  5:40 ` [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array Jason Wang
@ 2013-06-19  6:31   ` Eric Dumazet
  2013-06-19  7:14     ` Jason Wang
  2013-06-19  9:11     ` Michael S. Tsirkin
  0 siblings, 2 replies; 26+ messages in thread
From: Eric Dumazet @ 2013-06-19  6:31 UTC (permalink / raw)
  To: Jason Wang; +Cc: davem, edumazet, hkchu, mst, netdev, linux-kernel

On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
> Currently, we use kcalloc to allocate rx/tx queues for a net device which could
> be easily lead to a high order memory allocation request when initializing a
> multiqueue net device. We can simply avoid this by switching to use flex array
> which always allocate at order zero.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  include/linux/netdevice.h |   13 ++++++----
>  net/core/dev.c            |   57 ++++++++++++++++++++++++++++++++------------
>  net/core/net-sysfs.c      |   15 +++++++----
>  3 files changed, 58 insertions(+), 27 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 09b4188..c0b5d04 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -32,6 +32,7 @@
>  #include <linux/atomic.h>
>  #include <asm/cache.h>
>  #include <asm/byteorder.h>
> +#include <linux/flex_array.h>
>  
>  #include <linux/percpu.h>
>  #include <linux/rculist.h>
> @@ -1230,7 +1231,7 @@ struct net_device {
>  
> 
>  #ifdef CONFIG_RPS
> -	struct netdev_rx_queue	*_rx;
> +	struct flex_array	*_rx;
>  
>  	/* Number of RX queues allocated at register_netdev() time */
>  	unsigned int		num_rx_queues;
> @@ -1250,7 +1251,7 @@ struct net_device {
>  /*
>   * Cache lines mostly used on transmit path
>   */
> -	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
> +	struct flex_array	*_tx ____cacheline_aligned_in_smp;
>  

Using flex_array and adding overhead in this super critical part of
network stack, only to avoid order-1 allocations done in GFP_KERNEL
context is simply insane.

We can revisit this in 2050 if we ever need order-4 allocations or so,
and still use 4K pages.




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 3/3] tuntap: increase the max queues to 16
  2013-06-19  5:40 ` [net-next rfc 3/3] tuntap: increase the max queues to 16 Jason Wang
@ 2013-06-19  6:34   ` Eric Dumazet
  2013-06-19  7:15     ` Jason Wang
  2013-06-19 19:16     ` Jerry Chu
  0 siblings, 2 replies; 26+ messages in thread
From: Eric Dumazet @ 2013-06-19  6:34 UTC (permalink / raw)
  To: Jason Wang; +Cc: davem, edumazet, hkchu, mst, netdev, linux-kernel

On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
> Since we've reduce the size of tun_struct and use flex array to allocate netdev
> queues, it's safe for us to increase the limit of queues in tuntap.

It's already safe to increase the max queues to 16, without your patches 1
and 2.




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-19  6:31   ` Eric Dumazet
@ 2013-06-19  7:14     ` Jason Wang
  2013-06-19  9:11     ` Michael S. Tsirkin
  1 sibling, 0 replies; 26+ messages in thread
From: Jason Wang @ 2013-06-19  7:14 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, edumazet, hkchu, mst, netdev, linux-kernel

On 06/19/2013 02:31 PM, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
>> Currently, we use kcalloc to allocate rx/tx queues for a net device which could
>> be easily lead to a high order memory allocation request when initializing a
>> multiqueue net device. We can simply avoid this by switching to use flex array
>> which always allocate at order zero.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>  include/linux/netdevice.h |   13 ++++++----
>>  net/core/dev.c            |   57 ++++++++++++++++++++++++++++++++------------
>>  net/core/net-sysfs.c      |   15 +++++++----
>>  3 files changed, 58 insertions(+), 27 deletions(-)
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 09b4188..c0b5d04 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -32,6 +32,7 @@
>>  #include <linux/atomic.h>
>>  #include <asm/cache.h>
>>  #include <asm/byteorder.h>
>> +#include <linux/flex_array.h>
>>  
>>  #include <linux/percpu.h>
>>  #include <linux/rculist.h>
>> @@ -1230,7 +1231,7 @@ struct net_device {
>>  
>>
>>  #ifdef CONFIG_RPS
>> -	struct netdev_rx_queue	*_rx;
>> +	struct flex_array	*_rx;
>>  
>>  	/* Number of RX queues allocated at register_netdev() time */
>>  	unsigned int		num_rx_queues;
>> @@ -1250,7 +1251,7 @@ struct net_device {
>>  /*
>>   * Cache lines mostly used on transmit path
>>   */
>> -	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
>> +	struct flex_array	*_tx ____cacheline_aligned_in_smp;
>>  
> Using flex_array and adding overhead in this super critical part of
> network stack, only to avoid order-1 allocations done in GFP_KERNEL
> context is simply insane.

Yes, and I also missed the fact that this is a GFP_KERNEL allocation.
> We can revisit this in 2050 if we ever need order-4 allocations or so,
> and still use 4K pages.
>
>

Will drop this patch, thanks.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 3/3] tuntap: increase the max queues to 16
  2013-06-19  6:34   ` Eric Dumazet
@ 2013-06-19  7:15     ` Jason Wang
  2013-06-19 19:16     ` Jerry Chu
  1 sibling, 0 replies; 26+ messages in thread
From: Jason Wang @ 2013-06-19  7:15 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, edumazet, hkchu, mst, netdev, linux-kernel

On 06/19/2013 02:34 PM, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
>> Since we've reduce the size of tun_struct and use flex array to allocate netdev
>> queues, it's safe for us to increase the limit of queues in tuntap.
> Its already safe to increase max queues to 16, without your patches 1 &
> 2
>
>
>

Will send a single patch to increase it to 16.

Thanks


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-19  6:31   ` Eric Dumazet
  2013-06-19  7:14     ` Jason Wang
@ 2013-06-19  9:11     ` Michael S. Tsirkin
  2013-06-19  9:56       ` Eric Dumazet
  1 sibling, 1 reply; 26+ messages in thread
From: Michael S. Tsirkin @ 2013-06-19  9:11 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jason Wang, davem, edumazet, hkchu, netdev, linux-kernel

On Tue, Jun 18, 2013 at 11:31:58PM -0700, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
> > Currently, we use kcalloc to allocate rx/tx queues for a net device which could
> > be easily lead to a high order memory allocation request when initializing a
> > multiqueue net device. We can simply avoid this by switching to use flex array
> > which always allocate at order zero.
> > 
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > ---
> >  include/linux/netdevice.h |   13 ++++++----
> >  net/core/dev.c            |   57 ++++++++++++++++++++++++++++++++------------
> >  net/core/net-sysfs.c      |   15 +++++++----
> >  3 files changed, 58 insertions(+), 27 deletions(-)
> > 
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 09b4188..c0b5d04 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -32,6 +32,7 @@
> >  #include <linux/atomic.h>
> >  #include <asm/cache.h>
> >  #include <asm/byteorder.h>
> > +#include <linux/flex_array.h>
> >  
> >  #include <linux/percpu.h>
> >  #include <linux/rculist.h>
> > @@ -1230,7 +1231,7 @@ struct net_device {
> >  
> > 
> >  #ifdef CONFIG_RPS
> > -	struct netdev_rx_queue	*_rx;
> > +	struct flex_array	*_rx;
> >  
> >  	/* Number of RX queues allocated at register_netdev() time */
> >  	unsigned int		num_rx_queues;
> > @@ -1250,7 +1251,7 @@ struct net_device {
> >  /*
> >   * Cache lines mostly used on transmit path
> >   */
> > -	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
> > +	struct flex_array	*_tx ____cacheline_aligned_in_smp;
> >  
> 
> Using flex_array and adding overhead in this super critical part of
> network stack, only to avoid order-1 allocations done in GFP_KERNEL
> context is simply insane.
> 
> We can revisit this in 2050 if we ever need order-4 allocations or so,
> and still use 4K pages.
> 
> 

Well KVM supports up to 160 VCPUs on x86.

Creating a queue per CPU is very reasonable, and
assuming cache line size of 64 bytes, netdev_queue seems to be 320
bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
I agree most people don't have such systems yet, but
they do exist.

We can cut the size of netdev_queue by moving kobj - which
does not seem to be used on the data path - out to a separate structure.
It's 64 bytes in size, so that leaves exactly 256 bytes.
That will get us an order-3 allocation, and there's
some padding there, so we won't immediately grow it again
the moment we add some fields.

Comments on this idea?

Instead of always using a flex array, we could have
+	struct netdev_queue     *_tx; /* Used with small # of queues */
+#ifdef CONFIG_NETDEV_HUGE_NUMBER_OF_QUEUES
+	struct flex_array     *_tx_large; /* Used with large # of queues */
+#endif

And fix wrappers to use _tx if not NULL, otherwise _tx_large.

If configured in, it's an extra branch on data path but probably less
costly than the extra indirection.
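
A rough sketch of what those wrappers could look like (hypothetical code;
CONFIG_NETDEV_HUGE_NUMBER_OF_QUEUES and _tx_large are just the names suggested
above, not existing kernel symbols):

static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
							unsigned int index)
{
#ifdef CONFIG_NETDEV_HUGE_NUMBER_OF_QUEUES
	/* _tx is left NULL when the queue count is too large for kmalloc() */
	if (unlikely(!dev->_tx))
		return flex_array_get(dev->_tx_large, index);
#endif
	return &dev->_tx[index];
}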

-- 
MST

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-19  9:11     ` Michael S. Tsirkin
@ 2013-06-19  9:56       ` Eric Dumazet
  2013-06-19 12:22         ` Michael S. Tsirkin
                           ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Eric Dumazet @ 2013-06-19  9:56 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, davem, edumazet, hkchu, netdev, linux-kernel

On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote:

> Well KVM supports up to 160 VCPUs on x86.
> 
> Creating a queue per CPU is very reasonable, and
> assuming cache line size of 64 bytes, netdev_queue seems to be 320
> bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
> I agree most people don't have such systems yet, but
> they do exist.

Even so, it will just work, like a fork() is likely to work, even if a
process needs order-1 allocation for kernel stack.

Some drivers still use order-10 allocations with kmalloc(), and nobody
complained yet.

We had complaints with the mlx4 driver lately only because kmalloc() now gives
a warning if allocations above MAX_ORDER are attempted.

Having a single pointer means that we can :

- Attempt a regular kmalloc() call; it will work most of the time.
- Fall back to vmalloc() _if_ kmalloc() failed.

Frankly, if you want one tx queue per cpu, I would rather use
NETIF_F_LLTX, like some other virtual devices.

This way, you can have real per cpu memory, with proper NUMA affinity.
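
A minimal sketch of that direction (illustrative only; the struct and field
names below are made up, not taken from tun):

/* per-cpu tx state, placed with proper NUMA affinity by the percpu allocator */
struct my_pcpu_txq {
	u64			packets;
	u64			bytes;
	struct u64_stats_sync	syncp;
};

struct my_priv {
	struct my_pcpu_txq __percpu *txq;
};

static int my_dev_init(struct net_device *dev)
{
	struct my_priv *p = netdev_priv(dev);

	dev->features |= NETIF_F_LLTX;	/* driver does its own tx serialization */
	p->txq = alloc_percpu(struct my_pcpu_txq);
	return p->txq ? 0 : -ENOMEM;
}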




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-19  9:56       ` Eric Dumazet
@ 2013-06-19 12:22         ` Michael S. Tsirkin
  2013-06-19 15:40         ` Michael S. Tsirkin
  2013-06-20  5:14         ` [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array Jason Wang
  2 siblings, 0 replies; 26+ messages in thread
From: Michael S. Tsirkin @ 2013-06-19 12:22 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jason Wang, davem, edumazet, hkchu, netdev, linux-kernel

On Wed, Jun 19, 2013 at 02:56:03AM -0700, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote:
> 
> > Well KVM supports up to 160 VCPUs on x86.
> > 
> > Creating a queue per CPU is very reasonable, and
> > assuming cache line size of 64 bytes, netdev_queue seems to be 320
> > bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
> > I agree most people don't have such systems yet, but
> > they do exist.
> 
> Even so, it will just work, like a fork() is likely to work, even if a
> process needs order-1 allocation for kernel stack.
> Some drivers still use order-10 allocations with kmalloc(), and nobody
> complained yet.
> 
> We had complains with mlx4 driver lately only bcause kmalloc() now gives
> a warning if allocations above MAX_ORDER are attempted.
> 
> Having a single pointer means that we can :
> 
> - Attempts a regular kmalloc() call, it will work most of the time.
> - fallback to vmalloc() _if_ kmalloc() failed.

Most drivers create devices at boot time, when this is more likely to work.
What makes tun (and macvlan) a bit special is that the device is created
from userspace.  Virt setups create/destroy them all the
time.

> 
> Frankly, if you want one tx queue per cpu, I would rather use
> NETIF_F_LLTX, like some other virtual devices.
> 
> This way, you can have real per cpu memory, with proper NUMA affinity.
> 

Hmm good point, worth looking at.

Thanks,
-- 
MST

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-19  9:56       ` Eric Dumazet
  2013-06-19 12:22         ` Michael S. Tsirkin
@ 2013-06-19 15:40         ` Michael S. Tsirkin
  2013-06-19 15:58           ` Eric Dumazet
  2013-06-20  5:14         ` [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array Jason Wang
  2 siblings, 1 reply; 26+ messages in thread
From: Michael S. Tsirkin @ 2013-06-19 15:40 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jason Wang, davem, edumazet, hkchu, netdev, linux-kernel

On Wed, Jun 19, 2013 at 02:56:03AM -0700, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote:
> 
> > Well KVM supports up to 160 VCPUs on x86.
> > 
> > Creating a queue per CPU is very reasonable, and
> > assuming cache line size of 64 bytes, netdev_queue seems to be 320
> > bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
> > I agree most people don't have such systems yet, but
> > they do exist.
> 
> Even so, it will just work, like a fork() is likely to work, even if a
> process needs order-1 allocation for kernel stack.
> 
> Some drivers still use order-10 allocations with kmalloc(), and nobody
> complained yet.
> 
> We had complains with mlx4 driver lately only bcause kmalloc() now gives
> a warning if allocations above MAX_ORDER are attempted.
> 
> Having a single pointer means that we can :
> 
> - Attempts a regular kmalloc() call, it will work most of the time.
> - fallback to vmalloc() _if_ kmalloc() failed.

That's a good trick too - vmalloc memory is a bit slower
on x86 since it's not using a huge page, but that's only
when we have lots of CPUs/queues...

Short term - how about switching to vmalloc if > 32 queues?

> Frankly, if you want one tx queue per cpu, I would rather use
> NETIF_F_LLTX, like some other virtual devices.
> 
> This way, you can have real per cpu memory, with proper NUMA affinity.
> 
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-19 15:40         ` Michael S. Tsirkin
@ 2013-06-19 15:58           ` Eric Dumazet
  2013-06-19 16:06               ` David Laight
  2013-06-19 18:07             ` Michael S. Tsirkin
  0 siblings, 2 replies; 26+ messages in thread
From: Eric Dumazet @ 2013-06-19 15:58 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, davem, edumazet, hkchu, netdev, linux-kernel

On Wed, 2013-06-19 at 18:40 +0300, Michael S. Tsirkin wrote:

> That's a good trick too - vmalloc memory is a bit slower
> on x86 since it's not using a huge page, but that's only
> when we have lots of CPUs/queues...
> 
> Short term - how about switching to vmalloc if > 32 queues?

You do not have to switch "if > 32 queues"

diff --git a/net/core/dev.c b/net/core/dev.c
index fa007db..473cc9e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,7 @@
 #include <linux/cpu_rmap.h>
 #include <linux/static_key.h>
 #include <linux/hashtable.h>
+#include <linux/vmalloc.h>
 
 #include "net-sysfs.h"
 
@@ -5253,6 +5254,14 @@ static void netdev_init_one_queue(struct net_device *dev,
 #endif
 }
 
+static void netif_free_tx_queues(struct net_device *dev)
+{
+	if (is_vmalloc_addr(dev->_tx))
+		vfree(dev->_tx);
+	else
+		kfree(dev->_tx);
+}
+
 static int netif_alloc_netdev_queues(struct net_device *dev)
 {
 	unsigned int count = dev->num_tx_queues;
@@ -5261,9 +5270,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
 	BUG_ON(count < 1);
 
 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
-	if (!tx)
-		return -ENOMEM;
-
+	if (!tx) {
+		tx = vzalloc(count * sizeof(struct netdev_queue));
+		if (!tx)
+			return -ENOMEM;
+	}
 	dev->_tx = tx;
 
 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
@@ -5811,7 +5822,7 @@ free_all:
 
 free_pcpu:
 	free_percpu(dev->pcpu_refcnt);
-	kfree(dev->_tx);
+	netif_free_tx_queues(dev);
 #ifdef CONFIG_RPS
 	kfree(dev->_rx);
 #endif
@@ -5836,7 +5847,7 @@ void free_netdev(struct net_device *dev)
 
 	release_net(dev_net(dev));
 
-	kfree(dev->_tx);
+	netif_free_tx_queues(dev);
 #ifdef CONFIG_RPS
 	kfree(dev->_rx);
 #endif



^ permalink raw reply related	[flat|nested] 26+ messages in thread

* RE: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-19 15:58           ` Eric Dumazet
@ 2013-06-19 16:06               ` David Laight
  2013-06-19 18:07             ` Michael S. Tsirkin
  1 sibling, 0 replies; 26+ messages in thread
From: David Laight @ 2013-06-19 16:06 UTC (permalink / raw)
  To: Eric Dumazet, Michael S. Tsirkin
  Cc: Jason Wang, davem, edumazet, hkchu, netdev, linux-kernel


> +static void netif_free_tx_queues(struct net_device *dev)
> +{
> +	if (is_vmalloc_addr(dev->_tx))
> +		vfree(dev->_tx);
> +	else
> +		kfree(dev->_tx);
> +}
> +
>  static int netif_alloc_netdev_queues(struct net_device *dev)
>  {
>  	unsigned int count = dev->num_tx_queues;
> @@ -5261,9 +5270,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
>  	BUG_ON(count < 1);
> 
>  	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
> -	if (!tx)
> -		return -ENOMEM;
> -
> +	if (!tx) {
> +		tx = vzalloc(count * sizeof(struct netdev_queue));
> +		if (!tx)
> +			return -ENOMEM;
> +	}
>  	dev->_tx = tx;

Given the number of places I've seen this code added, why
not put it in a general helper?

I also thought that kmalloc() with GFP_KERNEL would sleep.
Under what conditions does it fail instead?

	David

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
@ 2013-06-19 16:06               ` David Laight
  0 siblings, 0 replies; 26+ messages in thread
From: David Laight @ 2013-06-19 16:06 UTC (permalink / raw)
  To: Eric Dumazet, Michael S. Tsirkin
  Cc: Jason Wang, davem, edumazet, hkchu, netdev, linux-kernel

> +static void netif_free_tx_queues(struct net_device *dev)
> +{
> +	if (is_vmalloc_addr(dev->_tx))
> +		vfree(dev->_tx);
> +	else
> +		kfree(dev->_tx);
> +}
> +
>  static int netif_alloc_netdev_queues(struct net_device *dev)
>  {
>  	unsigned int count = dev->num_tx_queues;
> @@ -5261,9 +5270,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
>  	BUG_ON(count < 1);
> 
>  	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
> -	if (!tx)
> -		return -ENOMEM;
> -
> +	if (!tx) {
> +		tx = vzalloc(count * sizeof(struct netdev_queue));
> +		if (!tx)
> +			return -ENOMEM;
> +	}
>  	dev->_tx = tx;

Given the number of places I've seen this code added, why
not put it in a general helper?
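
A sketch of the kind of generic helper being asked for (hypothetical names;
this is just the same kmalloc-then-vmalloc fallback as above, which later
kernels eventually turned into the generic kvzalloc()/kvfree() helpers):

static void *net_zalloc_fallback(size_t sz)
{
	void *p = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);

	return p ? p : vzalloc(sz);
}

static void net_free_fallback(void *p)
{
	if (is_vmalloc_addr(p))
		vfree(p);
	else
		kfree(p);
}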

I also thought that kmalloc() with GFP_KERNEL would sleep.
Under what conditions does it fail instead?

	David


^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-19 16:06               ` David Laight
  (?)
@ 2013-06-19 16:28               ` Eric Dumazet
  -1 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2013-06-19 16:28 UTC (permalink / raw)
  To: David Laight
  Cc: Michael S. Tsirkin, Jason Wang, davem, edumazet, hkchu, netdev,
	linux-kernel

On Wed, 2013-06-19 at 17:06 +0100, David Laight wrote:

> Given the number of places I've seen this code added, why
> not put it in a general helper?

Good luck with that. We had many attempts in the past.

> I also thought that malloc() with GFP_KERNEL would sleep.
> Under what conditions does it fail instead?

For example on 32 bit kernel, LOW memory exhaustion/fragmentation.

vmalloc() has an extra 128MB virtual space (on i386 at least with
standard 3G/1G split)




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-19 15:58           ` Eric Dumazet
  2013-06-19 16:06               ` David Laight
@ 2013-06-19 18:07             ` Michael S. Tsirkin
  2013-06-20  8:15               ` [PATCH net-next] net: allow large number of tx queues Eric Dumazet
  1 sibling, 1 reply; 26+ messages in thread
From: Michael S. Tsirkin @ 2013-06-19 18:07 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jason Wang, davem, edumazet, hkchu, netdev, linux-kernel

On Wed, Jun 19, 2013 at 08:58:31AM -0700, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 18:40 +0300, Michael S. Tsirkin wrote:
> 
> > That's a good trick too - vmalloc memory is a bit slower
> > on x86 since it's not using a huge page, but that's only
> > when we have lots of CPUs/queues...
> > 
> > Short term - how about switching to vmalloc if > 32 queues?
> 
> You do not have to switch "if > 32 queues"
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index fa007db..473cc9e 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -130,6 +130,7 @@
>  #include <linux/cpu_rmap.h>
>  #include <linux/static_key.h>
>  #include <linux/hashtable.h>
> +#include <linux/vmalloc.h>
>  
>  #include "net-sysfs.h"
>  
> @@ -5253,6 +5254,14 @@ static void netdev_init_one_queue(struct net_device *dev,
>  #endif
>  }
>  
> +static void netif_free_tx_queues(struct net_device *dev)
> +{
> +	if (is_vmalloc_addr(dev->_tx))
> +		vfree(dev->_tx);
> +	else
> +		kfree(dev->_tx);
> +}
> +
>  static int netif_alloc_netdev_queues(struct net_device *dev)
>  {
>  	unsigned int count = dev->num_tx_queues;
> @@ -5261,9 +5270,11 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
>  	BUG_ON(count < 1);
>  
>  	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);

As we have explicit code to recover, maybe set __GFP_NOWARN?
If we are out of memory, vzalloc will warn too, we don't
need two warnings.

> -	if (!tx)
> -		return -ENOMEM;
> -
> +	if (!tx) {
> +		tx = vzalloc(count * sizeof(struct netdev_queue));
> +		if (!tx)
> +			return -ENOMEM;
> +	}
>  	dev->_tx = tx;
>  
>  	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
> @@ -5811,7 +5822,7 @@ free_all:
>  
>  free_pcpu:
>  	free_percpu(dev->pcpu_refcnt);
> -	kfree(dev->_tx);
> +	netif_free_tx_queues(dev);
>  #ifdef CONFIG_RPS
>  	kfree(dev->_rx);
>  #endif
> @@ -5836,7 +5847,7 @@ void free_netdev(struct net_device *dev)
>  
>  	release_net(dev_net(dev));
>  
> -	kfree(dev->_tx);
> +	netif_free_tx_queues(dev);
>  #ifdef CONFIG_RPS
>  	kfree(dev->_rx);
>  #endif
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 3/3] tuntap: increase the max queues to 16
  2013-06-19  6:34   ` Eric Dumazet
  2013-06-19  7:15     ` Jason Wang
@ 2013-06-19 19:16     ` Jerry Chu
  1 sibling, 0 replies; 26+ messages in thread
From: Jerry Chu @ 2013-06-19 19:16 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jason Wang, David Miller, Eric Dumazet, mst, netdev, linux-kernel

On Tue, Jun 18, 2013 at 11:34 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2013-06-19 at 13:40 +0800, Jason Wang wrote:
>> Since we've reduce the size of tun_struct and use flex array to allocate netdev
>> queues, it's safe for us to increase the limit of queues in tuntap.
>
> Its already safe to increase max queues to 16, without your patches 1 &
> 2

How about 32? Will kmem size be an issue?

As others have pointed out, it's best to allocate one queue per CPU
(virtual or real), and > 16 CPU machines are very common. (I only asked
for 16 initially out of concern for kmem size.)

Thanks,

Jerry

>
>
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-19  9:56       ` Eric Dumazet
  2013-06-19 12:22         ` Michael S. Tsirkin
  2013-06-19 15:40         ` Michael S. Tsirkin
@ 2013-06-20  5:14         ` Jason Wang
  2013-06-20  6:05           ` Eric Dumazet
  2 siblings, 1 reply; 26+ messages in thread
From: Jason Wang @ 2013-06-20  5:14 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Michael S. Tsirkin, davem, edumazet, hkchu, netdev, linux-kernel

On 06/19/2013 05:56 PM, Eric Dumazet wrote:
> On Wed, 2013-06-19 at 12:11 +0300, Michael S. Tsirkin wrote:
>
>> Well KVM supports up to 160 VCPUs on x86.
>>
>> Creating a queue per CPU is very reasonable, and
>> assuming cache line size of 64 bytes, netdev_queue seems to be 320
>> bytes, that's 320*160 = 51200. So 12.5 pages, order-4 allocation.
>> I agree most people don't have such systems yet, but
>> they do exist.
> Even so, it will just work, like a fork() is likely to work, even if a
> process needs order-1 allocation for kernel stack.
>
> Some drivers still use order-10 allocations with kmalloc(), and nobody
> complained yet.
>
> We had complains with mlx4 driver lately only bcause kmalloc() now gives
> a warning if allocations above MAX_ORDER are attempted.
>
> Having a single pointer means that we can :
>
> - Attempts a regular kmalloc() call, it will work most of the time.
> - fallback to vmalloc() _if_ kmalloc() failed.
>
> Frankly, if you want one tx queue per cpu, I would rather use
> NETIF_F_LLTX, like some other virtual devices.

A drawback of NETIF_F_LLTX is that we may contend on qdisc lock
especially when we have a huge number of tx queues.
>
> This way, you can have real per cpu memory, with proper NUMA affinity.
>
>
>


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array
  2013-06-20  5:14         ` [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array Jason Wang
@ 2013-06-20  6:05           ` Eric Dumazet
  0 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2013-06-20  6:05 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, davem, edumazet, hkchu, netdev, linux-kernel

On Thu, 2013-06-20 at 13:14 +0800, Jason Wang wrote:

> A drawback of NETIF_F_LLTX is that we may contend on qdisc lock
> especially when we have a huge number of tx queues.

For your information, the loopback driver is LLTX, and there is no qdisc on
it, unless you specifically add one.

Once you add a qdisc on a device, you hit the typical contention on a
spinlock. But it's hard to design a parallel qdisc. So far nobody has done
that.




^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH net-next] net: allow large number of tx queues
  2013-06-19 18:07             ` Michael S. Tsirkin
@ 2013-06-20  8:15               ` Eric Dumazet
  2013-06-20  8:35                 ` Michael S. Tsirkin
  2013-06-24  6:57                 ` David Miller
  0 siblings, 2 replies; 26+ messages in thread
From: Eric Dumazet @ 2013-06-20  8:15 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, davem, hkchu, netdev

From: Eric Dumazet <edumazet@google.com>

On Wed, 2013-06-19 at 21:07 +0300, Michael S. Tsirkin wrote:

> As we have explicit code to recover, maybe set __GFP_NOWARN?
> If we are out of memory, vzalloc will warn too, we don't
> need two warnings.

Yes, that's absolutely the right thing to do.

It's not yet clear this patch is really needed, but here it is.

Thanks

[PATCH net-next] net: allow large number of tx queues

netif_alloc_netdev_queues() uses kcalloc() to allocate memory
for the "struct netdev_queue *_tx" array.

For large number of tx queues, kcalloc() might fail, so this
patch does a fallback to vzalloc().

As vmalloc() adds overhead on a critical network path, add __GFP_REPEAT
to kzalloc() flags to do this fallback only when really needed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/core/dev.c |   26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index fa007db..722f633 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,7 @@
 #include <linux/cpu_rmap.h>
 #include <linux/static_key.h>
 #include <linux/hashtable.h>
+#include <linux/vmalloc.h>
 
 #include "net-sysfs.h"
 
@@ -5253,17 +5254,28 @@ static void netdev_init_one_queue(struct net_device *dev,
 #endif
 }
 
+static void netif_free_tx_queues(struct net_device *dev)
+{
+	if (is_vmalloc_addr(dev->_tx))
+		vfree(dev->_tx);
+	else
+		kfree(dev->_tx);
+}
+
 static int netif_alloc_netdev_queues(struct net_device *dev)
 {
 	unsigned int count = dev->num_tx_queues;
 	struct netdev_queue *tx;
+	size_t sz = count * sizeof(*tx);
 
-	BUG_ON(count < 1);
-
-	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
-	if (!tx)
-		return -ENOMEM;
+	BUG_ON(count < 1 || count > 0xffff);
 
+	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	if (!tx) {
+		tx = vzalloc(sz);
+		if (!tx)
+			return -ENOMEM;
+	}
 	dev->_tx = tx;
 
 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
@@ -5811,7 +5823,7 @@ free_all:
 
 free_pcpu:
 	free_percpu(dev->pcpu_refcnt);
-	kfree(dev->_tx);
+	netif_free_tx_queues(dev);
 #ifdef CONFIG_RPS
 	kfree(dev->_rx);
 #endif
@@ -5836,7 +5848,7 @@ void free_netdev(struct net_device *dev)
 
 	release_net(dev_net(dev));
 
-	kfree(dev->_tx);
+	netif_free_tx_queues(dev);
 #ifdef CONFIG_RPS
 	kfree(dev->_rx);
 #endif

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH net-next] net: allow large number of tx queues
  2013-06-20  8:15               ` [PATCH net-next] net: allow large number of tx queues Eric Dumazet
@ 2013-06-20  8:35                 ` Michael S. Tsirkin
  2013-06-21  6:41                   ` Jason Wang
  2013-06-24  6:57                 ` David Miller
  1 sibling, 1 reply; 26+ messages in thread
From: Michael S. Tsirkin @ 2013-06-20  8:35 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jason Wang, davem, hkchu, netdev, dwmw2

On Thu, Jun 20, 2013 at 01:15:51AM -0700, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> On Wed, 2013-06-19 at 21:07 +0300, Michael S. Tsirkin wrote:
> 
> > As we have explicit code to recover, maybe set __GFP_NOWARN?
> > If we are out of memory, vzalloc will warn too, we don't
> > need two warnings.
> 
> Yes, that's absolutely the right thing to do.
> 
> Its yet not clear this patch is really needed, but here it is.
> 
> Thanks
> 
> [PATCH net-next] net: allow large number of tx queues
> 
> netif_alloc_netdev_queues() uses kcalloc() to allocate memory
> for the "struct netdev_queue *_tx" array.
> 
> For large number of tx queues, kcalloc() might fail, so this
> patch does a fallback to vzalloc().
> 
> As vmalloc() adds overhead on a critical network path, add __GFP_REPEAT
> to kzalloc() flags to do this fallback only when really needed.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

FWIW

Acked-by: Michael S. Tsirkin <mst@redhat.com>

This makes it possible to go up to 1K queues which should
be enough, so we can revert the first chunk in
edfb6a148ce62e5e19354a1dcd9a34e00815c2a1.


> ---
>  net/core/dev.c |   26 +++++++++++++++++++-------
>  1 file changed, 19 insertions(+), 7 deletions(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index fa007db..722f633 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -130,6 +130,7 @@
>  #include <linux/cpu_rmap.h>
>  #include <linux/static_key.h>
>  #include <linux/hashtable.h>
> +#include <linux/vmalloc.h>
>  
>  #include "net-sysfs.h"
>  
> @@ -5253,17 +5254,28 @@ static void netdev_init_one_queue(struct net_device *dev,
>  #endif
>  }
>  
> +static void netif_free_tx_queues(struct net_device *dev)
> +{
> +	if (is_vmalloc_addr(dev->_tx))
> +		vfree(dev->_tx);
> +	else
> +		kfree(dev->_tx);
> +}
> +
>  static int netif_alloc_netdev_queues(struct net_device *dev)
>  {
>  	unsigned int count = dev->num_tx_queues;
>  	struct netdev_queue *tx;
> +	size_t sz = count * sizeof(*tx);
>  
> -	BUG_ON(count < 1);
> -
> -	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
> -	if (!tx)
> -		return -ENOMEM;
> +	BUG_ON(count < 1 || count > 0xffff);
>  
> +	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
> +	if (!tx) {
> +		tx = vzalloc(sz);
> +		if (!tx)
> +			return -ENOMEM;
> +	}
>  	dev->_tx = tx;
>  
>  	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
> @@ -5811,7 +5823,7 @@ free_all:
>  
>  free_pcpu:
>  	free_percpu(dev->pcpu_refcnt);
> -	kfree(dev->_tx);
> +	netif_free_tx_queues(dev);
>  #ifdef CONFIG_RPS
>  	kfree(dev->_rx);
>  #endif
> @@ -5836,7 +5848,7 @@ void free_netdev(struct net_device *dev)
>  
>  	release_net(dev_net(dev));
>  
> -	kfree(dev->_tx);
> +	netif_free_tx_queues(dev);
>  #ifdef CONFIG_RPS
>  	kfree(dev->_rx);
>  #endif
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH net-next] net: allow large number of tx queues
  2013-06-20  8:35                 ` Michael S. Tsirkin
@ 2013-06-21  6:41                   ` Jason Wang
  2013-06-21  7:12                     ` Eric Dumazet
  0 siblings, 1 reply; 26+ messages in thread
From: Jason Wang @ 2013-06-21  6:41 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Eric Dumazet, davem, hkchu, netdev, dwmw2

On 06/20/2013 04:35 PM, Michael S. Tsirkin wrote:
> On Thu, Jun 20, 2013 at 01:15:51AM -0700, Eric Dumazet wrote:
>> From: Eric Dumazet <edumazet@google.com>
>>
>> On Wed, 2013-06-19 at 21:07 +0300, Michael S. Tsirkin wrote:
>>
>>> As we have explicit code to recover, maybe set __GFP_NOWARN?
>>> If we are out of memory, vzalloc will warn too, we don't
>>> need two warnings.
>> Yes, that's absolutely the right thing to do.
>>
>> Its yet not clear this patch is really needed, but here it is.
>>
>> Thanks
>>
>> [PATCH net-next] net: allow large number of tx queues
>>
>> netif_alloc_netdev_queues() uses kcalloc() to allocate memory
>> for the "struct netdev_queue *_tx" array.
>>
>> For large number of tx queues, kcalloc() might fail, so this
>> patch does a fallback to vzalloc().
>>
>> As vmalloc() adds overhead on a critical network path, add __GFP_REPEAT
>> to kzalloc() flags to do this fallback only when really needed.
>>
>> Signed-off-by: Eric Dumazet <edumazet@google.com>
> FWIW
>
> Acked-by: Michael S. Tsirkin <mst@redhat.com>
>
> This makes it possible to go up to 1K queues which should
> be enough, so we can revert the first chunk in
> edfb6a148ce62e5e19354a1dcd9a34e00815c2a1.

1K queues (about 80 pages) looks a little bit aggressive, which means we
may always fall back to vmalloc()?
>
>
>> ---
>>  net/core/dev.c |   26 +++++++++++++++++++-------
>>  1 file changed, 19 insertions(+), 7 deletions(-)
>>
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index fa007db..722f633 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -130,6 +130,7 @@
>>  #include <linux/cpu_rmap.h>
>>  #include <linux/static_key.h>
>>  #include <linux/hashtable.h>
>> +#include <linux/vmalloc.h>
>>  
>>  #include "net-sysfs.h"
>>  
>> @@ -5253,17 +5254,28 @@ static void netdev_init_one_queue(struct net_device *dev,
>>  #endif
>>  }
>>  
>> +static void netif_free_tx_queues(struct net_device *dev)
>> +{
>> +	if (is_vmalloc_addr(dev->_tx))
>> +		vfree(dev->_tx);
>> +	else
>> +		kfree(dev->_tx);
>> +}
>> +
>>  static int netif_alloc_netdev_queues(struct net_device *dev)
>>  {
>>  	unsigned int count = dev->num_tx_queues;
>>  	struct netdev_queue *tx;
>> +	size_t sz = count * sizeof(*tx);
>>  
>> -	BUG_ON(count < 1);
>> -
>> -	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
>> -	if (!tx)
>> -		return -ENOMEM;
>> +	BUG_ON(count < 1 || count > 0xffff);
>>  
>> +	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
>> +	if (!tx) {
>> +		tx = vzalloc(sz);
>> +		if (!tx)
>> +			return -ENOMEM;
>> +	}
>>  	dev->_tx = tx;
>>  
>>  	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
>> @@ -5811,7 +5823,7 @@ free_all:
>>  
>>  free_pcpu:
>>  	free_percpu(dev->pcpu_refcnt);
>> -	kfree(dev->_tx);
>> +	netif_free_tx_queues(dev);
>>  #ifdef CONFIG_RPS
>>  	kfree(dev->_rx);
>>  #endif
>> @@ -5836,7 +5848,7 @@ void free_netdev(struct net_device *dev)
>>  
>>  	release_net(dev_net(dev));
>>  
>> -	kfree(dev->_tx);
>> +	netif_free_tx_queues(dev);
>>  #ifdef CONFIG_RPS
>>  	kfree(dev->_rx);
>>  #endif
>>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH net-next] net: allow large number of tx queues
  2013-06-21  6:41                   ` Jason Wang
@ 2013-06-21  7:12                     ` Eric Dumazet
  2013-06-23 10:29                       ` Michael S. Tsirkin
  0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2013-06-21  7:12 UTC (permalink / raw)
  To: Jason Wang; +Cc: Michael S. Tsirkin, davem, hkchu, netdev, dwmw2

On Fri, 2013-06-21 at 14:41 +0800, Jason Wang wrote:

> 1K queues (about 80 pages) looks a little bit aggressive which means we
> may always fall back to vmalloc()?

With 1K queues it looks like you should in fact use 0 queues (LLTX), so that
you design a minimal percpu structure in the driver to fit your needs.

Using a Qdisc, with MQ and pfifo_fast, on 1000 'queues' adds a fair
amount of overhead and memory usage.

For example, the loopback driver only needs a small percpu structure (16
bytes), as it immediately forwards the packet into netif_rx() [ which
also uses a percpu area called softnet_data ]
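
Roughly what that loopback pattern looks like (paraphrased from memory of
drivers/net/loopback.c of that era, so details may differ slightly):

/* 16 bytes on 64-bit: u64_stats_sync is empty there, no spinlock needed */
struct pcpu_lstats {
	u64			packets;
	u64			bytes;
	struct u64_stats_sync	syncp;
};

static netdev_tx_t loopback_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct pcpu_lstats *lb_stats;
	int len;

	skb_orphan(skb);
	skb->protocol = eth_type_trans(skb, dev);

	/* per-cpu area: no tx lock, no shared cache line to bounce */
	lb_stats = this_cpu_ptr(dev->lstats);

	len = skb->len;
	if (likely(netif_rx(skb) == NET_RX_SUCCESS)) {
		u64_stats_update_begin(&lb_stats->syncp);
		lb_stats->bytes += len;
		lb_stats->packets++;
		u64_stats_update_end(&lb_stats->syncp);
	}
	return NETDEV_TX_OK;
}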

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH net-next] net: allow large number of tx queues
  2013-06-21  7:12                     ` Eric Dumazet
@ 2013-06-23 10:29                       ` Michael S. Tsirkin
  0 siblings, 0 replies; 26+ messages in thread
From: Michael S. Tsirkin @ 2013-06-23 10:29 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jason Wang, davem, hkchu, netdev, dwmw2

On Fri, Jun 21, 2013 at 12:12:30AM -0700, Eric Dumazet wrote:
> On Fri, 2013-06-21 at 14:41 +0800, Jason Wang wrote:
> 
> > 1K queues (about 80 pages) looks a little bit aggressive which means we
> > may always fall back to vmalloc()?
> 
> 1K queues looks like you should use 0 queue (LLTX) in fact, so that you
> design the minimal percpu structure in the driver to fit the needs.
> 
> Using Qdisc, qith MQ, pfifo_fast, on 1000 'queues' is adding a fair
> amount of overhead and memory usage.
> 
> For example, loopback driver only needs small percpu structure (16
> bytes), as it immediately forward the packet into netif_rx() [ Which
> also use a percpu area called softnet_data ]
> 
> 

I agree it's worth considering for tun.

But the same number of queues will be there for virtio net,
and that's much more like a regular network device.

-- 
MST

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH net-next] net: allow large number of tx queues
  2013-06-20  8:15               ` [PATCH net-next] net: allow large number of tx queues Eric Dumazet
  2013-06-20  8:35                 ` Michael S. Tsirkin
@ 2013-06-24  6:57                 ` David Miller
  1 sibling, 0 replies; 26+ messages in thread
From: David Miller @ 2013-06-24  6:57 UTC (permalink / raw)
  To: eric.dumazet; +Cc: mst, jasowang, hkchu, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 20 Jun 2013 01:15:51 -0700

> From: Eric Dumazet <edumazet@google.com>
> 
> netif_alloc_netdev_queues() uses kcalloc() to allocate memory
> for the "struct netdev_queue *_tx" array.
> 
> For large number of tx queues, kcalloc() might fail, so this
> patch does a fallback to vzalloc().
> 
> As vmalloc() adds overhead on a critical network path, add __GFP_REPEAT
> to kzalloc() flags to do this fallback only when really needed.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Applied, thanks Eric.

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2013-06-24  6:57 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-06-19  5:40 [net-next rfc 0/3] increase the limit of tuntap queues Jason Wang
2013-06-19  5:40 ` [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array Jason Wang
2013-06-19  6:31   ` Eric Dumazet
2013-06-19  7:14     ` Jason Wang
2013-06-19  9:11     ` Michael S. Tsirkin
2013-06-19  9:56       ` Eric Dumazet
2013-06-19 12:22         ` Michael S. Tsirkin
2013-06-19 15:40         ` Michael S. Tsirkin
2013-06-19 15:58           ` Eric Dumazet
2013-06-19 16:06             ` David Laight
2013-06-19 16:06               ` David Laight
2013-06-19 16:28               ` Eric Dumazet
2013-06-19 18:07             ` Michael S. Tsirkin
2013-06-20  8:15               ` [PATCH net-next] net: allow large number of tx queues Eric Dumazet
2013-06-20  8:35                 ` Michael S. Tsirkin
2013-06-21  6:41                   ` Jason Wang
2013-06-21  7:12                     ` Eric Dumazet
2013-06-23 10:29                       ` Michael S. Tsirkin
2013-06-24  6:57                 ` David Miller
2013-06-20  5:14         ` [net-next rfc 1/3] net: avoid high order memory allocation for queues by using flex array Jason Wang
2013-06-20  6:05           ` Eric Dumazet
2013-06-19  5:40 ` [net-next rfc 2/3] tuntap: reduce the size of tun_struct " Jason Wang
2013-06-19  5:40 ` [net-next rfc 3/3] tuntap: increase the max queues to 16 Jason Wang
2013-06-19  6:34   ` Eric Dumazet
2013-06-19  7:15     ` Jason Wang
2013-06-19 19:16     ` Jerry Chu
