* [PATCH] ifb: add multi-queue support
@ 2009-11-10  8:30 Changli Gao
  2009-11-10  9:07 ` Eric Dumazet
  2009-11-10 10:29 ` Patrick McHardy
  0 siblings, 2 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-10  8:30 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, xiaosuo, Tom Herbert

ifb: add multi-queue support

Add multi-queue support; one kernel thread is created per queue.
It can be used to emulate a multi-queue NIC in software and to distribute
work among CPUs.
gentux linux # modprobe ifb numtxqs=2
gentux linux # ifconfig ifb0 up
gentux linux # pgrep ifb0
18508
18509
gentux linux # taskset -p 1 18508
pid 18508's current affinity mask: 3
pid 18508's new affinity mask: 1
gentux linux # taskset -p 2 18509
pid 18509's current affinity mask: 3
pid 18509's new affinity mask: 2
gentux linux # tc qdisc add dev br0 ingress
gentux linux # tc filter add dev br0 parent ffff: protocol ip basic action mirred egress redirect dev ifb0

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
drivers/net/ifb.c | 309 ++++++++++++++++++++++++++++++++----------------------
1 file changed, 186 insertions(+), 123 deletions(-)

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 030913f..6e04188 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -33,139 +33,101 @@
 #include <linux/etherdevice.h>
 #include <linux/init.h>
 #include <linux/moduleparam.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
 #include <net/pkt_sched.h>
 #include <net/net_namespace.h>
 
-#define TX_TIMEOUT  (2*HZ)
-
 #define TX_Q_LIMIT    32
+
 struct ifb_private {
-	struct tasklet_struct   ifb_tasklet;
-	int     tasklet_pending;
-	/* mostly debug stats leave in for now */
-	unsigned long   st_task_enter; /* tasklet entered */
-	unsigned long   st_txq_refl_try; /* transmit queue refill attempt */
-	unsigned long   st_rxq_enter; /* receive queue entered */
-	unsigned long   st_rx2tx_tran; /* receive to trasmit transfers */
-	unsigned long   st_rxq_notenter; /*receiveQ not entered, resched */
-	unsigned long   st_rx_frm_egr; /* received from egress path */
-	unsigned long   st_rx_frm_ing; /* received from ingress path */
-	unsigned long   st_rxq_check;
-	unsigned long   st_rxq_rsch;
-	struct sk_buff_head     rq;
-	struct sk_buff_head     tq;
+	struct net_device	*dev;
+	struct sk_buff_head	rq;
+	struct sk_buff_head	tq;
+	wait_queue_head_t	wq;
+	struct task_struct	*task;
 };
 
+/* Number of ifb devices to be set up by this module. */
 static int numifbs = 2;
+module_param(numifbs, int, 0444);
+MODULE_PARM_DESC(numifbs, "Number of ifb devices");
 
-static void ri_tasklet(unsigned long dev);
-static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
-static int ifb_open(struct net_device *dev);
-static int ifb_close(struct net_device *dev);
+/* Number of TX queues per ifb */
+static int numtxqs = 1;
+module_param(numtxqs, int, 0444);
+MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
 
-static void ri_tasklet(unsigned long dev)
+static int ifb_thread(void *priv)
 {
-
-	struct net_device *_dev = (struct net_device *)dev;
-	struct ifb_private *dp = netdev_priv(_dev);
-	struct net_device_stats *stats = &_dev->stats;
-	struct netdev_queue *txq;
+	struct ifb_private *dp = (struct ifb_private*)priv;
+	struct net_device *dev = dp->dev;
+	struct net_device_stats *stats = &dev->stats;
+	unsigned int num = dp - (struct ifb_private*)netdev_priv(dev);
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
 	struct sk_buff *skb;
-
-	txq = netdev_get_tx_queue(_dev, 0);
-	dp->st_task_enter++;
-	if ((skb = skb_peek(&dp->tq)) == NULL) {
-		dp->st_txq_refl_try++;
-		if (__netif_tx_trylock(txq)) {
-			dp->st_rxq_enter++;
-			while ((skb = skb_dequeue(&dp->rq)) != NULL) {
+	DEFINE_WAIT(wait);
+
+	while (1) {
+		/* move skb from rq to tq */
+		while (1) {
+			prepare_to_wait(&dp->wq, &wait, TASK_UNINTERRUPTIBLE);
+			while (!__netif_tx_trylock(txq))
+				yield();
+			while ((skb = skb_dequeue(&dp->rq)) != NULL)
 				skb_queue_tail(&dp->tq, skb);
-				dp->st_rx2tx_tran++;
-			}
+			if (netif_queue_stopped(dev))
+				netif_wake_queue(dev);
 			__netif_tx_unlock(txq);
-		} else {
-			/* reschedule */
-			dp->st_rxq_notenter++;
-			goto resched;
+			if (kthread_should_stop() || !skb_queue_empty(&dp->tq))
+				break;
+			schedule();
 		}
-	}
-
-	while ((skb = skb_dequeue(&dp->tq)) != NULL) {
-		u32 from = G_TC_FROM(skb->tc_verd);
-
-		skb->tc_verd = 0;
-		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
-		stats->tx_packets++;
-		stats->tx_bytes +=skb->len;
-
-		skb->dev = dev_get_by_index(&init_net, skb->iif);
-		if (!skb->dev) {
-			dev_kfree_skb(skb);
-			stats->tx_dropped++;
+		finish_wait(&dp->wq, &wait);
+		if (kthread_should_stop())
 			break;
-		}
-		dev_put(skb->dev);
-		skb->iif = _dev->ifindex;
-
-		if (from & AT_EGRESS) {
-			dp->st_rx_frm_egr++;
-			dev_queue_xmit(skb);
-		} else if (from & AT_INGRESS) {
-			dp->st_rx_frm_ing++;
-			skb_pull(skb, skb->dev->hard_header_len);
-			netif_rx(skb);
-		} else
-			BUG();
-	}
 
-	if (__netif_tx_trylock(txq)) {
-		dp->st_rxq_check++;
-		if ((skb = skb_peek(&dp->rq)) == NULL) {
-			dp->tasklet_pending = 0;
-			if (netif_queue_stopped(_dev))
-				netif_wake_queue(_dev);
-		} else {
-			dp->st_rxq_rsch++;
-			__netif_tx_unlock(txq);
-			goto resched;
+		/* transfer packets */
+		while ((skb = skb_dequeue(&dp->tq)) != NULL) {
+			u32 from = G_TC_FROM(skb->tc_verd);
+	
+			skb->tc_verd = 0;
+			skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
+			stats->tx_packets++;
+			stats->tx_bytes +=skb->len;
+	
+			skb->dev = dev_get_by_index(&init_net, skb->iif);
+			if (!skb->dev) {
+				dev_kfree_skb(skb);
+				stats->tx_dropped++;
+				break;
+			}
+			dev_put(skb->dev);
+			skb->iif = dev->ifindex;
+	
+			if (from & AT_EGRESS) {
+				dev_queue_xmit(skb);
+			} else if (from & AT_INGRESS) {
+				skb_pull(skb, skb->dev->hard_header_len);
+				netif_rx_ni(skb);
+			} else
+				BUG();
 		}
-		__netif_tx_unlock(txq);
-	} else {
-resched:
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
 	}
 
-}
-
-static const struct net_device_ops ifb_netdev_ops = {
-	.ndo_open	= ifb_open,
-	.ndo_stop	= ifb_close,
-	.ndo_start_xmit	= ifb_xmit,
-	.ndo_validate_addr = eth_validate_addr,
-};
-
-static void ifb_setup(struct net_device *dev)
-{
-	/* Initialize the device structure. */
-	dev->destructor = free_netdev;
-	dev->netdev_ops = &ifb_netdev_ops;
-
-	/* Fill in device structure with ethernet-generic values. */
-	ether_setup(dev);
-	dev->tx_queue_len = TX_Q_LIMIT;
-
-	dev->flags |= IFF_NOARP;
-	dev->flags &= ~IFF_MULTICAST;
-	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
-	random_ether_addr(dev->dev_addr);
+	return 0;
 }
 
 static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct ifb_private *dp = netdev_priv(dev);
 	struct net_device_stats *stats = &dev->stats;
 	u32 from = G_TC_FROM(skb->tc_verd);
+	int num = skb_get_queue_mapping(skb);
+	struct ifb_private *dp = ((struct ifb_private*)netdev_priv(dev)) + num;
 
 	stats->rx_packets++;
 	stats->rx_bytes+=skb->len;
@@ -182,10 +144,8 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	dev->trans_start = jiffies;
 	skb_queue_tail(&dp->rq, skb);
-	if (!dp->tasklet_pending) {
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
-	}
+	if (skb_queue_len(&dp->rq) == 1)
+		wake_up(&dp->wq);
 
 	return NETDEV_TX_OK;
 }
@@ -193,26 +153,132 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 static int ifb_close(struct net_device *dev)
 {
 	struct ifb_private *dp = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		kthread_stop(dp[i].task);
+		skb_queue_purge(&dp[i].tq);
+		skb_queue_purge(&dp[i].rq);
+	}
 
-	tasklet_kill(&dp->ifb_tasklet);
 	netif_stop_queue(dev);
-	skb_queue_purge(&dp->rq);
-	skb_queue_purge(&dp->tq);
+
 	return 0;
 }
 
 static int ifb_open(struct net_device *dev)
 {
 	struct ifb_private *dp = netdev_priv(dev);
+	int i;
+	
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		dp[i].dev = dev;
+		skb_queue_head_init(&dp[i].rq);
+		skb_queue_head_init(&dp[i].tq);
+		init_waitqueue_head(&dp[i].wq);
+		dp[i].task = kthread_run(ifb_thread, &dp[i], "%s/%d", dev->name,
+					i);
+		if (IS_ERR(dp[i].task)) {
+			int err = PTR_ERR(dp[i].task);
+			while (--i >= 0)
+				kthread_stop(dp[i].task);
+			return err;
+		}
+	}
 
-	tasklet_init(&dp->ifb_tasklet, ri_tasklet, (unsigned long)dev);
-	skb_queue_head_init(&dp->rq);
-	skb_queue_head_init(&dp->tq);
 	netif_start_queue(dev);
 
 	return 0;
 }
 
+static u32 simple_tx_hashrnd;
+
+static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	u32 addr1, addr2;
+	u32 hash, ihl;
+	union {
+		u16 in16[2];
+		u32 in32;
+	} ports;
+	u8 ip_proto;
+
+	if ((hash = skb_rx_queue_recorded(skb))) {
+		while (hash >= dev->real_num_tx_queues)
+			hash -= dev->real_num_tx_queues;
+		return hash;
+	}
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
+			ip_proto = ip_hdr(skb)->protocol;
+		else
+			ip_proto = 0;
+		addr1 = ip_hdr(skb)->saddr;
+		addr2 = ip_hdr(skb)->daddr;
+		ihl = ip_hdr(skb)->ihl << 2;
+		break;
+	case __constant_htons(ETH_P_IPV6):
+		ip_proto = ipv6_hdr(skb)->nexthdr;
+		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
+		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
+		ihl = 10;
+		break;
+	default:
+		return 0;
+	}
+	if (addr1 > addr2)
+		swap(addr1, addr2);
+
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE:
+		ports.in32 = *((u32 *) (skb_network_header(skb) + ihl));
+		if (ports.in16[0] > ports.in16[1])
+			swap(ports.in16[0], ports.in16[1]);
+		break;
+
+	default:
+		ports.in32 = 0;
+		break;
+	}
+
+	hash = jhash_3words(addr1, addr2, ports.in32,
+			    simple_tx_hashrnd ^ ip_proto);
+
+	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
+}
+
+static const struct net_device_ops ifb_netdev_ops = {
+	.ndo_open		= ifb_open,
+	.ndo_stop		= ifb_close,
+	.ndo_start_xmit		= ifb_xmit,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_select_queue	= ifb_select_queue,
+};
+
+static void ifb_setup(struct net_device *dev)
+{
+	/* Initialize the device structure. */
+	dev->destructor = free_netdev;
+	dev->netdev_ops = &ifb_netdev_ops;
+
+	/* Fill in device structure with ethernet-generic values. */
+	ether_setup(dev);
+	dev->tx_queue_len = TX_Q_LIMIT;
+
+	dev->flags |= IFF_NOARP;
+	dev->flags &= ~IFF_MULTICAST;
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	random_ether_addr(dev->dev_addr);
+}
+
 static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 {
 	if (tb[IFLA_ADDRESS]) {
@@ -231,17 +297,13 @@ static struct rtnl_link_ops ifb_link_ops __read_mostly = {
 	.validate	= ifb_validate,
 };
 
-/* Number of ifb devices to be set up by this module. */
-module_param(numifbs, int, 0);
-MODULE_PARM_DESC(numifbs, "Number of ifb devices");
-
 static int __init ifb_init_one(int index)
 {
 	struct net_device *dev_ifb;
 	int err;
 
-	dev_ifb = alloc_netdev(sizeof(struct ifb_private),
-				 "ifb%d", ifb_setup);
+	dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private) * numtxqs, "ifb%d",
+				  ifb_setup, numtxqs);
 
 	if (!dev_ifb)
 		return -ENOMEM;
@@ -266,6 +328,7 @@ static int __init ifb_init_module(void)
 {
 	int i, err;
 
+	get_random_bytes(&simple_tx_hashrnd, 4);
 	rtnl_lock();
 	err = __rtnl_link_register(&ifb_link_ops);
 




* Re: [PATCH] ifb: add multi-queue support
  2009-11-10  8:30 [PATCH] ifb: add multi-queue support Changli Gao
@ 2009-11-10  9:07 ` Eric Dumazet
  2009-11-10  9:43   ` Changli Gao
  2009-11-10 10:29 ` Patrick McHardy
  1 sibling, 1 reply; 62+ messages in thread
From: Eric Dumazet @ 2009-11-10  9:07 UTC (permalink / raw)
  To: xiaosuo; +Cc: David S. Miller, netdev, Tom Herbert

Changli Gao wrote:
> ifb: add multi-queue support
> 
> Add multi-queue support; one kernel thread is created per queue.
> It can be used to emulate a multi-queue NIC in software and to distribute
> work among CPUs.
> gentux linux # modprobe ifb numtxqs=2
> gentux linux # ifconfig ifb0 up
> gentux linux # pgrep ifb0
> 18508
> 18509
> gentux linux # taskset -p 1 18508
> pid 18508's current affinity mask: 3
> pid 18508's new affinity mask: 1
> gentux linux # taskset -p 2 18509
> pid 18509's current affinity mask: 3
> pid 18509's new affinity mask: 2
> gentux linux # tc qdisc add dev br0 ingress
> gentux linux # tc filter add dev br0 parent ffff: protocol ip basic action mirred egress redirect dev ifb0

Seems pretty cool!

I have some comments:

> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ---
> drivers/net/ifb.c | 309 ++++++++++++++++++++++++++++++++----------------------
> 1 file changed, 186 insertions(+), 123 deletions(-)
> 
> diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
> index 030913f..6e04188 100644
> --- a/drivers/net/ifb.c
> +++ b/drivers/net/ifb.c
> @@ -33,139 +33,101 @@
>  #include <linux/etherdevice.h>
>  #include <linux/init.h>
>  #include <linux/moduleparam.h>
> +#include <linux/wait.h>
> +#include <linux/sched.h>
> +#include <linux/kthread.h>
> +#include <linux/ip.h>
> +#include <linux/ipv6.h>
> +#include <net/ip.h>
>  #include <net/pkt_sched.h>
>  #include <net/net_namespace.h>
>  
> -#define TX_TIMEOUT  (2*HZ)
> -
>  #define TX_Q_LIMIT    32
> +
>  struct ifb_private {
> -	struct tasklet_struct   ifb_tasklet;
> -	int     tasklet_pending;
> -	/* mostly debug stats leave in for now */
> -	unsigned long   st_task_enter; /* tasklet entered */
> -	unsigned long   st_txq_refl_try; /* transmit queue refill attempt */
> -	unsigned long   st_rxq_enter; /* receive queue entered */
> -	unsigned long   st_rx2tx_tran; /* receive to trasmit transfers */
> -	unsigned long   st_rxq_notenter; /*receiveQ not entered, resched */
> -	unsigned long   st_rx_frm_egr; /* received from egress path */
> -	unsigned long   st_rx_frm_ing; /* received from ingress path */
> -	unsigned long   st_rxq_check;
> -	unsigned long   st_rxq_rsch;
> -	struct sk_buff_head     rq;
> -	struct sk_buff_head     tq;
> +	struct net_device	*dev;
> +	struct sk_buff_head	rq;
> +	struct sk_buff_head	tq;
> +	wait_queue_head_t	wq;
> +	struct task_struct	*task;
>  };

Maybe you should allocate a true per_cpu structure, to avoid cache line
sharing and to get appropriate NUMA properties.

At least use a __cacheline_aligned_in_smp ...
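
e.g. something like this (untested sketch), so each queue's private data
starts on its own cache line:

struct ifb_private {
	struct net_device	*dev;
	struct sk_buff_head	rq;
	struct sk_buff_head	tq;
	wait_queue_head_t	wq;
	struct task_struct	*task;
} ____cacheline_aligned_in_smp;	/* avoid false sharing between queues */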

>  
> +/* Number of ifb devices to be set up by this module. */
>  static int numifbs = 2;
> +module_param(numifbs, int, 0444);
> +MODULE_PARM_DESC(numifbs, "Number of ifb devices");
>  
> -static void ri_tasklet(unsigned long dev);
> -static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
> -static int ifb_open(struct net_device *dev);
> -static int ifb_close(struct net_device *dev);
> +/* Number of TX queues per ifb */
> +static int numtxqs = 1;
> +module_param(numtxqs, int, 0444);
> +MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
>  
> -static void ri_tasklet(unsigned long dev)
> +static int ifb_thread(void *priv)
>  {
> -
> -	struct net_device *_dev = (struct net_device *)dev;
> -	struct ifb_private *dp = netdev_priv(_dev);
> -	struct net_device_stats *stats = &_dev->stats;
> -	struct netdev_queue *txq;

> +	struct ifb_private *dp = (struct ifb_private*)priv;

A space is required before * : (struct ifb_private *)priv;
(in many places in your patch)

> +	struct net_device *dev = dp->dev;


> +	struct net_device_stats *stats = &dev->stats;

Here you use a net_device_stats that is shared by all your queues,
so your updates won't be protected and some will be lost.
You should use the txq->tx_* counters.

> +	unsigned int num = dp - (struct ifb_private*)netdev_priv(dev);
> +	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
>  	struct sk_buff *skb;
> -
> -	txq = netdev_get_tx_queue(_dev, 0);
> -	dp->st_task_enter++;
> -	if ((skb = skb_peek(&dp->tq)) == NULL) {
> -		dp->st_txq_refl_try++;
> -		if (__netif_tx_trylock(txq)) {
> -			dp->st_rxq_enter++;
> -			while ((skb = skb_dequeue(&dp->rq)) != NULL) {
> +	DEFINE_WAIT(wait);
> +
> +	while (1) {
> +		/* move skb from rq to tq */
> +		while (1) {
> +			prepare_to_wait(&dp->wq, &wait, TASK_UNINTERRUPTIBLE);
> +			while (!__netif_tx_trylock(txq))
> +				yield();
> +			while ((skb = skb_dequeue(&dp->rq)) != NULL)
>  				skb_queue_tail(&dp->tq, skb);
> -				dp->st_rx2tx_tran++;
> -			}
> +			if (netif_queue_stopped(dev))
> +				netif_wake_queue(dev);
>  			__netif_tx_unlock(txq);
> -		} else {
> -			/* reschedule */
> -			dp->st_rxq_notenter++;
> -			goto resched;
> +			if (kthread_should_stop() || !skb_queue_empty(&dp->tq))
> +				break;
> +			schedule();
>  		}
> -	}
> -
> -	while ((skb = skb_dequeue(&dp->tq)) != NULL) {
> -		u32 from = G_TC_FROM(skb->tc_verd);
> -
> -		skb->tc_verd = 0;
> -		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
> -		stats->tx_packets++;
> -		stats->tx_bytes +=skb->len;
> -
> -		skb->dev = dev_get_by_index(&init_net, skb->iif);
> -		if (!skb->dev) {
> -			dev_kfree_skb(skb);
> -			stats->tx_dropped++;
> +		finish_wait(&dp->wq, &wait);
> +		if (kthread_should_stop())
>  			break;
> -		}
> -		dev_put(skb->dev);
> -		skb->iif = _dev->ifindex;
> -
> -		if (from & AT_EGRESS) {
> -			dp->st_rx_frm_egr++;
> -			dev_queue_xmit(skb);
> -		} else if (from & AT_INGRESS) {
> -			dp->st_rx_frm_ing++;
> -			skb_pull(skb, skb->dev->hard_header_len);
> -			netif_rx(skb);
> -		} else
> -			BUG();
> -	}
>  
> -	if (__netif_tx_trylock(txq)) {
> -		dp->st_rxq_check++;
> -		if ((skb = skb_peek(&dp->rq)) == NULL) {
> -			dp->tasklet_pending = 0;
> -			if (netif_queue_stopped(_dev))
> -				netif_wake_queue(_dev);
> -		} else {
> -			dp->st_rxq_rsch++;
> -			__netif_tx_unlock(txq);
> -			goto resched;
> +		/* transfer packets */
> +		while ((skb = skb_dequeue(&dp->tq)) != NULL) {
> +			u32 from = G_TC_FROM(skb->tc_verd);
> +	
> +			skb->tc_verd = 0;
> +			skb->tc_verd = SET_TC_NCLS(skb->tc_verd);

> +			stats->tx_packets++;
> +			stats->tx_bytes +=skb->len;

Use :			txq->tx_packets++
			txq->tx_bytes += skb->len;
> +	
> +			skb->dev = dev_get_by_index(&init_net, skb->iif);
> +			if (!skb->dev) {
> +				dev_kfree_skb(skb);

> +				stats->tx_dropped++;
			txq->tx_dropped ?

> +				break;
> +			}
> +			dev_put(skb->dev);
> +			skb->iif = dev->ifindex;
> +	
> +			if (from & AT_EGRESS) {
> +				dev_queue_xmit(skb);
> +			} else if (from & AT_INGRESS) {
> +				skb_pull(skb, skb->dev->hard_header_len);
> +				netif_rx_ni(skb);
> +			} else
> +				BUG();
>  		}
> -		__netif_tx_unlock(txq);
> -	} else {
> -resched:
> -		dp->tasklet_pending = 1;
> -		tasklet_schedule(&dp->ifb_tasklet);
>  	}
>  
> -}
> -
> -static const struct net_device_ops ifb_netdev_ops = {
> -	.ndo_open	= ifb_open,
> -	.ndo_stop	= ifb_close,
> -	.ndo_start_xmit	= ifb_xmit,
> -	.ndo_validate_addr = eth_validate_addr,
> -};
> -
> -static void ifb_setup(struct net_device *dev)
> -{
> -	/* Initialize the device structure. */
> -	dev->destructor = free_netdev;
> -	dev->netdev_ops = &ifb_netdev_ops;
> -
> -	/* Fill in device structure with ethernet-generic values. */
> -	ether_setup(dev);
> -	dev->tx_queue_len = TX_Q_LIMIT;
> -
> -	dev->flags |= IFF_NOARP;
> -	dev->flags &= ~IFF_MULTICAST;
> -	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
> -	random_ether_addr(dev->dev_addr);
> +	return 0;
>  }
>  
>  static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
> -	struct ifb_private *dp = netdev_priv(dev);
>  	struct net_device_stats *stats = &dev->stats;
>  	u32 from = G_TC_FROM(skb->tc_verd);
> +	int num = skb_get_queue_mapping(skb);
> +	struct ifb_private *dp = ((struct ifb_private*)netdev_priv(dev)) + num;
>  

>  	stats->rx_packets++;
>  	stats->rx_bytes+=skb->len;

Not sure how to solve this problem (several cpus can update the counters in parallel)

Thanks


* Re: [PATCH] ifb: add multi-queue support
  2009-11-10  9:07 ` Eric Dumazet
@ 2009-11-10  9:43   ` Changli Gao
  2009-11-10 10:57     ` Eric Dumazet
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-10  9:43 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev, Tom Herbert

2009/11/10 Eric Dumazet <eric.dumazet@gmail.com>:
>
> Not sure how to solve this problem (several cpus can update the counters in parallel)
>
Thanks, I'll follow your suggestions. I can maintain the counters per TX
queue, and either update them in a timer handler like ixgbe does, or
implement our own struct net_device_stats *(*ndo_get_stats)(struct
net_device *dev) and update the counters when it gets called.
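
A rough, untested sketch of the ndo_get_stats variant (assuming per-queue
rx counters are added to struct ifb_private):

static struct net_device_stats *ifb_get_stats(struct net_device *dev)
{
	struct net_device_stats *stats = &dev->stats;
	struct ifb_private *dp = netdev_priv(dev);
	unsigned long rx_packets = 0, rx_bytes = 0;
	int i;

	/* fold the per-queue counters into dev->stats on demand */
	for (i = 0; i < dev->real_num_tx_queues; i++) {
		rx_packets += dp[i].rx_packets;
		rx_bytes += dp[i].rx_bytes;
	}
	stats->rx_packets = rx_packets;
	stats->rx_bytes = rx_bytes;

	return stats;
}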



-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-10  8:30 [PATCH] ifb: add multi-queue support Changli Gao
  2009-11-10  9:07 ` Eric Dumazet
@ 2009-11-10 10:29 ` Patrick McHardy
  2009-11-10 10:48   ` Changli Gao
  1 sibling, 1 reply; 62+ messages in thread
From: Patrick McHardy @ 2009-11-10 10:29 UTC (permalink / raw)
  To: xiaosuo; +Cc: David S. Miller, netdev, Tom Herbert

Changli Gao wrote:
> ifb: add multi-queue support
> 
> Add multi-queue support; one kernel thread is created per queue.
> It can be used to emulate a multi-queue NIC in software and to distribute
> work among CPUs.
> gentux linux # modprobe ifb numtxqs=2
> gentux linux # ifconfig ifb0 up
> gentux linux # pgrep ifb0
> 18508
> 18509
> gentux linux # taskset -p 1 18508
> pid 18508's current affinity mask: 3
> pid 18508's new affinity mask: 1
> gentux linux # taskset -p 2 18509
> pid 18509's current affinity mask: 3
> pid 18509's new affinity mask: 2
> gentux linux # tc qdisc add dev br0 ingress
> gentux linux # tc filter add dev br0 parent ffff: protocol ip basic action mirred egress redirect dev ifb0

I'm not sure how this will help, ifb device transmission is
still serialized by the ingress queue lock and the mirred lock.


* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 10:29 ` Patrick McHardy
@ 2009-11-10 10:48   ` Changli Gao
  2009-11-10 10:55     ` Eric Dumazet
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-10 10:48 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, netdev, Tom Herbert

On Tue, Nov 10, 2009 at 6:29 PM, Patrick McHardy <kaber@trash.net> wrote:
>
> I'm not sure how this will help, ifb device transmission is
> still serialized by the ingress queue lock and the mirred lock.
>

But not in ifb_thread. For a firewall, the main work happens after
netif_rx_ni(). For server applications, packets belonging to the same
flow are always routed to the same CPU, so the applications can
benefit from better locality.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 10:48   ` Changli Gao
@ 2009-11-10 10:55     ` Eric Dumazet
  0 siblings, 0 replies; 62+ messages in thread
From: Eric Dumazet @ 2009-11-10 10:55 UTC (permalink / raw)
  To: Changli Gao; +Cc: Patrick McHardy, David S. Miller, netdev, Tom Herbert

Changli Gao wrote:
> On Tue, Nov 10, 2009 at 6:29 PM, Patrick McHardy <kaber@trash.net> wrote:
>> I'm not sure how this will help, ifb device transmission is
>> still serialized by the ingress queue lock and the mirred lock.
>>
> 
> But not in ifb_thread. For a firewall, the main work happens after
> netif_rx_ni(). For server applications, packets belonging to the same
> flow are always routed to the same CPU, so the applications can
> benefit from better locality.
> 

Hmm, you know many cache lines were already brought into the cpu receiving
the original softirq? But yes, this is a possible way to go / try :)

Please submit your future patch on top of net-next-2.6, because
I see you still use dev_get_by_index() :-(

commit 05e8689c9a3a208bf75b60662778d81e23eac460
Author: Eric Dumazet <eric.dumazet@gmail.com>
Date:   Sun Nov 1 19:45:16 2009 +0000

    ifb: RCU locking avoids touching dev refcount
    
    Avoids touching dev refcount in hotpath
    
    Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

commit db519144243de6b17ff0c56c26f06059743110a7
Author: Eric Dumazet <eric.dumazet@gmail.com>
Date:   Tue Oct 20 02:35:50 2009 +0000

    ifb: should not use __dev_get_by_index() without locks
    
    At this point (ri_tasklet()), RTNL or dev_base_lock are not held,
    we must use dev_get_by_index() instead of __dev_get_by_index()
    
    Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>
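
With those two changes, the lookup in ifb_thread() can be done under RCU
without touching the refcount, something like this (sketch):

	rcu_read_lock();
	skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
	if (!skb->dev) {
		rcu_read_unlock();
		dev_kfree_skb(skb);	/* original device is gone, drop */
		break;
	}
	rcu_read_unlock();
	/* no dev_hold()/dev_put() pair is needed under rcu_read_lock() */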




* Re: [PATCH] ifb: add multi-queue support
  2009-11-10  9:43   ` Changli Gao
@ 2009-11-10 10:57     ` Eric Dumazet
  2009-11-10 11:14       ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Eric Dumazet @ 2009-11-10 10:57 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netdev, Tom Herbert

Changli Gao wrote:
> 2009/11/10 Eric Dumazet <eric.dumazet@gmail.com>:
>> Not sure how to solve this problem (several cpus can update the counters in parallel)
>>
> Thanks, I'll follow your suggestions. I can maintain the counters per TX
> queue, and either update them in a timer handler like ixgbe does, or
> implement our own struct net_device_stats *(*ndo_get_stats)(struct
> net_device *dev) and update the counters when it gets called.

Please no timer stuff :)



* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 10:57     ` Eric Dumazet
@ 2009-11-10 11:14       ` Changli Gao
  2009-11-10 11:41         ` Patrick McHardy
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-10 11:14 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev, Tom Herbert

[-- Attachment #1: Type: text/plain, Size: 643 bytes --]

2009/11/10 Eric Dumazet <eric.dumazet@gmail.com>:
> Changli Gao wrote:
>> 2009/11/10 Eric Dumazet <eric.dumazet@gmail.com>:
>>> Not sure how to solve this problem (several cpus can update the counters in parallel)
>>>
>> Thanks, I'll follow your suggestions. I can maintain the counters per TX
>> queue, and either update them in a timer handler like ixgbe does, or
>> implement our own struct net_device_stats *(*ndo_get_stats)(struct
>> net_device *dev) and update the counters when it gets called.
>
> Please no timer stuff :)

The whole ifb.c file is attached, please review and test it. Thanks!


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

[-- Attachment #2: ifb.c --]
[-- Type: application/octet-stream, Size: 9063 bytes --]

/* drivers/net/ifb.c:

	The purpose of this driver is to provide a device that allows
	for sharing of resources:

	1) qdiscs/policies that are per device as opposed to system wide.
	ifb allows for a device which can be redirected to thus providing
	an impression of sharing.

	2) Allows for queueing incoming traffic for shaping instead of
	dropping.

	The original concept is based on what is known as the IMQ
	driver initially written by Martin Devera, later rewritten
	by Patrick McHardy and then maintained by Andre Correa.

	You need the tc action  mirror or redirect to feed this device
       	packets.

	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License
	as published by the Free Software Foundation; either version
	2 of the License, or (at your option) any later version.

  	Authors:	Jamal Hadi Salim (2005)

*/


#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <net/ip.h>
#include <net/pkt_sched.h>
#include <net/net_namespace.h>

#define TX_Q_LIMIT    32

struct ifb_private {
	struct net_device	*dev;
	struct sk_buff_head	rq;
	struct sk_buff_head	tq;
	wait_queue_head_t	wq;
	struct task_struct	*task;
	unsigned long		rx_packets;
	unsigned long		rx_bytes;
	unsigned long		rx_dropped;
} ____cacheline_aligned_in_smp;

/* Number of ifb devices to be set up by this module. */
static int numifbs = 2;
module_param(numifbs, int, 0444);
MODULE_PARM_DESC(numifbs, "Number of ifb devices");

/* Number of TX queues per ifb */
static int numtxqs = 1;
module_param(numtxqs, int, 0444);
MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");

static int ifb_thread(void *priv)
{
	struct ifb_private *dp = (struct ifb_private *)priv;
	struct net_device *dev = dp->dev;
	unsigned int num = dp - (struct ifb_private *)netdev_priv(dev);
	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
	struct sk_buff *skb;
	DEFINE_WAIT(wait);

	while (1) {
		/* move skb from rq to tq */
		while (1) {
			prepare_to_wait(&dp->wq, &wait, TASK_UNINTERRUPTIBLE);
			while (!__netif_tx_trylock(txq))
				yield();
			while ((skb = skb_dequeue(&dp->rq)) != NULL)
				skb_queue_tail(&dp->tq, skb);
			if (netif_queue_stopped(dev))
				netif_wake_queue(dev);
			__netif_tx_unlock(txq);
			if (kthread_should_stop() || !skb_queue_empty(&dp->tq))
				break;
			schedule();
		}
		finish_wait(&dp->wq, &wait);
		if (kthread_should_stop())
			break;

		/* transfer packets */
		while ((skb = skb_dequeue(&dp->tq)) != NULL) {
			u32 from = G_TC_FROM(skb->tc_verd);
	
			skb->tc_verd = 0;
			skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
			txq->tx_packets++;
			txq->tx_bytes +=skb->len;
	
			rcu_read_lock();
			skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
			if (!skb->dev) {
				rcu_read_unlock();
				dev_kfree_skb(skb);
				txq->tx_dropped++;
				break;
			}
			rcu_read_unlock();
			skb->iif = dev->ifindex;
	
			if (from & AT_EGRESS) {
				dev_queue_xmit(skb);
			} else if (from & AT_INGRESS) {
				skb_pull(skb, skb->dev->hard_header_len);
				netif_rx_ni(skb);
			} else
				BUG();
		}
	}

	return 0;
}

struct net_device_stats* ifb_get_stats(struct net_device *dev)
{
	struct net_device_stats *stats = &dev->stats;
	struct ifb_private *dp = netdev_priv(dev);
	struct netdev_queue *txq;
	int i;
	unsigned long rx_packets = 0, rx_bytes = 0, rx_dropped = 0;
	unsigned long tx_packets = 0, tx_bytes = 0, tx_dropped = 0;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		rx_packets += dp[i].rx_packets;
		rx_bytes += dp[i].rx_bytes;
		rx_dropped += dp[i].rx_dropped;
		txq = netdev_get_tx_queue(dev, i);
		tx_packets = txq->tx_packets;
		tx_bytes = txq->tx_bytes;
		tx_dropped += txq->tx_dropped;
	}

	stats->rx_packets = rx_packets;
	stats->rx_bytes = rx_bytes;
	stats->rx_dropped = rx_dropped;
	stats->tx_packets = tx_packets;
	stats->tx_bytes = tx_bytes;
	stats->tx_dropped = tx_dropped;

	return stats;
}

static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
{
	u32 from = G_TC_FROM(skb->tc_verd);
	int num = skb_get_queue_mapping(skb);
	struct ifb_private *dp = ((struct ifb_private *)netdev_priv(dev)) + num;
	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);

	dp->rx_packets++;
	dp->rx_bytes+=skb->len;

	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->iif) {
		dev_kfree_skb(skb);
		dp->rx_dropped++;
		return NETDEV_TX_OK;
	}

	if (skb_queue_len(&dp->rq) >= dev->tx_queue_len) {
		netif_stop_queue(dev);
	}

	txq->trans_start = jiffies;
	skb_queue_tail(&dp->rq, skb);
	if (skb_queue_len(&dp->rq) == 1)
		wake_up(&dp->wq);

	return NETDEV_TX_OK;
}

static int ifb_close(struct net_device *dev)
{
	struct ifb_private *dp = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		kthread_stop(dp[i].task);
		skb_queue_purge(&dp[i].tq);
		skb_queue_purge(&dp[i].rq);
	}

	netif_stop_queue(dev);

	return 0;
}

static int ifb_open(struct net_device *dev)
{
	struct ifb_private *dp = netdev_priv(dev);
	int i;
	
	for (i = 0; i < dev->real_num_tx_queues; i++) {
		dp[i].dev = dev;
		skb_queue_head_init(&dp[i].rq);
		skb_queue_head_init(&dp[i].tq);
		init_waitqueue_head(&dp[i].wq);
		dp[i].task = kthread_run(ifb_thread, &dp[i], "%s/%d", dev->name,
					i);
		if (IS_ERR(dp[i].task)) {
			int err = PTR_ERR(dp[i].task);
			while (--i >= 0)
				kthread_stop(dp[i].task);
			return err;
		}
	}

	netif_start_queue(dev);

	return 0;
}

static u32 simple_tx_hashrnd;

static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	u32 addr1, addr2;
	u32 hash, ihl;
	union {
		u16 in16[2];
		u32 in32;
	} ports;
	u8 ip_proto;

	if ((hash = skb_rx_queue_recorded(skb))) {
		while (hash >= dev->real_num_tx_queues)
			hash -= dev->real_num_tx_queues;
		return hash;
	}

	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
			ip_proto = ip_hdr(skb)->protocol;
		else
			ip_proto = 0;
		addr1 = ip_hdr(skb)->saddr;
		addr2 = ip_hdr(skb)->daddr;
		ihl = ip_hdr(skb)->ihl << 2;
		break;
	case __constant_htons(ETH_P_IPV6):
		ip_proto = ipv6_hdr(skb)->nexthdr;
		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
		ihl = 10;
		break;
	default:
		return 0;
	}
	if (addr1 > addr2)
		swap(addr1, addr2);

	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_AH:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
		ports.in32 = *((u32 *) (skb_network_header(skb) + ihl));
		if (ports.in16[0] > ports.in16[1])
			swap(ports.in16[0], ports.in16[1]);
		break;

	default:
		ports.in32 = 0;
		break;
	}

	hash = jhash_3words(addr1, addr2, ports.in32,
			    simple_tx_hashrnd ^ ip_proto);

	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}

static const struct net_device_ops ifb_netdev_ops = {
	.ndo_open		= ifb_open,
	.ndo_stop		= ifb_close,
	.ndo_start_xmit		= ifb_xmit,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_select_queue	= ifb_select_queue,
	.ndo_get_stats		= ifb_get_stats,
};

static void ifb_setup(struct net_device *dev)
{
	/* Initialize the device structure. */
	dev->destructor = free_netdev;
	dev->netdev_ops = &ifb_netdev_ops;

	/* Fill in device structure with ethernet-generic values. */
	ether_setup(dev);
	dev->tx_queue_len = TX_Q_LIMIT;

	dev->flags |= IFF_NOARP;
	dev->flags &= ~IFF_MULTICAST;
	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
	random_ether_addr(dev->dev_addr);
}

static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	return 0;
}

static struct rtnl_link_ops ifb_link_ops __read_mostly = {
	.kind		= "ifb",
	.priv_size	= sizeof(struct ifb_private),
	.setup		= ifb_setup,
	.validate	= ifb_validate,
};

static int __init ifb_init_one(int index)
{
	struct net_device *dev_ifb;
	int err;

	dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private) * numtxqs, "ifb%d",
				  ifb_setup, numtxqs);

	if (!dev_ifb)
		return -ENOMEM;

	err = dev_alloc_name(dev_ifb, dev_ifb->name);
	if (err < 0)
		goto err;

	dev_ifb->rtnl_link_ops = &ifb_link_ops;
	err = register_netdevice(dev_ifb);
	if (err < 0)
		goto err;

	return 0;

err:
	free_netdev(dev_ifb);
	return err;
}

static int __init ifb_init_module(void)
{
	int i, err;

	get_random_bytes(&simple_tx_hashrnd, 4);
	rtnl_lock();
	err = __rtnl_link_register(&ifb_link_ops);

	for (i = 0; i < numifbs && !err; i++)
		err = ifb_init_one(i);
	if (err)
		__rtnl_link_unregister(&ifb_link_ops);
	rtnl_unlock();

	return err;
}

static void __exit ifb_cleanup_module(void)
{
	rtnl_link_unregister(&ifb_link_ops);
}

module_init(ifb_init_module);
module_exit(ifb_cleanup_module);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jamal Hadi Salim");
MODULE_ALIAS_RTNL_LINK("ifb");


* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 11:14       ` Changli Gao
@ 2009-11-10 11:41         ` Patrick McHardy
  2009-11-10 12:14           ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Patrick McHardy @ 2009-11-10 11:41 UTC (permalink / raw)
  To: Changli Gao; +Cc: Eric Dumazet, David S. Miller, netdev, Tom Herbert

Changli Gao wrote:
> 2009/11/10 Eric Dumazet <eric.dumazet@gmail.com>:
>> Changli Gao wrote:
>>> 2009/11/10 Eric Dumazet <eric.dumazet@gmail.com>:
>>>> Not sure how to solve this problem (several cpus can update the counters in parallel)
>>>>
>>> Thanks, I'll follow your suggestions. I can maintain the counters per TX
>>> queue, and either update them in a timer handler like ixgbe does, or
>>> implement our own struct net_device_stats *(*ndo_get_stats)(struct
>>> net_device *dev) and update the counters when it gets called.
>> Please no timer stuff :)
> 
> The whole ifb.c file is attached, please review and test it. Thanks!

> /* Number of TX queues per ifb */
> static int numtxqs = 1;
> module_param(numtxqs, int, 0444);
> MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");

Module parameters suck as a configuration API. The existing numifbs
option exists purely for compatibility reasons; I'd prefer if you'd
add this to the netlink interface.

> static int __init ifb_init_one(int index)
> {
> 	struct net_device *dev_ifb;
> 	int err;
> 
> 	dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private) * numtxqs, "ifb%d",
> 				  ifb_setup, numtxqs);
> 

This won't work for the rtnl_link setup since the size must
be constant.


* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 11:41         ` Patrick McHardy
@ 2009-11-10 12:14           ` Changli Gao
  2009-11-10 12:19             ` Patrick McHardy
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-10 12:14 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Eric Dumazet, David S. Miller, netdev, Tom Herbert

On Tue, Nov 10, 2009 at 7:41 PM, Patrick McHardy <kaber@trash.net> wrote:
> Changli Gao wrote:
>> 2009/11/10 Eric Dumazet <eric.dumazet@gmail.com>:
>>
>> The whole ifb.c file is attached, please review and test it. Thanks!
>
>> /* Number of TX queues per ifb */
>> static int numtxqs = 1;
>> module_param(numtxqs, int, 0444);
>> MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
>
> Module parameters suck as a configuration API. The existing numifbs
> option exists purely for compatibility reasons; I'd prefer if you'd
> add this to the netlink interface.

How do I do that? I haven't found any examples. Is there an interface to
configure the number of TX queues of a NIC?

>
>> static int __init ifb_init_one(int index)
>> {
>>       struct net_device *dev_ifb;
>>       int err;
>>
>>       dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private) * numtxqs, "ifb%d",
>>                                 ifb_setup, numtxqs);
>>
>
> This won't work for the rtnl_link setup since the size must
> be constant.
>

Does this work?

        ifb_link_ops.priv_size = sizeof(struct ifb_private) * numtxqs;
        rtnl_lock();
        err = __rtnl_link_register(&ifb_link_ops);

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 12:14           ` Changli Gao
@ 2009-11-10 12:19             ` Patrick McHardy
  2009-11-10 12:37               ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Patrick McHardy @ 2009-11-10 12:19 UTC (permalink / raw)
  To: Changli Gao; +Cc: Eric Dumazet, David S. Miller, netdev, Tom Herbert

Changli Gao wrote:
> On Tue, Nov 10, 2009 at 7:41 PM, Patrick McHardy <kaber@trash.net> wrote:
>> Changli Gao wrote:
>>> 2009/11/10 Eric Dumazet <eric.dumazet@gmail.com>:
>>>
>>> The whole ifb.c file is attached, please review and test it. Thanks!
>>> /* Number of TX queues per ifb */
>>> static int numtxqs = 1;
>>> module_param(numtxqs, int, 0444);
>>> MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
>> Module parameters suck as a configuration API. The existing numifbs
>> option exists purely for compatibility reasons; I'd prefer if you'd
>> add this to the netlink interface.
> 
> How do I do that? I haven't found any examples. Is there an interface to
> configure the number of TX queues of a NIC?

You have to add a get_tx_queues() callback to the rtnl_link_ops.
Additionally you need a new attribute (IFLA_NTXQ or something like
that) that contains the number of queues. The callback has to parse
the attribute and set the number of queues accordingly.

>>> static int __init ifb_init_one(int index)
>>> {
>>>       struct net_device *dev_ifb;
>>>       int err;
>>>
>>>       dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private) * numtxqs, "ifb%d",
>>>                                 ifb_setup, numtxqs);
>>>
>> This won't work for the rtnl_link setup since the size must
>> be constant.
>>
> 
> Does this work?
> 
>         ifb_link_ops.priv_size = sizeof(struct ifb_private) * numtxqs;
>         rtnl_lock();
>         err = __rtnl_link_register(&ifb_link_ops);

Only for the module parameter. For rtnl_link you need to either
allocate the private space separately or turn priv_size into
a callback that returns the required space based on the number
of queues.
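
E.g. a rough sketch of the separate allocation; ifb_private_q is just an
illustrative name for the per-queue state:

struct ifb_private_q {
	struct sk_buff_head	rq;
	struct sk_buff_head	tq;
	wait_queue_head_t	wq;
	struct task_struct	*task;
} ____cacheline_aligned_in_smp;

struct ifb_private {
	struct ifb_private_q	*pq;	/* array, one entry per TX queue */
};

static int ifb_init(struct net_device *dev)
{
	struct ifb_private *dp = netdev_priv(dev);

	dp->pq = kcalloc(dev->real_num_tx_queues, sizeof(*dp->pq),
			 GFP_KERNEL);
	return dp->pq ? 0 : -ENOMEM;
}

with .ndo_init = ifb_init hooked up, and a matching kfree(dp->pq) in
->ndo_uninit().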



* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 12:19             ` Patrick McHardy
@ 2009-11-10 12:37               ` Changli Gao
  2009-11-10 12:45                 ` Patrick McHardy
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-10 12:37 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Eric Dumazet, David S. Miller, netdev, Tom Herbert

On Tue, Nov 10, 2009 at 8:19 PM, Patrick McHardy <kaber@trash.net> wrote:
>
> You have to add a get_tx_queues() callback to the rtnl_link_ops.

It is used to get the current real_num_tx_queues and num_tx_queues,
not for setting them.

> Additionally you need a new attribute (IFLA_NTXQ or something like
> that) that contains the number of queues. The callback has to parse
> the attribute and set the number of queues accordingly.

It seems another patch is needed first.

>> Does this work?
>>
>>         ifb_link_ops.priv_size = sizeof(struct ifb_private) * numtxqs;
>>         rtnl_lock();
>>         err = __rtnl_link_register(&ifb_link_ops);
>
> Only for the module parameter. For rtnl_link you need to either
> allocate the private space seperately or turn priv_size into
> a callback that returns the required space based on the number
> of queues.

Do you mean that if the ifb module is loaded automatically, the parameters
won't be set correctly?

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 12:37               ` Changli Gao
@ 2009-11-10 12:45                 ` Patrick McHardy
  2009-11-10 13:06                   ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Patrick McHardy @ 2009-11-10 12:45 UTC (permalink / raw)
  To: Changli Gao; +Cc: Eric Dumazet, David S. Miller, netdev, Tom Herbert

Changli Gao wrote:
> On Tue, Nov 10, 2009 at 8:19 PM, Patrick McHardy <kaber@trash.net> wrote:
>   
>> You have to add a get_tx_queues() callback to the rtnl_link_ops.
>>     
>
> It is used to get the current real_num_tx_queues and num_tx_queues,
> not for setting them.
>   

Yes, it gets the number from a user-supplied parameter for device
allocation.

>> Additionally you need a new attribute (IFLA_NTXQ or something like
>> that) that contains the number of queues. The callback has to parse
>> the attribute and set the number of queues accordingly.
>>     
>
> It seems another patch is needed first.
>   

It's a trivial change; you can put them in the same patch:

static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
                             unsigned int *num_tx_queues,
                             unsigned int *real_num_tx_queues)
{
    unsigned int n = 1;

    if (tb[IFLA_NTXQ])
       n = nla_get_u32(tb[IFLA_NTXQ]);

    *num_tx_queues = n;
    *real_num_tx_queues = n;
    return 0;
}

>>> Does this work?
>>>
>>>         ifb_link_ops.priv_size = sizeof(struct ifb_private) * numtxqs;
>>>         rtnl_lock();
>>>         err = __rtnl_link_register(&ifb_link_ops);
>>>       
>> Only for the module parameter. For rtnl_link you need to either
>> allocate the private space separately or turn priv_size into
>> a callback that returns the required space based on the number
>> of queues.
>>     
>
> Do you mean that if the ifb module is loaded automatically, the parameters
> won't be set correctly?

No, I mean if you add a proper interface for this, you have to deal
with different interfaces using a different number of queues.


* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 12:45                 ` Patrick McHardy
@ 2009-11-10 13:06                   ` Changli Gao
  2009-11-10 13:34                     ` Eric Dumazet
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-10 13:06 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Eric Dumazet, David S. Miller, netdev, Tom Herbert

On Tue, Nov 10, 2009 at 8:45 PM, Patrick McHardy <kaber@trash.net> wrote:
>
> It's a trivial change; you can put them in the same patch:
>
> static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
>                             unsigned int *num_tx_queues,
>                             unsigned int *real_num_tx_queues)
> {
>    unsigned int n = 1;
>
>    if (tb[IFLA_NTXQ])
>       n = nla_get_u32(tb[IFLA_NTXQ]);
>
>    *num_tx_queues = n;
>    *real_num_tx_queues = n;
>    return 0;
> }
>

Has IFLA_NTXQ been added? I didn't find it in any branch. And I think both
num_tx_queues and real_num_tx_queues are needed.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 13:06                   ` Changli Gao
@ 2009-11-10 13:34                     ` Eric Dumazet
  2009-11-10 13:49                       ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Eric Dumazet @ 2009-11-10 13:34 UTC (permalink / raw)
  To: Changli Gao; +Cc: Patrick McHardy, David S. Miller, netdev, Tom Herbert

Changli Gao wrote:
> On Tue, Nov 10, 2009 at 8:45 PM, Patrick McHardy <kaber@trash.net> wrote:
>> It's a trivial change; you can put them in the same patch:
>>
>> static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
>>                             unsigned int *num_tx_queues,
>>                             unsigned int *real_num_tx_queues)
>> {
>>    unsigned int n = 1;
>>
>>    if (tb[IFLA_NTXQ])
>>       n = nla_get_u32(tb[IFLA_NTXQ]);
>>
>>    *num_tx_queues = n;
>>    *real_num_tx_queues = n;
>>    return 0;
>> }
>>
> 
> Has IFLA_NTXQ been added? I didn't find it in any branch. And I think both
> num_tx_queues and real_num_tx_queues are needed.
> 

I believe Patrick was referring to the vlan get_tx_queues() implementation,
but it actually gets the values from the real device (tb[IFLA_LINK]).

In your case you'll need to add a new IFLA_NTXQ attribute, and
change iproute2 to pass this new attribute at link creation.
(check include/linux/if_link.h)

ip link add link .....  ntxq 2

static int vlan_get_tx_queues(struct net *net,
                              struct nlattr *tb[],
                              unsigned int *num_tx_queues,
                              unsigned int *real_num_tx_queues)
{
        struct net_device *real_dev;

        if (!tb[IFLA_LINK])
                return -EINVAL;

        real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK]));
        if (!real_dev)
                return -ENODEV;

        *num_tx_queues      = real_dev->num_tx_queues;
        *real_num_tx_queues = real_dev->real_num_tx_queues;
        return 0;
}




* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 13:34                     ` Eric Dumazet
@ 2009-11-10 13:49                       ` Changli Gao
  2009-11-10 16:45                         ` Stephen Hemminger
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-10 13:49 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Patrick McHardy, David S. Miller, netdev, Tom Herbert

On Tue, Nov 10, 2009 at 9:34 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Changli Gao wrote:
>
> I believe Patrick was referring to vlan get_tx_queues() implementation,
> but it actually gets values from real device (tb[IFLA_LINK])
>
> In your case you'll need to add a new IFLA_NTXQ attribute, and
> change iproute2 to pass this new attribute at link creation.
> (check include/linux/if_link.h)
>
> ip link add link .....  ntxq 2
>
> static int vlan_get_tx_queues(struct net *net,
>                              struct nlattr *tb[],
>                              unsigned int *num_tx_queues,
>                              unsigned int *real_num_tx_queues)
> {
>        struct net_device *real_dev;
>
>        if (!tb[IFLA_LINK])
>                return -EINVAL;
>
>        real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK]));
>        if (!real_dev)
>                return -ENODEV;
>
>        *num_tx_queues      = real_dev->num_tx_queues;
>        *real_num_tx_queues = real_dev->real_num_tx_queues;
>        return 0;
> }
>
>
>

Got it, thanks. BTW: why not merge iproute2 into linux, just like perf?

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 13:49                       ` Changli Gao
@ 2009-11-10 16:45                         ` Stephen Hemminger
  2009-11-11  6:30                           ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2009-11-10 16:45 UTC (permalink / raw)
  To: Changli Gao
  Cc: Eric Dumazet, Patrick McHardy, David S. Miller, netdev, Tom Herbert

On Tue, 10 Nov 2009 21:49:31 +0800
Changli Gao <xiaosuo@gmail.com> wrote:

> On Tue, Nov 10, 2009 at 9:34 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > Changli Gao wrote:
> >
> > I believe Patrick was referring to vlan get_tx_queues() implementation,
> > but it actually gets values from real device (tb[IFLA_LINK])
> >
> > In your case you'll need to add a new IFLA_NTXQ attribute, and
> > change iproute2 to pass this new attribute at link creation.
> > (check include/linux/if_link.h)
> >
> > ip link add link .....  ntxq 2
> >
> > static int vlan_get_tx_queues(struct net *net,
> >                              struct nlattr *tb[],
> >                              unsigned int *num_tx_queues,
> >                              unsigned int *real_num_tx_queues)
> > {
> >        struct net_device *real_dev;
> >
> >        if (!tb[IFLA_LINK])
> >                return -EINVAL;
> >
> >        real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK]));
> >        if (!real_dev)
> >                return -ENODEV;
> >
> >        *num_tx_queues      = real_dev->num_tx_queues;
> >        *real_num_tx_queues = real_dev->real_num_tx_queues;
> >        return 0;
> > }
> >
> >
> >
> 
> Got it, thanks. BTW: why not merge iproute2 into linux, just like perf?
> 

Distros would find that hard; also it would cause people to forget about
binary API compatibility.

-- 


* Re: [PATCH] ifb: add multi-queue support
  2009-11-10 16:45                         ` Stephen Hemminger
@ 2009-11-11  6:30                           ` Changli Gao
  0 siblings, 0 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-11  6:30 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Patrick McHardy, David S. Miller, netdev, Tom Herbert

[-- Attachment #1: Type: text/plain, Size: 750 bytes --]

On Wed, Nov 11, 2009 at 12:45 AM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> On Tue, 10 Nov 2009 21:49:31 +0800
> Changli Gao <xiaosuo@gmail.com> wrote:
>
>
> Distros would find that hard; also it would cause people to forget about
> binary API compatibility.
>

Got it, thanks.

The new version of ifb is attached:
1. bugfix: a deadlock.
2. bugfix: statistics data corruption.
3. bugfix: didn't work with the ip link command.

Dear Patrick:

Support for specifying the number of TX queues with the ip link
command is coming soon. I do think the corresponding kernel module
parameter is still needed for the default value and for the ifbs created
at module load time, so I have preserved it.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

[-- Attachment #2: ifb.c --]
[-- Type: application/octet-stream, Size: 10030 bytes --]

/* drivers/net/ifb.c:

	The purpose of this driver is to provide a device that allows
	for sharing of resources:

	1) qdiscs/policies that are per device as opposed to system wide.
	ifb allows for a device which can be redirected to thus providing
	an impression of sharing.

	2) Allows for queueing incoming traffic for shaping instead of
	dropping.

	The original concept is based on what is known as the IMQ
	driver initially written by Martin Devera, later rewritten
	by Patrick McHardy and then maintained by Andre Correa.

	You need the tc action  mirror or redirect to feed this device
       	packets.

	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License
	as published by the Free Software Foundation; either version
	2 of the License, or (at your option) any later version.

  	Authors:	Jamal Hadi Salim (2005)

*/


#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <net/ip.h>
#include <net/pkt_sched.h>
#include <net/net_namespace.h>

#define TX_Q_LIMIT    32

struct ifb_private_q {
	struct net_device	*dev;
	struct sk_buff_head	rq;
	struct sk_buff_head	tq;
	wait_queue_head_t	wq;
	struct task_struct	*task;
	unsigned long		rx_packets;
	unsigned long		rx_bytes;
	unsigned long		rx_dropped;
} ____cacheline_aligned_in_smp;

struct ifb_private {
	struct ifb_private_q	*pq;
};

/* Number of ifb devices to be set up by this module. */
static int numifbs = 2;
module_param(numifbs, int, 0444);
MODULE_PARM_DESC(numifbs, "Number of ifb devices");

/* Number of TX queues per ifb */
static int numtxqs = 1;
module_param(numtxqs, int, 0444);
MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");

static int ifb_thread(void *priv)
{
	struct ifb_private_q *pq = priv;
	struct net_device *dev = pq->dev;
	int num = pq - ((struct ifb_private *)netdev_priv(dev))->pq;
	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
	struct sk_buff *skb;
	DEFINE_WAIT(wait);

	while (1) {
		/* move skb from rq to tq */
		while (1) {
			prepare_to_wait(&pq->wq, &wait, TASK_UNINTERRUPTIBLE);
			__netif_tx_lock_bh(txq);
			while ((skb = skb_dequeue(&pq->rq)) != NULL)
				skb_queue_tail(&pq->tq, skb);
			if (netif_queue_stopped(dev))
				netif_wake_queue(dev);
			__netif_tx_unlock_bh(txq);
			if (kthread_should_stop() || !skb_queue_empty(&pq->tq))
				break;
			schedule();
		}
		finish_wait(&pq->wq, &wait);
		if (kthread_should_stop())
			break;
		if (need_resched())
			schedule();

		/* transfer packets */
		while ((skb = skb_dequeue(&pq->tq)) != NULL) {
			u32 from = G_TC_FROM(skb->tc_verd);
	
			skb->tc_verd = 0;
			skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
			txq->tx_packets++;
			txq->tx_bytes +=skb->len;

			rcu_read_lock();
			skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
			if (!skb->dev) {
				rcu_read_unlock();
				dev_kfree_skb(skb);
				txq->tx_dropped++;
				break;
			}
			rcu_read_unlock();
			skb->iif = dev->ifindex;
	
			if (from & AT_EGRESS) {
				dev_queue_xmit(skb);
			} else if (from & AT_INGRESS) {
				skb_pull(skb, skb->dev->hard_header_len);
				netif_rx_ni(skb);
			} else
				BUG();
		}
	}

	return 0;
}

struct net_device_stats* ifb_get_stats(struct net_device *dev)
{
	struct net_device_stats *stats = &dev->stats;
	struct ifb_private *dp = netdev_priv(dev);
	struct ifb_private_q *pq = dp->pq;
	struct netdev_queue *txq;
	int i;
	unsigned long rx_packets = 0, rx_bytes = 0, rx_dropped = 0;
	unsigned long tx_packets = 0, tx_bytes = 0, tx_dropped = 0;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		rx_packets += pq[i].rx_packets;
		rx_bytes += pq[i].rx_bytes;
		rx_dropped += pq[i].rx_dropped;
		txq = netdev_get_tx_queue(dev, i);
		tx_packets += txq->tx_packets;
		tx_bytes += txq->tx_bytes;
		tx_dropped += txq->tx_dropped;
	}

	stats->rx_packets = rx_packets;
	stats->rx_bytes = rx_bytes;
	stats->rx_dropped = rx_dropped;
	stats->tx_packets = tx_packets;
	stats->tx_bytes = tx_bytes;
	stats->tx_dropped = tx_dropped;

	return stats;
}

static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
{
	u32 from = G_TC_FROM(skb->tc_verd);
	int num = skb_get_queue_mapping(skb);
	struct ifb_private *dp = netdev_priv(dev);
	struct ifb_private_q *pq = dp->pq + num;
	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);

	pq->rx_packets++;
	pq->rx_bytes += skb->len;

	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->iif) {
		dev_kfree_skb(skb);
		pq->rx_dropped++;
		return NETDEV_TX_OK;
	}

	txq->trans_start = jiffies;
	skb_queue_tail(&pq->rq, skb);
	if (skb_queue_len(&pq->rq) >= dev->tx_queue_len)
		netif_stop_queue(dev);
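	/* Wake the thread only on the empty->non-empty transition;
	 * otherwise it is already awake and will drain rq itself. */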
	if (skb_queue_len(&pq->rq) == 1)
		wake_up(&pq->wq);

	return NETDEV_TX_OK;
}

static int ifb_close(struct net_device *dev)
{
	struct ifb_private *dp = netdev_priv(dev);
	struct ifb_private_q *pq = dp->pq;
	int i;

	netif_stop_queue(dev);
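	/* kthread_stop() also wakes each thread, so the wait loop in
	 * ifb_thread() sees kthread_should_stop() and exits. */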
	for (i = 0; i < dev->real_num_tx_queues; i++) {
		kthread_stop(pq[i].task);
		skb_queue_purge(&pq[i].tq);
		skb_queue_purge(&pq[i].rq);
	}

	return 0;
}

static int ifb_open(struct net_device *dev)
{
	struct ifb_private *dp = netdev_priv(dev);
	struct ifb_private_q *pq = dp->pq;
	int i;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
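		/* One kernel thread per tx queue, named "<dev>/<n>", so
		 * each can be pinned to a CPU with taskset(1). */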
		pq[i].task = kthread_run(ifb_thread, &pq[i], "%s/%d", dev->name,
					 i);
		if (IS_ERR(pq[i].task)) {
			int err = PTR_ERR(pq[i].task);
			while (--i >= 0)
				kthread_stop(pq[i].task);
			return err;
		}
	}
	netif_start_queue(dev);

	return 0;
}

static u32 simple_tx_hashrnd;

static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	u32 addr1, addr2;
	u32 hash, ihl;
	union {
		u16 in16[2];
		u32 in32;
	} ports;
	u8 ip_proto;

	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
		while (hash >= dev->real_num_tx_queues)
			hash -= dev->real_num_tx_queues;
		return hash;
	}

	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
			ip_proto = ip_hdr(skb)->protocol;
		else
			ip_proto = 0;
		addr1 = ip_hdr(skb)->saddr;
		addr2 = ip_hdr(skb)->daddr;
		ihl = ip_hdr(skb)->ihl << 2;
		break;
	case __constant_htons(ETH_P_IPV6):
		ip_proto = ipv6_hdr(skb)->nexthdr;
		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
		ihl = 40;
		break;
	default:
		return 0;
	}
	if (addr1 > addr2)
		swap(addr1, addr2);

	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_AH:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
		ports.in32 = *((u32 *) (skb_network_header(skb) + ihl));
		if (ports.in16[0] > ports.in16[1])
			swap(ports.in16[0], ports.in16[1]);
		break;

	default:
		ports.in32 = 0;
		break;
	}

	hash = jhash_3words(addr1, addr2, ports.in32,
			    simple_tx_hashrnd ^ ip_proto);

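	/* Map the 32-bit hash onto [0, real_num_tx_queues): the
	 * multiply-and-shift avoids a modulo and is uniform. */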
	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}

static int ifb_init(struct net_device *dev)
{
	struct ifb_private *dp = netdev_priv(dev);
	struct ifb_private_q *pq;
	int i;

	pq = kmalloc(sizeof(*pq) * dev->real_num_tx_queues, GFP_KERNEL);
	if (pq == NULL)
		return -ENOMEM;
	dp->pq = pq;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		pq[i].dev = dev;
		skb_queue_head_init(&pq[i].rq);
		skb_queue_head_init(&pq[i].tq);
		init_waitqueue_head(&pq[i].wq);
		pq[i].rx_packets = 0;
		pq[i].rx_bytes = 0;
		pq[i].rx_dropped = 0;
	}

	return 0;
}

static void ifb_uninit(struct net_device *dev)
{
	struct ifb_private *dp = netdev_priv(dev);

	kfree(dp->pq);
}

static const struct net_device_ops ifb_netdev_ops = {
	.ndo_init		= ifb_init,
	.ndo_uninit		= ifb_uninit,
	.ndo_open		= ifb_open,
	.ndo_stop		= ifb_close,
	.ndo_start_xmit		= ifb_xmit,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_select_queue	= ifb_select_queue,
	.ndo_get_stats		= ifb_get_stats,
};

static void ifb_setup(struct net_device *dev)
{
	/* Initialize the device structure. */
	dev->destructor = free_netdev;
	dev->netdev_ops = &ifb_netdev_ops;

	/* Fill in device structure with ethernet-generic values. */
	ether_setup(dev);
	dev->tx_queue_len = TX_Q_LIMIT;

	dev->flags |= IFF_NOARP;
	dev->flags &= ~IFF_MULTICAST;
	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
	random_ether_addr(dev->dev_addr);
}

static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	return 0;
}

static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
			     unsigned int *num_tx_queues,
			     unsigned int *real_num_tx_queues)
{
	*num_tx_queues = numtxqs;
	*real_num_tx_queues = numtxqs;

	return 0;
}

static struct rtnl_link_ops ifb_link_ops __read_mostly = {
	.kind		= "ifb",
	.setup		= ifb_setup,
	.validate	= ifb_validate,
	.get_tx_queues	= ifb_get_tx_queues,
	.priv_size	= sizeof(struct ifb_private),
};

static int __init ifb_init_one(int index)
{
	struct net_device *dev_ifb;
	int err;

	dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private), "ifb%d",
				  ifb_setup, numtxqs);

	if (!dev_ifb)
		return -ENOMEM;

	err = dev_alloc_name(dev_ifb, dev_ifb->name);
	if (err < 0)
		goto err;

	dev_ifb->rtnl_link_ops = &ifb_link_ops;
	err = register_netdevice(dev_ifb);
	if (err < 0)
		goto err;

	return 0;

err:
	free_netdev(dev_ifb);
	return err;
}

static int __init ifb_init_module(void)
{
	int i, err;

	get_random_bytes(&simple_tx_hashrnd, 4);
	rtnl_lock();
	err = __rtnl_link_register(&ifb_link_ops);
	for (i = 0; i < numifbs && !err; i++)
		err = ifb_init_one(i);
	if (err)
		__rtnl_link_unregister(&ifb_link_ops);
	rtnl_unlock();

	return err;
}

static void __exit ifb_cleanup_module(void)
{
	rtnl_link_unregister(&ifb_link_ops);
}

module_init(ifb_init_module);
module_exit(ifb_cleanup_module);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jamal Hadi Salim");
MODULE_ALIAS_RTNL_LINK("ifb");


* Re: [PATCH] ifb: add multi-queue support
  2009-11-17  5:38         ` Changli Gao
@ 2009-11-17  6:02           ` Stephen Hemminger
  0 siblings, 0 replies; 62+ messages in thread
From: Stephen Hemminger @ 2009-11-17  6:02 UTC (permalink / raw)
  To: Changli Gao; +Cc: David Miller, kaber, eric.dumazet, therbert, netdev

On Tue, 17 Nov 2009 13:38:29 +0800
Changli Gao <xiaosuo@gmail.com> wrote:

> On Tue, Nov 17, 2009 at 11:10 AM, David Miller <davem@davemloft.net> wrote:
> > From: Stephen Hemminger <shemminger@vyatta.com>
> > Date: Mon, 16 Nov 2009 08:39:05 -0800
> >
> >> My $.02 is that receive packet steering (RPS) should be done generically at
> >> the receive layer. Then all the CPU, mapping and configuration issues can be
> >> done once, not just for IFB, Bridge, VLAN, ... The number of users of IFB
> >> is small, and setup is complex. Steering packets in IFB is optimizing only
> >> a rarely used corner.
> >>
> >> Layered link services like IFB need to be multi-threaded lockless to maintain
> >> the advantages of multi-queue and RPS.
> >
> > You're probably right.
> >
> 
> So I have to wait for RPS, and after it gets merged, I'll come back with
> a multi-threaded lockless IFB.

Please don't wait... Help test it and make sure it works.


* Re: [PATCH] ifb: add multi-queue support
  2009-11-17  3:10       ` David Miller
@ 2009-11-17  5:38         ` Changli Gao
  2009-11-17  6:02           ` Stephen Hemminger
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-17  5:38 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, kaber, eric.dumazet, therbert, netdev

On Tue, Nov 17, 2009 at 11:10 AM, David Miller <davem@davemloft.net> wrote:
> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Mon, 16 Nov 2009 08:39:05 -0800
>
>> My $.02 is that receive packet steering (RPS) should be done generically at
>> the receive layer. Then all the CPU, mapping and configuration issues can be
>> done once, not just for IFB, Bridge, VLAN, ... The number of users of IFB
>> is small, and setup is complex. Steering packets in IFB is optimizing only
>> a rarely used corner.
>>
>> Layered link services like IFB need to be multi-threaded lockless to maintain
>> the advantages of multi-queue and RPS.
>
> You're probably right.
>

So I have to wait for RPS, and after it gets merged, I'll come back with
a multi-threaded lockless IFB.


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-16 16:39     ` Stephen Hemminger
@ 2009-11-17  3:10       ` David Miller
  2009-11-17  5:38         ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: David Miller @ 2009-11-17  3:10 UTC (permalink / raw)
  To: shemminger; +Cc: xiaosuo, kaber, eric.dumazet, therbert, netdev

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 16 Nov 2009 08:39:05 -0800

> My $.02 is that receive packet steering (RPS) should be done generically at
> the receive layer. Then all the CPU, mapping and configuration issues can be
> done once, not just for IFB, Bridge, VLAN, ... The number of users of IFB
> is small, and setup is complex. Steering packets in IFB is optimizing only
> a rarely used corner.
> 
> Layered link services like IFB need to be multi-threaded lockless to maintain
> the advantages of multi-queue and RPS.

You're probably right.


* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  4:37   ` Changli Gao
@ 2009-11-16 16:39     ` Stephen Hemminger
  2009-11-17  3:10       ` David Miller
  0 siblings, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2009-11-16 16:39 UTC (permalink / raw)
  To: xiaosuo
  Cc: David S. Miller, Patrick McHardy, Eric Dumazet, Tom Herbert, netdev

On Fri, 13 Nov 2009 12:37:07 +0800
Changli Gao <xiaosuo@gmail.com> wrote:

> ifb: add multi-queue support
> 
> Add multi-queue support; one kernel thread is created per queue.
> It can be used to emulate a multi-queue NIC in software, and to
> distribute work among CPUs.

My $.02 is that receive packet steering (RPS) should be done generically at
the receive layer. Then all the CPU, mapping and configuration issues can be
done once, not just for IFB, Bridge, VLAN, ... The number of users of IFB
is small, and setup is complex. Steering packets in IFB is optimizing only
a rarely used corner.

Layered link services like IFB need to be multi-threaded lockless to maintain
the advantages of multi-queue and RPS.

-- 


* Re: [PATCH] ifb: add multi-queue support
  2009-11-16  8:19 ` Eric Dumazet
@ 2009-11-16  8:43   ` Changli Gao
  0 siblings, 0 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-16  8:43 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Stephen Hemminger, Patrick McHardy, Tom Herbert, netdev

2009/11/16 Eric Dumazet <eric.dumazet@gmail.com>:
> Changli Gao wrote:
>> ifb: add multi-queue support.
>>
>> Add multi-queue support, through more than one tasklet. One tasklet per
>> queue always runs on the same CPU, and if the number of tasklets
>> (queues) is larger than the number of CPUs, more than one tasklet will
>> be assigned to the same CPU.
>>
>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>> ----
>
>>
>
> I must say I fail to see how multiple tasklets per cpu can be of any use.
>
> And :
>
> +       if (skb_queue_len(&pq->rq) == 1)
> +               tasklet_schedule_on(&pq->task, num % num_online_cpus());
>
> Is probably not what you want, because :
>
> A) num_online_cpus() can be expensive to compute,
> B) and you can have such configs :
>
> Three online cpus: CPU0, CPU4 and CPU5
> -> you call tasklet_schedule_on(... , num = {0|1|2})
>
> To avoid packet reordering, if your hash function selects an offline cpu,
> you must forward the packet to a particular cpu, regardless of the cpu currently running.
> (even if the real device is not multiqueue, its RX interrupts can be handled by any online cpu)
>
> You may need to maintain a mapping table with cpu hotplug notifiers.
>

Yeah, a mapping table is needed in any case. But I can't find a suitable
user interface to do that. And I think many people may be interested in
the two new APIs: tasklet_schedule_on() and tasklet_hi_schedule_on().
Any comments?

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-16  7:31 Changli Gao
@ 2009-11-16  8:19 ` Eric Dumazet
  2009-11-16  8:43   ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Eric Dumazet @ 2009-11-16  8:19 UTC (permalink / raw)
  To: xiaosuo
  Cc: David S. Miller, Stephen Hemminger, Patrick McHardy, Tom Herbert, netdev

Changli Gao wrote:
> ifb: add multi-queue support.
> 
> Add multi-queue support, through more than one tasklet. One tasklet per
> queue always runs on the same CPU, and if the number of tasklets
> (queues) is larger than the number of CPUs, more than one tasklet will
> be assigned to the same CPU.
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----

> 

I must say I fail to see how multiple tasklets per cpu can be of any use.

And :

+	if (skb_queue_len(&pq->rq) == 1)
+		tasklet_schedule_on(&pq->task, num % num_online_cpus());

Is probably not what you want, because :

A) num_online_cpus() can be expensive to compute,
B) and you can have such configs :

Three online cpus: CPU0, CPU4 and CPU5
-> you call tasklet_schedule_on(... , num = {0|1|2})

To avoid packet reordering, if your hash function selects an offline cpu,
you must forward the packet to a particular cpu, regardless of the cpu currently running.
(even if the real device is not multiqueue, its RX interrupts can be handled by any online cpu)

You may need to maintain a mapping table with cpu hotplug notifiers.
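
For illustration only, such a table could look roughly like this (a sketch,
not part of any posted patch; the array name, its size and the rebuild
policy are all made up):

static int queue_cpu_map[IFB_MAX_QUEUES];	/* IFB_MAX_QUEUES is hypothetical */

/* Rebuild the map so queues are spread round-robin over online cpus. */
static void rebuild_queue_cpu_map(void)
{
	int cpu = cpumask_first(cpu_online_mask);
	int i;

	for (i = 0; i < IFB_MAX_QUEUES; i++) {
		queue_cpu_map[i] = cpu;
		cpu = cpumask_next(cpu, cpu_online_mask);
		if (cpu >= nr_cpu_ids)
			cpu = cpumask_first(cpu_online_mask);
	}
}

/* Hotplug notifier: refresh the map when a cpu comes or goes. */
static int queue_cpu_callback(struct notifier_block *nb,
			      unsigned long action, void *hcpu)
{
	if (action == CPU_ONLINE || action == CPU_DEAD)
		rebuild_queue_cpu_map();
	return NOTIFY_OK;
}

ifb_xmit() would then call tasklet_schedule_on(&pq->task, queue_cpu_map[num])
instead of using num % num_online_cpus().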


* [PATCH] ifb: add multi-queue support
@ 2009-11-16  7:31 Changli Gao
  2009-11-16  8:19 ` Eric Dumazet
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-16  7:31 UTC (permalink / raw)
  To: David S. Miller, Stephen Hemminger, Patrick McHardy
  Cc: xiaosuo, Eric Dumazet, Tom Herbert, netdev

ifb: add multi-queue support.

Add multi-queue support, through more than one tasklet. One tasklet per
queue always runs on the same CPU, and if the number of tasklets
(queues) is larger than the number of CPUs, more than one tasklet will
be assigned to the same CPU.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
drivers/net/ifb.c | 453 ++++++++++++++++++++++++++++++++--------------
include/linux/if_link.h | 1
include/linux/interrupt.h | 16 +
net/core/rtnetlink.c | 3
4 files changed, 342 insertions(+), 131 deletions(-)

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 69c2566..0639822 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -33,161 +33,145 @@
 #include <linux/etherdevice.h>
 #include <linux/init.h>
 #include <linux/moduleparam.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
 #include <net/pkt_sched.h>
 #include <net/net_namespace.h>
 
-#define TX_TIMEOUT  (2*HZ)
-
 #define TX_Q_LIMIT    32
+
+struct ifb_private_q {
+	struct net_device	*dev;
+	struct sk_buff_head	rq;
+	struct tasklet_struct	task;
+	unsigned long		rx_packets;
+	unsigned long		rx_bytes;
+	unsigned long		rx_dropped;
+} ____cacheline_aligned_in_smp;
+
 struct ifb_private {
-	struct tasklet_struct   ifb_tasklet;
-	int     tasklet_pending;
-	/* mostly debug stats leave in for now */
-	unsigned long   st_task_enter; /* tasklet entered */
-	unsigned long   st_txq_refl_try; /* transmit queue refill attempt */
-	unsigned long   st_rxq_enter; /* receive queue entered */
-	unsigned long   st_rx2tx_tran; /* receive to trasmit transfers */
-	unsigned long   st_rxq_notenter; /*receiveQ not entered, resched */
-	unsigned long   st_rx_frm_egr; /* received from egress path */
-	unsigned long   st_rx_frm_ing; /* received from ingress path */
-	unsigned long   st_rxq_check;
-	unsigned long   st_rxq_rsch;
-	struct sk_buff_head     rq;
-	struct sk_buff_head     tq;
+	struct ifb_private_q	*pq;
 };
 
+/* Number of ifb devices to be set up by this module. */
 static int numifbs = 2;
+module_param(numifbs, int, 0444);
+MODULE_PARM_DESC(numifbs, "Number of ifb devices");
 
-static void ri_tasklet(unsigned long dev);
-static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
-static int ifb_open(struct net_device *dev);
-static int ifb_close(struct net_device *dev);
+/* Number of TX queues per ifb */
+static unsigned int numtxqs = 1;
+module_param(numtxqs, uint, 0444);
+MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
 
-static void ri_tasklet(unsigned long dev)
+static void ifb_tasklet(unsigned long data)
 {
-
-	struct net_device *_dev = (struct net_device *)dev;
-	struct ifb_private *dp = netdev_priv(_dev);
-	struct net_device_stats *stats = &_dev->stats;
-	struct netdev_queue *txq;
+	struct ifb_private_q *pq = (struct ifb_private_q *)data;
+	struct net_device *dev = pq->dev, *_dev;
+	int num = pq - ((struct ifb_private *)netdev_priv(dev))->pq;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
 	struct sk_buff *skb;
-
-	txq = netdev_get_tx_queue(_dev, 0);
-	dp->st_task_enter++;
-	if ((skb = skb_peek(&dp->tq)) == NULL) {
-		dp->st_txq_refl_try++;
-		if (__netif_tx_trylock(txq)) {
-			dp->st_rxq_enter++;
-			while ((skb = skb_dequeue(&dp->rq)) != NULL) {
-				skb_queue_tail(&dp->tq, skb);
-				dp->st_rx2tx_tran++;
-			}
-			__netif_tx_unlock(txq);
-		} else {
-			/* reschedule */
-			dp->st_rxq_notenter++;
-			goto resched;
-		}
-	}
-
-	while ((skb = skb_dequeue(&dp->tq)) != NULL) {
+	struct sk_buff_head tq;
+
+	__skb_queue_head_init(&tq);
+	/* move skb from rq to tq */
+	__netif_tx_lock(txq, smp_processor_id());
+	skb_queue_splice_tail_init(&pq->rq, &tq);
+	if (netif_queue_stopped(dev))
+		netif_wake_queue(dev);
+	__netif_tx_unlock(txq);
+	if (skb_queue_empty(&tq))
+		return;
+
+	/* transfer packets */
+	while ((skb = __skb_dequeue(&tq)) != NULL) {
 		u32 from = G_TC_FROM(skb->tc_verd);
 
 		skb->tc_verd = 0;
 		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
-		stats->tx_packets++;
-		stats->tx_bytes +=skb->len;
+		txq->tx_packets++;
+		txq->tx_bytes += skb->len;
 
 		rcu_read_lock();
-		skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
-		if (!skb->dev) {
-			rcu_read_unlock();
+		_dev = dev_get_by_index_rcu(&init_net, skb->iif);
+		if (_dev != NULL) {
+			skb->dev = _dev;
+			skb->iif = dev->ifindex;
+			if (from & AT_EGRESS) {
+				dev_queue_xmit(skb);
+			} else if (from & AT_INGRESS) {
+				skb_pull(skb, _dev->hard_header_len);
+				netif_rx_ni(skb);
+			} else {
+				BUG();
+			}
+		} else {
 			dev_kfree_skb(skb);
-			stats->tx_dropped++;
-			break;
+			txq->tx_dropped++;
 		}
 		rcu_read_unlock();
-		skb->iif = _dev->ifindex;
-
-		if (from & AT_EGRESS) {
-			dp->st_rx_frm_egr++;
-			dev_queue_xmit(skb);
-		} else if (from & AT_INGRESS) {
-			dp->st_rx_frm_ing++;
-			skb_pull(skb, skb->dev->hard_header_len);
-			netif_rx(skb);
-		} else
-			BUG();
 	}
-
-	if (__netif_tx_trylock(txq)) {
-		dp->st_rxq_check++;
-		if ((skb = skb_peek(&dp->rq)) == NULL) {
-			dp->tasklet_pending = 0;
-			if (netif_queue_stopped(_dev))
-				netif_wake_queue(_dev);
-		} else {
-			dp->st_rxq_rsch++;
-			__netif_tx_unlock(txq);
-			goto resched;
-		}
-		__netif_tx_unlock(txq);
-	} else {
-resched:
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
-	}
-
 }
 
-static const struct net_device_ops ifb_netdev_ops = {
-	.ndo_open	= ifb_open,
-	.ndo_stop	= ifb_close,
-	.ndo_start_xmit	= ifb_xmit,
-	.ndo_validate_addr = eth_validate_addr,
-};
-
-static void ifb_setup(struct net_device *dev)
+static struct net_device_stats *ifb_get_stats(struct net_device *dev)
 {
-	/* Initialize the device structure. */
-	dev->destructor = free_netdev;
-	dev->netdev_ops = &ifb_netdev_ops;
+	struct net_device_stats *stats = &dev->stats;
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	struct netdev_queue *txq;
+	int i;
+	unsigned long rx_packets = 0, rx_bytes = 0, rx_dropped = 0;
+	unsigned long tx_packets = 0, tx_bytes = 0, tx_dropped = 0;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		rx_packets += pq[i].rx_packets;
+		rx_bytes += pq[i].rx_bytes;
+		rx_dropped += pq[i].rx_dropped;
+		txq = netdev_get_tx_queue(dev, i);
+		tx_packets += txq->tx_packets;
+		tx_bytes += txq->tx_bytes;
+		tx_dropped += txq->tx_dropped;
+	}
 
-	/* Fill in device structure with ethernet-generic values. */
-	ether_setup(dev);
-	dev->tx_queue_len = TX_Q_LIMIT;
+	stats->rx_packets = rx_packets;
+	stats->rx_bytes = rx_bytes;
+	stats->rx_dropped = rx_dropped;
+	stats->tx_packets = tx_packets;
+	stats->tx_bytes = tx_bytes;
+	stats->tx_dropped = tx_dropped;
 
-	dev->flags |= IFF_NOARP;
-	dev->flags &= ~IFF_MULTICAST;
-	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
-	random_ether_addr(dev->dev_addr);
+	return stats;
 }
 
 static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct ifb_private *dp = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
 	u32 from = G_TC_FROM(skb->tc_verd);
+	int num = skb_get_queue_mapping(skb);
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq + num;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
 
-	stats->rx_packets++;
-	stats->rx_bytes+=skb->len;
+	pq->rx_packets++;
+	pq->rx_bytes += skb->len;
 
 	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->iif) {
 		dev_kfree_skb(skb);
-		stats->rx_dropped++;
+		pq->rx_dropped++;
 		return NETDEV_TX_OK;
 	}
 
-	if (skb_queue_len(&dp->rq) >= dev->tx_queue_len) {
+	txq->trans_start = jiffies;
+	__skb_queue_tail(&pq->rq, skb);
+	if (skb_queue_len(&pq->rq) >= dev->tx_queue_len)
 		netif_stop_queue(dev);
-	}
-
-	dev->trans_start = jiffies;
-	skb_queue_tail(&dp->rq, skb);
-	if (!dp->tasklet_pending) {
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
-	}
+	if (skb_queue_len(&pq->rq) == 1)
+		tasklet_schedule_on(&pq->task, num % num_online_cpus());
 
 	return NETDEV_TX_OK;
 }
@@ -195,26 +179,217 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 static int ifb_close(struct net_device *dev)
 {
 	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	int i;
 
-	tasklet_kill(&dp->ifb_tasklet);
 	netif_stop_queue(dev);
-	skb_queue_purge(&dp->rq);
-	skb_queue_purge(&dp->tq);
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		tasklet_kill(&pq[i].task);
+		__skb_queue_purge(&pq[i].rq);
+	}
+
 	return 0;
 }
 
 static int ifb_open(struct net_device *dev)
 {
+	netif_start_queue(dev);
+
+	return 0;
+}
+
+static u32 simple_tx_hashrnd;
+
+static inline __be16 pppoe_proto(const struct sk_buff *skb)
+{
+	return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
+			sizeof(struct pppoe_hdr)));
+}
+
+static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	u32 addr1, addr2;
+	u32 hash;
+	union {
+		u16 in16[2];
+		u32 in32;
+	} ports;
+	u8 ip_proto;
+	unsigned int pull_len, proto_pull_len;
+	int ihl;
+
+	if (skb_rx_queue_recorded(skb)) {
+		hash = skb_get_rx_queue(skb);
+		while (hash >= dev->real_num_tx_queues)
+			hash -= dev->real_num_tx_queues;
+		return hash;
+	}
+
+	/* act_mirred pushed the skb before calling dev_queue_xmit() */
+	proto_pull_len = 0;
+	pull_len = skb_network_header(skb) - skb->data;
+	if (unlikely(skb_pull(skb, pull_len) == NULL)) {
+		pull_len = 0;
+		goto process_other;
+	}
+
+	ihl = 0;
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_8021Q):
+		if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL))
+			goto process_other;
+		pull_len += VLAN_HLEN;
+		skb->network_header += VLAN_HLEN;
+		proto_pull_len = VLAN_HLEN;
+		switch (vlan_eth_hdr(skb)->h_vlan_encapsulated_proto) {
+		case __constant_htons(ETH_P_IP):
+			goto process_ip;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case __constant_htons(ETH_P_IPV6):
+			goto process_ipv6;
+#endif
+		default:
+			goto process_other;
+		}
+		break;
+	case __constant_htons(ETH_P_PPP_SES):
+		if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL))
+			goto process_other;
+		pull_len += PPPOE_SES_HLEN;
+		skb->network_header += PPPOE_SES_HLEN;
+		proto_pull_len = PPPOE_SES_HLEN;
+		switch (pppoe_proto(skb)) {
+		case __constant_htons(ETH_P_IP):
+			goto process_ip;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case __constant_htons(ETH_P_IPV6):
+			goto process_ipv6;
+#endif
+		default:
+			goto process_other;
+		}
+		break;
+	case __constant_htons(ETH_P_IP):
+process_ip:
+		if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
+			goto process_other;
+		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
+			ip_proto = ip_hdr(skb)->protocol;
+		else
+			ip_proto = 0;
+		addr1 = ip_hdr(skb)->saddr;
+		addr2 = ip_hdr(skb)->daddr;
+		ihl = ip_hdrlen(skb);
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case __constant_htons(ETH_P_IPV6):
+process_ipv6:
+		if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr))))
+			goto process_other;
+		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
+		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
+		ip_proto = ipv6_hdr(skb)->nexthdr;
+		ihl = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &ip_proto);
+		if (unlikely(ihl < 0))
+			goto process_other_trans;
+		break;
+#endif
+	default:
+process_other:
+		if (pull_len != 0) {
+			skb_push(skb, pull_len);
+			if (proto_pull_len != 0)
+				skb->network_header -= proto_pull_len;
+		}
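+		/* No parsable L3 header: offsets were restored above, so
+		 * just spread flows by the raw protocol value. */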
+		return skb->protocol % dev->real_num_tx_queues;
+	}
+	if (addr1 > addr2)
+		swap(addr1, addr2);
+
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE:
+		if (unlikely(skb_copy_bits(skb, ihl, &ports.in32, 4) < 0))
+			goto process_other_trans;
+		if (ports.in16[0] > ports.in16[1])
+			swap(ports.in16[0], ports.in16[1]);
+		break;
+	default:
+process_other_trans:
+		ports.in32 = 0;
+		break;
+	}
+
+	if (pull_len != 0) {
+		skb_push(skb, pull_len);
+		if (proto_pull_len != 0)
+			skb->network_header -= proto_pull_len;
+	}
+
+	hash = jhash_3words(addr1, addr2, ports.in32,
+			    simple_tx_hashrnd ^ ip_proto);
+
+	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
+}
+
+static int ifb_init(struct net_device *dev)
+{
 	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq;
+	int i;
 
-	tasklet_init(&dp->ifb_tasklet, ri_tasklet, (unsigned long)dev);
-	skb_queue_head_init(&dp->rq);
-	skb_queue_head_init(&dp->tq);
-	netif_start_queue(dev);
+	pq = kcalloc(dev->real_num_tx_queues, sizeof(*pq), GFP_KERNEL);
+	if (pq == NULL)
+		return -ENOMEM;
+	dp->pq = pq;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		pq[i].dev = dev;
+		__skb_queue_head_init(&pq[i].rq);
+		tasklet_init(&pq[i].task, ifb_tasklet, (unsigned long)&pq[i]);
+	}
 
 	return 0;
 }
 
+static void ifb_uninit(struct net_device *dev)
+{
+	struct ifb_private *dp = netdev_priv(dev);
+
+	kfree(dp->pq);
+}
+
+static const struct net_device_ops ifb_netdev_ops = {
+	.ndo_init		= ifb_init,
+	.ndo_uninit		= ifb_uninit,
+	.ndo_open		= ifb_open,
+	.ndo_stop		= ifb_close,
+	.ndo_start_xmit		= ifb_xmit,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_select_queue	= ifb_select_queue,
+	.ndo_get_stats		= ifb_get_stats,
+};
+
+static void ifb_setup(struct net_device *dev)
+{
+	/* Initialize the device structure. */
+	dev->destructor = free_netdev;
+	dev->netdev_ops = &ifb_netdev_ops;
+
+	/* Fill in device structure with ethernet-generic values. */
+	ether_setup(dev);
+	dev->tx_queue_len = TX_Q_LIMIT;
+
+	dev->flags |= IFF_NOARP;
+	dev->flags &= ~IFF_MULTICAST;
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	random_ether_addr(dev->dev_addr);
+}
+
 static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 {
 	if (tb[IFLA_ADDRESS]) {
@@ -226,25 +401,38 @@ static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
+static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
+			     unsigned int *num_tx_queues,
+			     unsigned int *real_num_tx_queues)
+{
+	if (tb[IFLA_NTXQ]) {
+		*num_tx_queues = nla_get_u32(tb[IFLA_NTXQ]);
+		if (*num_tx_queues < 1)
+			return -EINVAL;
+		*real_num_tx_queues = *num_tx_queues;
+	} else {
+		*num_tx_queues = numtxqs;
+		*real_num_tx_queues = numtxqs;
+	}
+
+	return 0;
+}
+
 static struct rtnl_link_ops ifb_link_ops __read_mostly = {
 	.kind		= "ifb",
-	.priv_size	= sizeof(struct ifb_private),
 	.setup		= ifb_setup,
 	.validate	= ifb_validate,
+	.get_tx_queues	= ifb_get_tx_queues,
+	.priv_size	= sizeof(struct ifb_private),
 };
 
-/* Number of ifb devices to be set up by this module. */
-module_param(numifbs, int, 0);
-MODULE_PARM_DESC(numifbs, "Number of ifb devices");
-
 static int __init ifb_init_one(int index)
 {
 	struct net_device *dev_ifb;
 	int err;
 
-	dev_ifb = alloc_netdev(sizeof(struct ifb_private),
-				 "ifb%d", ifb_setup);
-
+	dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private), "ifb%d",
+				  ifb_setup, numtxqs);
 	if (!dev_ifb)
 		return -ENOMEM;
 
@@ -268,9 +456,12 @@ static int __init ifb_init_module(void)
 {
 	int i, err;
 
+	if (numtxqs < 1)
+		return -EINVAL;
+
+	get_random_bytes(&simple_tx_hashrnd, 4);
 	rtnl_lock();
 	err = __rtnl_link_register(&ifb_link_ops);
-
 	for (i = 0; i < numifbs && !err; i++)
 		err = ifb_init_one(i);
 	if (err)
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 1d3b242..99da411 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -78,6 +78,7 @@ enum {
 #define IFLA_LINKINFO IFLA_LINKINFO
 	IFLA_NET_NS_PID,
 	IFLA_IFALIAS,
+	IFLA_NTXQ,
 	__IFLA_MAX
 };
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7ca72b7..0507c33 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -469,6 +469,14 @@ static inline void tasklet_schedule(struct tasklet_struct *t)
 		__tasklet_schedule(t);
 }
 
+static inline void tasklet_schedule_on(struct tasklet_struct *t, int cpu)
+{
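+	/* Run __tasklet_schedule() on the target cpu via an IPI; if
+	 * smp_call_function_single() fails (e.g. the cpu is offline),
+	 * fall back to scheduling on the local cpu. */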
+	if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state) &&
+	    unlikely(smp_call_function_single(cpu,
+				(void(*)(void*))__tasklet_schedule, t, 0)))
+		__tasklet_schedule(t);
+}
+
 extern void __tasklet_hi_schedule(struct tasklet_struct *t);
 
 static inline void tasklet_hi_schedule(struct tasklet_struct *t)
@@ -477,6 +485,14 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t)
 		__tasklet_hi_schedule(t);
 }
 
+static inline void tasklet_hi_schedule_on(struct tasklet_struct *t, int cpu)
+{
+	if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state) &&
+	    unlikely(smp_call_function_single(cpu,
+				(void(*)(void*))__tasklet_hi_schedule, t, 0)))
+		__tasklet_hi_schedule(t);
+}
+
 extern void __tasklet_hi_schedule_first(struct tasklet_struct *t);
 
 /*
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 33148a5..1cbe555 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -597,6 +597,7 @@ static inline size_t if_nlmsg_size(const struct net_device *dev)
 	       + nla_total_size(4) /* IFLA_MASTER */
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
 	       + nla_total_size(1) /* IFLA_LINKMODE */
+	       + nla_total_size(4) /* IFLA_NTXQ */
 	       + rtnl_link_get_size(dev); /* IFLA_LINKINFO */
 }
 
@@ -627,6 +628,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 		   netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
 	NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
 	NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+	NLA_PUT_U32(skb, IFLA_NTXQ, dev->real_num_tx_queues);
 
 	if (dev->ifindex != dev->iflink)
 		NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
@@ -725,6 +727,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
 	[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },
+	[IFLA_NTXQ]		= { .type = NLA_U32 },
 };
 EXPORT_SYMBOL(ifla_policy);
 




* Re: [PATCH] ifb: add multi-queue support
  2009-11-14 12:53                           ` Eric Dumazet
@ 2009-11-14 13:30                             ` Changli Gao
  0 siblings, 0 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-14 13:30 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Jarek Poplawski, David S. Miller,
	Patrick McHardy, Tom Herbert, netdev

On Sat, Nov 14, 2009 at 8:53 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Changli Gao wrote:
>> Yeah, the overhead of SoftIRQs is less than that of kernel threads, and
>> I'll try to find a way to get both flexibility and efficiency. Maybe I
>> need some real NIC drivers as examples. Is there a standard API to bind
>> the RQs of a NIC to CPUs, such as an ioctl or setsockopt?
>
> That's a good question... napi is bound to a cpu, and you'll need
> the things that were done by Tom Herbert in his RPS patch to
> be able to delegate work to remote cpus' napi contexts.
>
> But if we consider RPS close to being committed, shouldn't
> IFB use it, in order not to duplicate changes ?
>
> Or, if you prefer, once RPS is in, we don't need to change IFB, since
> RPS will already split the load across multiple cpus before entering IFB.
>
>

I knew about RPS before I began my IFB work, so I added Tom to the CC
list, but I don't know if RPS will make it into the mainline. Since there
isn't any core change needed for IFB, as there is for RPS, I think it is
easier to merge IFB into the mainline. If RPS is merged before IFB-MQ,
I'll give IFB-MQ up, and vice versa, I think.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-13 23:42                         ` Changli Gao
@ 2009-11-14 12:53                           ` Eric Dumazet
  2009-11-14 13:30                             ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Eric Dumazet @ 2009-11-14 12:53 UTC (permalink / raw)
  To: Changli Gao
  Cc: Stephen Hemminger, Jarek Poplawski, David S. Miller,
	Patrick McHardy, Tom Herbert, netdev

Changli Gao wrote:
> Yeah, the overhead of SoftIRQs is less than that of kernel threads, and
> I'll try to find a way to get both flexibility and efficiency. Maybe I
> need some real NIC drivers as examples. Is there a standard API to bind
> the RQs of a NIC to CPUs, such as an ioctl or setsockopt?

That's a good question... napi is bound to a cpu, and you'll need
the things that were done by Tom Herbert in his RPS patch to
be able to delegate work to remote cpus' napi contexts.

But if we consider RPS close to being committed, shouldn't
IFB use it, in order not to duplicate changes ?

Or, if you prefer, once RPS is in, we don't need to change IFB, since
RPS will already split the load across multiple cpus before entering IFB.



* Re: [PATCH] ifb: add multi-queue support
  2009-11-13 23:32                       ` Stephen Hemminger
@ 2009-11-13 23:42                         ` Changli Gao
  2009-11-14 12:53                           ` Eric Dumazet
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-13 23:42 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jarek Poplawski, Eric Dumazet, David S. Miller, Patrick McHardy,
	Tom Herbert, netdev

On Sat, Nov 14, 2009 at 7:32 AM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> On Sat, 14 Nov 2009 07:28:42 +0800
> Changli Gao <xiaosuo@gmail.com> wrote:
>
>> On Sat, Nov 14, 2009 at 12:15 AM, Stephen Hemminger
>>
>> It needs to send remote SoftIRQs, as Receive Packet Steering does, and we
>> must support an extra interface to map load to CPUs.
>>
>
> But it could still use NAPI to avoid causing excess per-packet overhead.
> The softirq would be the equivalent of a hardirq in a network device.
>

Yeah, the overhead of SoftIRQs is less than that of kernel threads, and
I'll try to find a way to get both flexibility and efficiency. Maybe I
need some real NIC drivers as examples. Is there a standard API to bind
the RQs of a NIC to CPUs, such as an ioctl or setsockopt?


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-13 23:28                     ` Changli Gao
@ 2009-11-13 23:32                       ` Stephen Hemminger
  2009-11-13 23:42                         ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2009-11-13 23:32 UTC (permalink / raw)
  To: Changli Gao
  Cc: Jarek Poplawski, Eric Dumazet, David S. Miller, Patrick McHardy,
	Tom Herbert, netdev

On Sat, 14 Nov 2009 07:28:42 +0800
Changli Gao <xiaosuo@gmail.com> wrote:

> On Sat, Nov 14, 2009 at 12:15 AM, Stephen Hemminger
> <shemminger@vyatta.com> wrote:
> > On Fri, 13 Nov 2009 17:38:56 +0800
> > Changli Gao <xiaosuo@gmail.com> wrote:
> >>
> >> Oh, :) . I know more than one company uses kernel threads to forward
> >> packets, and there isn't any explicit extra overhead at all. And as you
> >> know, as throughput increases, NAPI will bind the NIC to a CPU, and
> >> softirqd will be woken up to do the work, which should be done in
> >> SoftIRQ context. At that time, there isn't any difference between my
> >> approach and the current kernel's.
> >>
> > Why not make IFB a NAPI device?  This would get rid of the extra soft-irq
> > round trip from going through netif_rx().  It would also behave like a
> > regular multi-queue receive device, and eliminate the need for separate
> > tasklets or threads.
> >
> 
> It needs to send remote SoftIRQs, as Receive Packet Steering does, and we
> must support an extra interface to map load to CPUs.
> 

But it could still use NAPI to avoid causing excess per-packet overhead.
The softirq would be the equivalent of a hardirq in a network device.

-- 


* Re: [PATCH] ifb: add multi-queue support
  2009-11-13 16:15                   ` Stephen Hemminger
@ 2009-11-13 23:28                     ` Changli Gao
  2009-11-13 23:32                       ` Stephen Hemminger
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-13 23:28 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jarek Poplawski, Eric Dumazet, David S. Miller, Patrick McHardy,
	Tom Herbert, netdev

On Sat, Nov 14, 2009 at 12:15 AM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> On Fri, 13 Nov 2009 17:38:56 +0800
> Changli Gao <xiaosuo@gmail.com> wrote:
>>
>> Oh, :) . I know more than one company uses kernel threads to forward
>> packets, and there isn't any explicit extra overhead at all. And as you
>> know, as throughput increases, NAPI will bind the NIC to a CPU, and
>> softirqd will be woken up to do the work, which should be done in
>> SoftIRQ context. At that time, there isn't any difference between my
>> approach and the current kernel's.
>>
> Why not make IFB a NAPI device?  This would get rid of the extra soft-irq
> round trip from going through netif_rx().  It would also behave like a
> regular multi-queue receive device, and eliminate the need for separate
> tasklets or threads.
>

It needs to send remote SoftIRQs, as Receive Packet Steering does, and we
must support an extra interface to map load to CPUs.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  9:38                 ` Changli Gao
  2009-11-13  9:57                   ` Jarek Poplawski
@ 2009-11-13 16:15                   ` Stephen Hemminger
  2009-11-13 23:28                     ` Changli Gao
  1 sibling, 1 reply; 62+ messages in thread
From: Stephen Hemminger @ 2009-11-13 16:15 UTC (permalink / raw)
  To: Changli Gao
  Cc: Jarek Poplawski, Eric Dumazet, David S. Miller, Patrick McHardy,
	Tom Herbert, netdev

On Fri, 13 Nov 2009 17:38:56 +0800
Changli Gao <xiaosuo@gmail.com> wrote:

> On Fri, Nov 13, 2009 at 5:18 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> > On Fri, Nov 13, 2009 at 04:54:50PM +0800, Changli Gao wrote:
> >> On Fri, Nov 13, 2009 at 3:45 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> >>
> >> I have done a simple test. I run a simple program on computer A, which
> >> sends SYN packets with random source ports to computer B's port 80 (no
> >> socket listens on that port, so TCP reset packets are sent) at
> >> 90kpps. On computer B, I redirect the traffic to IFB. At the same
> >> time, I ping from B to A to get the RTT between them. I can't see any
> >> difference between the original IFB and my MQ version. They are both:
> >>
> >> CPU idle: 50%
> >> Latency: 0.3-0.4ms, burst 2ms.
> >>
> >
> > I'm mostly concerned with routers doing forwarding with 1Gb or 10Gb
> > NICs (including multiqueue). Alas/happily I don't have such a problem,
> > but can't help you with testing either.
> >
> 
> Oh, :) . I know more than one company uses kernel threads to forward
> packets, and there isn't any explicit extra overhead at all. And as you
> know, as throughput increases, NAPI will bind the NIC to a CPU, and
> softirqd will be woken up to do the work, which should be done in
> SoftIRQ context. At that time, there isn't any difference between my
> approach and the current kernel's.
> 
> 

Why not make IFB a NAPI device?  This would get rid of the extra soft-irq
round trip from going through netif_rx().  It would also behave like a
regular multi-queue receive device, and eliminate the need for separate
tasklets or threads.
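
Roughly, each ifb queue could embed a napi_struct; a sketch follows
(hypothetical code, not a posted patch; it assumes a 'napi' member in
struct ifb_private_q and reuses the reinjection logic of ri_tasklet()):

static int ifb_poll(struct napi_struct *napi, int budget)
{
	struct ifb_private_q *pq = container_of(napi, struct ifb_private_q,
						napi);
	struct sk_buff *skb;
	int work = 0;

	while (work < budget && (skb = skb_dequeue(&pq->rq)) != NULL) {
		u32 from = G_TC_FROM(skb->tc_verd);

		skb->tc_verd = SET_TC_NCLS(0);
		rcu_read_lock();
		skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
		rcu_read_unlock();
		if (!skb->dev) {
			dev_kfree_skb(skb);
			continue;
		}
		skb->iif = pq->dev->ifindex;
		if (from & AT_EGRESS) {
			dev_queue_xmit(skb);
		} else {
			skb_pull(skb, skb->dev->hard_header_len);
			netif_receive_skb(skb);	/* no extra netif_rx() hop */
		}
		work++;
	}
	if (work < budget)
		napi_complete(napi);
	return work;
}

ifb_xmit() would then call napi_schedule(&pq->napi) instead of waking a
thread or scheduling a tasklet.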



* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  8:54             ` Changli Gao
  2009-11-13  9:18               ` Jarek Poplawski
@ 2009-11-13 13:55               ` Eric Dumazet
  1 sibling, 0 replies; 62+ messages in thread
From: Eric Dumazet @ 2009-11-13 13:55 UTC (permalink / raw)
  To: Changli Gao
  Cc: Jarek Poplawski, David S. Miller, Stephen Hemminger,
	Patrick McHardy, Tom Herbert, netdev

Changli Gao wrote:

> I have done a simple test. I run a simple program on computer A, which
> sends SYN packets with random source ports to computer B's port 80 (no
> socket listens on that port, so TCP reset packets are sent) at
> 90kpps. On computer B, I redirect the traffic to IFB. At the same
> time, I ping from B to A to get the RTT between them. I can't see any
> difference between the original IFB and my MQ version. They are both:
> 
> CPU idle: 50%
> Latency: 0.3-0.4ms, burst 2ms.

Yes, but redo your test with normal load (including several user threads
fighting for scheduler/cpu use) :)

softirq preempts user threads instantly (they are interrupts after all :) ),
while workqueues do compete with other workqueues/threads.

Tom Herbert's RPS solution is still using softirqs.

I'd like to test a workqueue-based solution with IFB, but some people might still
keep the previous pure softirq solution for production environments, not benchmarks...



* Re: [PATCH] ifb: add multi-queue support
  2009-11-13 11:25                     ` Changli Gao
  2009-11-13 12:32                       ` Jarek Poplawski
@ 2009-11-13 13:10                       ` Eric Dumazet
  1 sibling, 0 replies; 62+ messages in thread
From: Eric Dumazet @ 2009-11-13 13:10 UTC (permalink / raw)
  To: Changli Gao
  Cc: Jarek Poplawski, David S. Miller, Stephen Hemminger,
	Patrick McHardy, Tom Herbert, netdev

Changli Gao wrote:
> On Fri, Nov 13, 2009 at 5:57 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
>> I'm not against your solution at all. It only needs more proof...
> 
> I know. :)
> 
>> You seem to forget the main networking paths now are just softirq, and
>> it's probably for some reason. If kernel threads are good enough, it
>> seems we should do more such changes.
>>
> 
> I find there is still a ksoftirqd kernel thread for each online CPU,
> and these threads will be woken up if there are more SoftIRQs that need
> to be done after restarting SoftIRQ processing many times, to keep
> the whole system responsive. Did I miss something? And on the other side,
> the real-time branch wants to base all activities on kernel
> threads, even ISRs.
> 
> 


You seem to focus on stress loads, where we enter ksoftirqd more to get
more throughput (and cpu fairness), at the expense of latency.

But in normal situations, ksoftirqd is not involved at all.



* Re: [PATCH] ifb: add multi-queue support
  2009-11-13 11:25                     ` Changli Gao
@ 2009-11-13 12:32                       ` Jarek Poplawski
  2009-11-13 13:10                       ` Eric Dumazet
  1 sibling, 0 replies; 62+ messages in thread
From: Jarek Poplawski @ 2009-11-13 12:32 UTC (permalink / raw)
  To: Changli Gao
  Cc: Eric Dumazet, David S. Miller, Stephen Hemminger,
	Patrick McHardy, Tom Herbert, netdev

On Fri, Nov 13, 2009 at 07:25:33PM +0800, Changli Gao wrote:
> I find there is still a ksoftirqd kernel thread for each online CPU,
> and these threads will be woken up if there are more SoftIRQs that need
> to be done after restarting SoftIRQ processing many times, to keep
> the whole system responsive. Did I miss something? And on the other side,
> the real-time branch wants to base all activities on kernel
> threads, even ISRs.

I wish them well! But again: for some strange reason network admins,
who sometimes care to complain here, rarely (very rarely) mention this
RT branch... (Actually, I recall only one case, and it probably wasn't
even a real admin ;-)

Jarek P.



* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  9:57                   ` Jarek Poplawski
@ 2009-11-13 11:25                     ` Changli Gao
  2009-11-13 12:32                       ` Jarek Poplawski
  2009-11-13 13:10                       ` Eric Dumazet
  0 siblings, 2 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-13 11:25 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Eric Dumazet, David S. Miller, Stephen Hemminger,
	Patrick McHardy, Tom Herbert, netdev

On Fri, Nov 13, 2009 at 5:57 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
>
> I'm not against your solution at all. It only needs more proof...

I know. :)

> You seem to forget that the main networking paths now are pure softirq,
> and that's probably for a reason. If kernel threads are good enough, it
> seems we should make more such changes.
>

I find there is still a ksoftirqd kernel thread for each online CPU,
and these threads will be woken up if there are more SoftIRQs that need
to be done after restarting SoftIRQ processing many times, to keep
the whole system responsive. Did I miss something? And on the other side,
the real-time branch wants to base all activities on kernel
threads, even ISRs.


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  9:38                 ` Changli Gao
@ 2009-11-13  9:57                   ` Jarek Poplawski
  2009-11-13 11:25                     ` Changli Gao
  2009-11-13 16:15                   ` Stephen Hemminger
  1 sibling, 1 reply; 62+ messages in thread
From: Jarek Poplawski @ 2009-11-13  9:57 UTC (permalink / raw)
  To: Changli Gao
  Cc: Eric Dumazet, David S. Miller, Stephen Hemminger,
	Patrick McHardy, Tom Herbert, netdev

On Fri, Nov 13, 2009 at 05:38:56PM +0800, Changli Gao wrote:
> On Fri, Nov 13, 2009 at 5:18 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> > I'm mostly concerned with routers doing forwarding with 1Gb or 10Gb
> > NICs (including multiqueue). Alas/happily I don't have such a problem,
> > but can't help you with testing either.
> >
> 
> Oh, :) . I know more than one company uses kernel threads to forward
> packets, and there isn't any explicit extra overhead at all. And as you
> know, as throughput increases, NAPI will bind the NIC to a CPU, and
> softirqd will be woken up to do the work, which should be done in
> SoftIRQ context. At that time, there isn't any difference between my
> approach and the current kernel's.

I'm not against your solution at all. It only needs more proof... You
seem to forget that the main networking paths now are pure softirq, and
that's probably for a reason. If kernel threads are good enough, it
seems we should make more such changes.

Jarek P.


* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  9:18               ` Jarek Poplawski
@ 2009-11-13  9:38                 ` Changli Gao
  2009-11-13  9:57                   ` Jarek Poplawski
  2009-11-13 16:15                   ` Stephen Hemminger
  0 siblings, 2 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-13  9:38 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Eric Dumazet, David S. Miller, Stephen Hemminger,
	Patrick McHardy, Tom Herbert, netdev

On Fri, Nov 13, 2009 at 5:18 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> On Fri, Nov 13, 2009 at 04:54:50PM +0800, Changli Gao wrote:
>> On Fri, Nov 13, 2009 at 3:45 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
>>
>> I have done a simple test. I run a simple program on computer A, which
>> sends SYN packets with random source ports to computer B's port 80 (no
>> socket listens on that port, so TCP reset packets are sent) at
>> 90kpps. On computer B, I redirect the traffic to IFB. At the same
>> time, I ping from B to A to get the RTT between them. I can't see any
>> difference between the original IFB and my MQ version. They are both:
>>
>> CPU idle: 50%
>> Latency: 0.3-0.4ms, burst 2ms.
>>
>
> I'm mostly concerned with routers doing forwarding with 1Gb or 10Gb
> NICs (including multiqueue). Alas/happily I don't have such a problem,
> but can't help you with testing either.
>

Oh, :) . I know more than one company uses kernel threads to forward
packets, and there isn't any explicit extra overhead at all. And as you
know, as throughput increases, NAPI will bind the NIC to a CPU, and
softirqd will be woken up to do the work, which should be done in
SoftIRQ context. At that time, there isn't any difference between my
approach and the current kernel's.


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  8:54             ` Changli Gao
@ 2009-11-13  9:18               ` Jarek Poplawski
  2009-11-13  9:38                 ` Changli Gao
  2009-11-13 13:55               ` Eric Dumazet
  1 sibling, 1 reply; 62+ messages in thread
From: Jarek Poplawski @ 2009-11-13  9:18 UTC (permalink / raw)
  To: Changli Gao
  Cc: Eric Dumazet, David S. Miller, Stephen Hemminger,
	Patrick McHardy, Tom Herbert, netdev

On Fri, Nov 13, 2009 at 04:54:50PM +0800, Changli Gao wrote:
> On Fri, Nov 13, 2009 at 3:45 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> > On 13-11-2009 07:16, Changli Gao wrote:
> >
> > I don't think so. There would be a lot of code duplication and later
> > maintenance problems only because of the scheduling method. The main
> > question is to establish if there is really no performance difference
> > (which I doubt) - unless Changli can show some tests for various
> > setups now. On the other hand, if there is a difference, why keep an
> > ineffective solution - a similar thing should be possible to do in the
> > softirq context as well.
> >
> > So it should not be a big problem to have it a bit messy for some
> > testing time. Since we can use separate ->ndo_start_xmit() etc., it
> > shouldn't be too messy, I guess.
> >
> 
> I have done a simple test. I run a simple program on computer A, which
> sends SYN packets with random source ports to computer B's port 80 (no
> socket listens on that port, so TCP reset packets are sent) at
> 90kpps. On computer B, I redirect the traffic to IFB. At the same
> time, I ping from B to A to get the RTT between them. I can't see any
> difference between the original IFB and my MQ version. They are both:
> 
> CPU idle: 50%
> Latency: 0.3-0.4ms, burst 2ms.
> 

I'm mostly concerned with routers doing forwarding with 1Gb or 10Gb
NICs (including multiqueue). Alas/happily I don't have such a problem,
but can't help you with testing either.

Regards,
Jarek P.


* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  7:45           ` Jarek Poplawski
@ 2009-11-13  8:54             ` Changli Gao
  2009-11-13  9:18               ` Jarek Poplawski
  2009-11-13 13:55               ` Eric Dumazet
  0 siblings, 2 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-13  8:54 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Eric Dumazet, David S. Miller, Stephen Hemminger,
	Patrick McHardy, Tom Herbert, netdev

On Fri, Nov 13, 2009 at 3:45 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> On 13-11-2009 07:16, Changli Gao wrote:
>
> I don't think so. There would be a lot of code duplication and later
> maintenance problems only because of the scheduling method. The main
> question is to establish if there is really no performance difference
> (which I doubt) - unless Changli can show some tests for various
> setups now. On the other hand, if there is a difference, why keep an
> ineffective solution - a similar thing should be possible to do in the
> softirq context as well.
>
> So it should not be a big problem to have it a bit messy for some
> testing time. Since we can use separate ->ndo_start_xmit() etc., it
> shouldn't be too messy, I guess.
>

I have done a simple test. I run a simple program on computer A, which
sends SYN packets with random source ports to computer B's port 80 (no
socket listens on that port, so TCP reset packets are sent) at
90kpps. On computer B, I redirect the traffic to IFB. At the same
time, I ping from B to A to get the RTT between them. I can't see any
difference between the original IFB and my MQ version. They are both:

CPU idle: 50%
Latency: 0.3-0.4ms, burst 2ms.


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  6:16         ` Changli Gao
@ 2009-11-13  7:45           ` Jarek Poplawski
  2009-11-13  8:54             ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Jarek Poplawski @ 2009-11-13  7:45 UTC (permalink / raw)
  To: Changli Gao
  Cc: Eric Dumazet, David S. Miller, Stephen Hemminger,
	Patrick McHardy, Tom Herbert, netdev

On 13-11-2009 07:16, Changli Gao wrote:
> 2009/11/13 Eric Dumazet <eric.dumazet@gmail.com>:
>> Messy? Because of a few tests added to the code, and branches always
>> correctly predicted?
>>
>> Still, some people might rely on tasklets instead of workqueues,
>> given the added scheduler stress and latency penalty. Tasklets are softirqs
>> and normally are processed a few nanosecs after the RX softirq,
>> on the same CPU, while with your workqueue, I guess the scheduler will
>> try not to migrate it, so we add a penalty for light to moderate load.
>>
>> I guess this new ifb mode would be a regression for them?
>>
>> If you don't want to maintain a compatibility mode, maybe you
>> should introduce a completely new driver, drivers/net/ifbmq.c or ifbwq.c
>>
>> (multiqueue or workqueue references)
>>
> 
> It sounds like a good idea.

I don't think so. There would be a lot of code duplication and later
maintenance problems only because of the scheduling method. The main
question is to establish if there is really no performance difference
(which I doubt) - unless Changli can show some tests for various
setups now. On the other hand, if there is a difference, why keep an
ineffective solution - a similar thing should be possible to do in the
softirq context as well.

So it should not be a big problem to have it a bit messy for some
testing time. Since we can use separate ->ndo_start_xmit() etc., it
shouldn't be too messy, I guess.

Jarek P.


* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  1:32       ` Changli Gao
@ 2009-11-13  7:18         ` Patrick McHardy
  0 siblings, 0 replies; 62+ messages in thread
From: Patrick McHardy @ 2009-11-13  7:18 UTC (permalink / raw)
  To: Changli Gao
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert, netdev

Changli Gao wrote:
> On Thu, Nov 12, 2009 at 11:11 PM, Patrick McHardy <kaber@trash.net> wrote:
>> Changli Gao wrote:
>>> The corresponding iprout2 patch is attached.
>> Printing the value during dumps is missing from this patch.
>>
> 
> I don't know the rules about adding more dumps. Can I add it at the
> end of the corresponding line?
> 
> ifb0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN qlen 32 ntxqs 4

Yes, sounds fine.
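
For reference, the iproute2 patch attached elsewhere in this thread
implements this in ipaddress.c's link dump path:

	if (tb[IFLA_NTXQ])
		fprintf(fp, " numtxqs %d",
			*(unsigned int *)RTA_DATA(tb[IFLA_NTXQ]));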

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  5:56       ` Eric Dumazet
@ 2009-11-13  6:16         ` Changli Gao
  2009-11-13  7:45           ` Jarek Poplawski
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-13  6:16 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Stephen Hemminger, Patrick McHardy, Tom Herbert, netdev

2009/11/13 Eric Dumazet <eric.dumazet@gmail.com>:
>
> Messy? Because of a few tests added in the code, and branches always
> correctly predicted?
>
> Still, some people might rely on tasklets instead of workqueues,
> with their added scheduler stress and latency penalty. Tasklets are softirqs
> and are normally processed a few nanosecs later than the RX softirq,
> on the same CPU, while with your workqueue, I guess the scheduler will
> try not to migrate it, so we add a penalty for light to moderate load.
>
> I guess this new ifb mode would be a regression for them?
>
> If you don't want to maintain a compatibility mode, maybe you
> should introduce a completely new driver, drivers/net/ifbmq.c or ifbwq.c
>
> (multiqueue or workqueue references)
>

It sounds like a good idea.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  1:26     ` Changli Gao
@ 2009-11-13  5:56       ` Eric Dumazet
  2009-11-13  6:16         ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Eric Dumazet @ 2009-11-13  5:56 UTC (permalink / raw)
  To: Changli Gao
  Cc: David S. Miller, Stephen Hemminger, Patrick McHardy, Tom Herbert, netdev

Changli Gao wrote:
> 2009/11/12 Eric Dumazet <eric.dumazet@gmail.com>:
>> I believe this patch is fine, but maybe Jarek concern about workqueue
>> vs tasklet should be addressed...
>>
>> We could use the previous handling in case numtxqs==1 , ie use a tasklet
>> instead of a work queue ?
> 
> I don't think it is a good idea. If we do so, the code will be messy,
> and lose the flexibility of process scheduling. In fact, latency isn't a
> problem when system load isn't high, and when system load is high (due to
> too many NIC IRQs), throughput and interactivity are more important, and
> the current Linux networking subsystem already does so through the
> ksoftirqd mechanism.
> 

Messy? Because of a few tests added in the code, and branches always
correctly predicted?

Still, some people might rely on tasklets instead of workqueues,
with their added scheduler stress and latency penalty. Tasklets are softirqs
and are normally processed a few nanosecs later than the RX softirq,
on the same CPU, while with your workqueue, I guess the scheduler will
try not to migrate it, so we add a penalty for light to moderate load.

I guess this new ifb mode would be a regression for them?

If you don't want to maintain a compatibility mode, maybe you
should introduce a completely new driver, drivers/net/ifbmq.c or ifbwq.c

(multiqueue or workqueue references)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-13  4:42 Changli Gao
@ 2009-11-13  4:46 ` Changli Gao
  0 siblings, 0 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-13  4:46 UTC (permalink / raw)
  To: David S. Miller, Stephen Hemminger, Patrick McHardy
  Cc: xiaosuo, Eric Dumazet, Tom Herbert, netdev

[-- Attachment #1: Type: text/plain, Size: 1102 bytes --]

2009/11/13 Changli Gao <xiaosuo@gmail.com>:
> ifb: add multi-queue support
>
> Add multi-queue support; one kernel thread is created per queue.
> It can be used to emulate a multi-queue NIC in software and distribute
> work among CPUs.
> gentux linux # modprobe ifb numtxqs=2
> gentux linux # ifconfig ifb0 up
> gentux linux # pgrep ifb0
> 18508
> 18509
> gentux linux # taskset -p 1 18508
> pid 18508's current affinity mask: 3
> pid 18508's new affinity mask: 1
> gentux linux # taskset -p 2 18509
> pid 18509's current affinity mask: 3
> pid 18509's new affinity mask: 2
> gentux linux # tc qdisc add dev br0 ingress
> gentux linux # tc filter add dev br0 parent ffff: protocol ip basic
> action mirred egress redirect dev ifb0
>
> This patch also introduces an ip link option "numtxqs" for specifying the
> number of TX queues, so you can add a new ifb with the command:
>
> ip link add numtxqs 4 type ifb
>
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----

The corresponding iproute2 patch is attached.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

[-- Attachment #2: iproute2-support-numtxqs.diff --]
[-- Type: application/octet-stream, Size: 1628 bytes --]

--- ip/iplink.c.orig	2009-11-11 16:56:41.000000000 +0800
+++ ip/iplink.c	2009-11-12 14:57:25.000000000 +0800
@@ -48,6 +48,7 @@
 		fprintf(stderr, "                   [ address LLADDR ]\n");
 		fprintf(stderr, "                   [ broadcast LLADDR ]\n");
 		fprintf(stderr, "                   [ mtu MTU ]\n");
+		fprintf(stderr, "                   [ numtxqs NUMTXQS ]\n");
 		fprintf(stderr, "                   type TYPE [ ARGS ]\n");
 		fprintf(stderr, "       ip link delete DEV type TYPE [ ARGS ]\n");
 		fprintf(stderr, "\n");
@@ -180,6 +181,7 @@
 	int qlen = -1;
 	int mtu = -1;
 	int netns = -1;
+	int numtxqs = -1;
 
 	ret = argc;
 
@@ -221,6 +223,13 @@
 			if (get_integer(&mtu, *argv, 0))
 				invarg("Invalid \"mtu\" value\n", *argv);
 			addattr_l(&req->n, sizeof(*req), IFLA_MTU, &mtu, 4);
+		} else if (strcmp(*argv, "numtxqs") == 0) {
+			NEXT_ARG();
+			if (numtxqs != -1)
+				duparg("numtxqs", *argv);
+			if (get_integer(&numtxqs, *argv, 0))
+				invarg("Invalid \"numtxqs\" value\n", *argv);
+			addattr_l(&req->n, sizeof(*req), IFLA_NTXQ, &numtxqs, 4);
                 } else if (strcmp(*argv, "netns") == 0) {
                         NEXT_ARG();
                         if (netns != -1)
--- ip/ipaddress.c.orig	2009-11-13 12:00:33.000000000 +0800
+++ ip/ipaddress.c	2009-11-13 12:04:14.000000000 +0800
@@ -255,6 +255,9 @@
 	if (filter.showqueue)
 		print_queuelen(fp, (char*)RTA_DATA(tb[IFLA_IFNAME]));
 
+	if (tb[IFLA_NTXQ])
+		fprintf(fp, " numtxqs %d", *(unsigned int*)RTA_DATA(tb[IFLA_NTXQ]));
+
 	if (!filter.family || filter.family == AF_PACKET) {
 		SPRINT_BUF(b1);
 		fprintf(fp, "%s", _SL_);

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH] ifb: add multi-queue support
@ 2009-11-13  4:42 Changli Gao
  2009-11-13  4:46 ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-13  4:42 UTC (permalink / raw)
  To: David S. Miller, Stephen Hemminger, Patrick McHardy
  Cc: xiaosuo, Eric Dumazet, Tom Herbert, netdev

ifb: add multi-queue support

Add multi-queue support; one kernel thread is created per queue.
It can be used to emulate a multi-queue NIC in software and distribute
work among CPUs.
gentux linux # modprobe ifb numtxqs=2
gentux linux # ifconfig ifb0 up
gentux linux # pgrep ifb0
18508
18509
gentux linux # taskset -p 1 18508
pid 18508's current affinity mask: 3
pid 18508's new affinity mask: 1
gentux linux # taskset -p 2 18509
pid 18509's current affinity mask: 3
pid 18509's new affinity mask: 2
gentux linux # tc qdisc add dev br0 ingress
gentux linux # tc filter add dev br0 parent ffff: protocol ip basic
action mirred egress redirect dev ifb0

This patch also introduces an ip link option "numtxqs" for specifying the
number of TX queues, so you can add a new ifb with the command:

ip link add numtxqs 4 type ifb

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
drivers/net/ifb.c | 491 ++++++++++++++++++++++++++++++++++--------------
include/linux/if_link.h | 1
net/core/rtnetlink.c | 3
3 files changed, 360 insertions(+), 135 deletions(-)

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 69c2566..96ab20a 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -33,161 +33,162 @@
 #include <linux/etherdevice.h>
 #include <linux/init.h>
 #include <linux/moduleparam.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
 #include <net/pkt_sched.h>
 #include <net/net_namespace.h>
 
-#define TX_TIMEOUT  (2*HZ)
-
 #define TX_Q_LIMIT    32
+
+struct ifb_private_q {
+	struct net_device	*dev;
+	struct sk_buff_head	rq;
+	wait_queue_head_t	wq;
+	struct task_struct	*task;
+	unsigned long		rx_packets;
+	unsigned long		rx_bytes;
+	unsigned long		rx_dropped;
+} ____cacheline_aligned_in_smp;
+
 struct ifb_private {
-	struct tasklet_struct   ifb_tasklet;
-	int     tasklet_pending;
-	/* mostly debug stats leave in for now */
-	unsigned long   st_task_enter; /* tasklet entered */
-	unsigned long   st_txq_refl_try; /* transmit queue refill attempt */
-	unsigned long   st_rxq_enter; /* receive queue entered */
-	unsigned long   st_rx2tx_tran; /* receive to trasmit transfers */
-	unsigned long   st_rxq_notenter; /*receiveQ not entered, resched */
-	unsigned long   st_rx_frm_egr; /* received from egress path */
-	unsigned long   st_rx_frm_ing; /* received from ingress path */
-	unsigned long   st_rxq_check;
-	unsigned long   st_rxq_rsch;
-	struct sk_buff_head     rq;
-	struct sk_buff_head     tq;
+	struct ifb_private_q	*pq;
 };
 
+/* Number of ifb devices to be set up by this module. */
 static int numifbs = 2;
+module_param(numifbs, int, 0444);
+MODULE_PARM_DESC(numifbs, "Number of ifb devices");
 
-static void ri_tasklet(unsigned long dev);
-static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
-static int ifb_open(struct net_device *dev);
-static int ifb_close(struct net_device *dev);
+/* Number of TX queues per ifb */
+static unsigned int numtxqs = 1;
+module_param(numtxqs, uint, 0444);
+MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
 
-static void ri_tasklet(unsigned long dev)
+static int ifb_thread(void *priv)
 {
-
-	struct net_device *_dev = (struct net_device *)dev;
-	struct ifb_private *dp = netdev_priv(_dev);
-	struct net_device_stats *stats = &_dev->stats;
-	struct netdev_queue *txq;
+	struct ifb_private_q *pq = priv;
+	struct net_device *dev = pq->dev, *_dev;
+	int num = pq - ((struct ifb_private *)netdev_priv(dev))->pq;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
 	struct sk_buff *skb;
-
-	txq = netdev_get_tx_queue(_dev, 0);
-	dp->st_task_enter++;
-	if ((skb = skb_peek(&dp->tq)) == NULL) {
-		dp->st_txq_refl_try++;
-		if (__netif_tx_trylock(txq)) {
-			dp->st_rxq_enter++;
-			while ((skb = skb_dequeue(&dp->rq)) != NULL) {
-				skb_queue_tail(&dp->tq, skb);
-				dp->st_rx2tx_tran++;
-			}
-			__netif_tx_unlock(txq);
-		} else {
-			/* reschedule */
-			dp->st_rxq_notenter++;
-			goto resched;
+	DEFINE_WAIT(wait);
+	struct sk_buff_head tq;
+
+	__skb_queue_head_init(&tq);
+	while (1) {
+		/* move skb from rq to tq */
+		while (1) {
+			prepare_to_wait(&pq->wq, &wait, TASK_UNINTERRUPTIBLE);
+			__netif_tx_lock_bh(txq);
+			skb_queue_splice_tail_init(&pq->rq, &tq);
+			if (netif_queue_stopped(dev))
+				netif_wake_queue(dev);
+			__netif_tx_unlock_bh(txq);
+			if (kthread_should_stop() || !skb_queue_empty(&tq))
+				break;
+			schedule();
 		}
-	}
-
-	while ((skb = skb_dequeue(&dp->tq)) != NULL) {
-		u32 from = G_TC_FROM(skb->tc_verd);
-
-		skb->tc_verd = 0;
-		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
-		stats->tx_packets++;
-		stats->tx_bytes +=skb->len;
-
-		rcu_read_lock();
-		skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
-		if (!skb->dev) {
-			rcu_read_unlock();
-			dev_kfree_skb(skb);
-			stats->tx_dropped++;
+		finish_wait(&pq->wq, &wait);
+		if (kthread_should_stop()) {
+			__skb_queue_purge(&tq);
 			break;
 		}
-		rcu_read_unlock();
-		skb->iif = _dev->ifindex;
-
-		if (from & AT_EGRESS) {
-			dp->st_rx_frm_egr++;
-			dev_queue_xmit(skb);
-		} else if (from & AT_INGRESS) {
-			dp->st_rx_frm_ing++;
-			skb_pull(skb, skb->dev->hard_header_len);
-			netif_rx(skb);
-		} else
-			BUG();
-	}
-
-	if (__netif_tx_trylock(txq)) {
-		dp->st_rxq_check++;
-		if ((skb = skb_peek(&dp->rq)) == NULL) {
-			dp->tasklet_pending = 0;
-			if (netif_queue_stopped(_dev))
-				netif_wake_queue(_dev);
-		} else {
-			dp->st_rxq_rsch++;
-			__netif_tx_unlock(txq);
-			goto resched;
+		if (need_resched())
+			schedule();
+
+		/* transfer packets */
+		while ((skb = __skb_dequeue(&tq)) != NULL) {
+			u32 from = G_TC_FROM(skb->tc_verd);
+	
+			skb->tc_verd = 0;
+			skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
+			txq->tx_packets++;
+			txq->tx_bytes +=skb->len;
+
+			rcu_read_lock();
+			_dev = dev_get_by_index_rcu(&init_net, skb->iif);
+			if (_dev != NULL) {
+				skb->dev = _dev;
+				skb->iif = dev->ifindex;
+				if (from & AT_EGRESS) {
+					dev_queue_xmit(skb);
+				} else if (from & AT_INGRESS) {
+					skb_pull(skb, _dev->hard_header_len);
+					netif_rx_ni(skb);
+				} else {
+					BUG();
+				}
+			} else {
+				dev_kfree_skb(skb);
+				txq->tx_dropped++;
+			}
+			rcu_read_unlock();
 		}
-		__netif_tx_unlock(txq);
-	} else {
-resched:
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
 	}
 
+	return 0;
 }
 
-static const struct net_device_ops ifb_netdev_ops = {
-	.ndo_open	= ifb_open,
-	.ndo_stop	= ifb_close,
-	.ndo_start_xmit	= ifb_xmit,
-	.ndo_validate_addr = eth_validate_addr,
-};
-
-static void ifb_setup(struct net_device *dev)
+struct net_device_stats* ifb_get_stats(struct net_device *dev)
 {
-	/* Initialize the device structure. */
-	dev->destructor = free_netdev;
-	dev->netdev_ops = &ifb_netdev_ops;
+	struct net_device_stats *stats = &dev->stats;
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	struct netdev_queue *txq;
+	int i;
+	unsigned long rx_packets = 0, rx_bytes = 0, rx_dropped = 0;
+	unsigned long tx_packets = 0, tx_bytes = 0, tx_dropped = 0;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		rx_packets += pq[i].rx_packets;
+		rx_bytes += pq[i].rx_bytes;
+		rx_dropped += pq[i].rx_dropped;
+		txq = netdev_get_tx_queue(dev, i);
+		tx_packets += txq->tx_packets;
+		tx_bytes += txq->tx_bytes;
+		tx_dropped += txq->tx_dropped;
+	}
 
-	/* Fill in device structure with ethernet-generic values. */
-	ether_setup(dev);
-	dev->tx_queue_len = TX_Q_LIMIT;
+	stats->rx_packets = rx_packets;
+	stats->rx_bytes = rx_bytes;
+	stats->rx_dropped = rx_dropped;
+	stats->tx_packets = tx_packets;
+	stats->tx_bytes = tx_bytes;
+	stats->tx_dropped = tx_dropped;
 
-	dev->flags |= IFF_NOARP;
-	dev->flags &= ~IFF_MULTICAST;
-	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
-	random_ether_addr(dev->dev_addr);
+	return stats;
 }
 
 static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct ifb_private *dp = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
 	u32 from = G_TC_FROM(skb->tc_verd);
+	int num = skb_get_queue_mapping(skb);
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq + num;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
 
-	stats->rx_packets++;
-	stats->rx_bytes+=skb->len;
+	pq->rx_packets++;
+	pq->rx_bytes += skb->len;
 
 	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->iif) {
 		dev_kfree_skb(skb);
-		stats->rx_dropped++;
+		pq->rx_dropped++;
 		return NETDEV_TX_OK;
 	}
 
-	if (skb_queue_len(&dp->rq) >= dev->tx_queue_len) {
+	txq->trans_start = jiffies;
+	__skb_queue_tail(&pq->rq, skb);
+	if (skb_queue_len(&pq->rq) >= dev->tx_queue_len)
 		netif_stop_queue(dev);
-	}
-
-	dev->trans_start = jiffies;
-	skb_queue_tail(&dp->rq, skb);
-	if (!dp->tasklet_pending) {
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
-	}
+	if (skb_queue_len(&pq->rq) == 1)
+		wake_up(&pq->wq);
 
 	return NETDEV_TX_OK;
 }
@@ -195,26 +196,230 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 static int ifb_close(struct net_device *dev)
 {
 	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	int i;
 
-	tasklet_kill(&dp->ifb_tasklet);
 	netif_stop_queue(dev);
-	skb_queue_purge(&dp->rq);
-	skb_queue_purge(&dp->tq);
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		kthread_stop(pq[i].task);
+		__skb_queue_purge(&pq[i].rq);
+	}
+
 	return 0;
 }
 
 static int ifb_open(struct net_device *dev)
 {
 	struct ifb_private *dp = netdev_priv(dev);
-
-	tasklet_init(&dp->ifb_tasklet, ri_tasklet, (unsigned long)dev);
-	skb_queue_head_init(&dp->rq);
-	skb_queue_head_init(&dp->tq);
+	struct ifb_private_q *pq = dp->pq;
+	int i;
+	
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		pq[i].task = kthread_run(ifb_thread, &pq[i], "%s/%d", dev->name,
+					 i);
+		if (IS_ERR(pq[i].task)) {
+			int err = PTR_ERR(pq[i].task);
+			while (--i >= 0)
+				kthread_stop(pq[i].task);
+			return err;
+		}
+	}
 	netif_start_queue(dev);
 
 	return 0;
 }
 
+static u32 simple_tx_hashrnd;
+
+static inline __be16 pppoe_proto(const struct sk_buff *skb)
+{
+	return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
+			sizeof(struct pppoe_hdr)));
+}
+
+static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	u32 addr1, addr2;
+	u32 hash;
+	union {
+		u16 in16[2];
+		u32 in32;
+	} ports;
+	u8 ip_proto;
+	unsigned int pull_len, proto_pull_len;
+	int ihl;
+
+	if ((hash = skb_rx_queue_recorded(skb))) {
+		while (hash >= dev->real_num_tx_queues)
+			hash -= dev->real_num_tx_queues;
+		return hash;
+	}
+
+	/* act_mirred pushed the skb before calling dev_queue_xmit() */
+	proto_pull_len = 0;
+	pull_len = skb_network_header(skb) - skb->data;
+	if (unlikely(skb_pull(skb, pull_len) == NULL)) {
+		pull_len = 0;
+		goto process_other;
+	}
+
+	ihl = 0;
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_8021Q):
+		if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL))
+			goto process_other;
+		pull_len += VLAN_HLEN;
+		skb->network_header += VLAN_HLEN;
+		proto_pull_len = VLAN_HLEN;
+		switch (vlan_eth_hdr(skb)->h_vlan_encapsulated_proto) {
+		case __constant_htons(ETH_P_IP):
+			goto process_ip;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case __constant_htons(ETH_P_IPV6):
+			goto process_ipv6;
+#endif
+		default:
+			goto process_other;
+		}
+		break;
+	case __constant_htons(ETH_P_PPP_SES):
+		if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL))
+			goto process_other;
+		pull_len += PPPOE_SES_HLEN;
+		skb->network_header += PPPOE_SES_HLEN;
+		proto_pull_len = PPPOE_SES_HLEN;
+		switch (pppoe_proto(skb)) {
+		case __constant_htons(ETH_P_IP):
+			goto process_ip;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case __constant_htons(ETH_P_IPV6):
+			goto process_ipv6;
+#endif
+		default:
+			goto process_other;
+		}
+		break;
+	case __constant_htons(ETH_P_IP):
+process_ip:
+		if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
+			goto process_other;
+		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
+			ip_proto = ip_hdr(skb)->protocol;
+		else
+			ip_proto = 0;
+		addr1 = ip_hdr(skb)->saddr;
+		addr2 = ip_hdr(skb)->daddr;
+		ihl = ip_hdrlen(skb);
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case __constant_htons(ETH_P_IPV6):
+process_ipv6:
+		if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr))))
+			goto process_other;
+		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
+		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
+		ihl = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &ip_proto);
+		if (unlikely(ihl < 0))
+			goto process_other_trans;
+		break;
+#endif
+	default:
+process_other:
+		if (pull_len != 0) {
+			skb_push(skb, pull_len);
+			if (proto_pull_len != 0)
+				skb->network_header -= proto_pull_len;
+		}
+		return skb->protocol % dev->real_num_tx_queues;
+	}
+	if (addr1 > addr2)
+		swap(addr1, addr2);
+
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE:
+		if (unlikely(skb_copy_bits(skb, ihl, &ports.in32, 4) < 0))
+			goto process_other_trans;
+		if (ports.in16[0] > ports.in16[1])
+			swap(ports.in16[0], ports.in16[1]);
+		break;
+	default:
+process_other_trans:
+		ports.in32 = 0;
+		break;
+	}
+
+	if (pull_len != 0) {
+		skb_push(skb, pull_len);
+		if (proto_pull_len != 0)
+			skb->network_header -= proto_pull_len;
+	}
+
+	hash = jhash_3words(addr1, addr2, ports.in32,
+			    simple_tx_hashrnd ^ ip_proto);
+
+	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
+}
+
+static int ifb_init(struct net_device *dev)
+{
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	int i;
+
+	pq = kcalloc(dev->real_num_tx_queues, sizeof(*pq), GFP_KERNEL);
+	if (pq == NULL)
+		return -ENOMEM;
+	dp->pq = pq;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		pq[i].dev = dev;
+		__skb_queue_head_init(&pq[i].rq);
+		init_waitqueue_head(&pq[i].wq);
+	}
+
+	return 0;
+}
+
+static void ifb_uninit(struct net_device *dev)
+{
+	struct ifb_private *dp = netdev_priv(dev);
+
+	kfree(dp->pq);
+}
+
+static const struct net_device_ops ifb_netdev_ops = {
+	.ndo_init		= ifb_init,
+	.ndo_uninit		= ifb_uninit,
+	.ndo_open		= ifb_open,
+	.ndo_stop		= ifb_close,
+	.ndo_start_xmit		= ifb_xmit,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_select_queue	= ifb_select_queue,
+	.ndo_get_stats		= ifb_get_stats,
+};
+
+static void ifb_setup(struct net_device *dev)
+{
+	/* Initialize the device structure. */
+	dev->destructor = free_netdev;
+	dev->netdev_ops = &ifb_netdev_ops;
+
+	/* Fill in device structure with ethernet-generic values. */
+	ether_setup(dev);
+	dev->tx_queue_len = TX_Q_LIMIT;
+
+	dev->flags |= IFF_NOARP;
+	dev->flags &= ~IFF_MULTICAST;
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	random_ether_addr(dev->dev_addr);
+}
+
 static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 {
 	if (tb[IFLA_ADDRESS]) {
@@ -226,25 +431,38 @@ static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
+static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
+			     unsigned int *num_tx_queues,
+			     unsigned int *real_num_tx_queues)
+{
+	if (tb[IFLA_NTXQ]) {
+		*num_tx_queues = nla_get_u32(tb[IFLA_NTXQ]);
+		if (*num_tx_queues < 1)
+			return -EINVAL;
+		*real_num_tx_queues = *num_tx_queues;
+	} else {
+		*num_tx_queues = numtxqs;
+		*real_num_tx_queues = numtxqs;
+	}
+
+	return 0;
+}
+
 static struct rtnl_link_ops ifb_link_ops __read_mostly = {
 	.kind		= "ifb",
-	.priv_size	= sizeof(struct ifb_private),
 	.setup		= ifb_setup,
 	.validate	= ifb_validate,
+	.get_tx_queues	= ifb_get_tx_queues,
+	.priv_size	= sizeof(struct ifb_private),
 };
 
-/* Number of ifb devices to be set up by this module. */
-module_param(numifbs, int, 0);
-MODULE_PARM_DESC(numifbs, "Number of ifb devices");
-
 static int __init ifb_init_one(int index)
 {
 	struct net_device *dev_ifb;
 	int err;
 
-	dev_ifb = alloc_netdev(sizeof(struct ifb_private),
-				 "ifb%d", ifb_setup);
-
+	dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private), "ifb%d",
+				  ifb_setup, numtxqs);
 	if (!dev_ifb)
 		return -ENOMEM;
 
@@ -268,9 +486,12 @@ static int __init ifb_init_module(void)
 {
 	int i, err;
 
+	if (numtxqs < 1)
+		return -EINVAL;
+
+	get_random_bytes(&simple_tx_hashrnd, 4);
 	rtnl_lock();
 	err = __rtnl_link_register(&ifb_link_ops);
-
 	for (i = 0; i < numifbs && !err; i++)
 		err = ifb_init_one(i);
 	if (err)
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 1d3b242..99da411 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -78,6 +78,7 @@ enum {
 #define IFLA_LINKINFO IFLA_LINKINFO
 	IFLA_NET_NS_PID,
 	IFLA_IFALIAS,
+	IFLA_NTXQ,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 33148a5..1cbe555 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -597,6 +597,7 @@ static inline size_t if_nlmsg_size(const struct net_device *dev)
 	       + nla_total_size(4) /* IFLA_MASTER */
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
 	       + nla_total_size(1) /* IFLA_LINKMODE */
+	       + nla_total_size(4) /* IFLA_NTXQ */
 	       + rtnl_link_get_size(dev); /* IFLA_LINKINFO */
 }
 
@@ -627,6 +628,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 		   netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
 	NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
 	NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+	NLA_PUT_U32(skb, IFLA_NTXQ, dev->real_num_tx_queues);
 
 	if (dev->ifindex != dev->iflink)
 		NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
@@ -725,6 +727,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
 	[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },
+	[IFLA_NTXQ]		= { .type = NLA_U32 },
 };
 EXPORT_SYMBOL(ifla_policy);
 



^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH] ifb: add multi-queue support
  2009-11-12  9:44 ` Changli Gao
  2009-11-12  9:48   ` Changli Gao
  2009-11-12 12:48   ` Eric Dumazet
@ 2009-11-13  4:37   ` Changli Gao
  2009-11-16 16:39     ` Stephen Hemminger
  2 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-13  4:37 UTC (permalink / raw)
  To: David S. Miller, Stephen Hemminger, Patrick McHardy
  Cc: xiaosuo, Eric Dumazet, Tom Herbert, netdev

ifb: add multi-queue support

Add multi-queue support; one kernel thread is created per queue.
It can be used to emulate a multi-queue NIC in software and distribute
work among CPUs.
gentux linux # modprobe ifb numtxqs=2
gentux linux # ifconfig ifb0 up
gentux linux # pgrep ifb0
18508
18509
gentux linux # taskset -p 1 18508
pid 18508's current affinity mask: 3
pid 18508's new affinity mask: 1
gentux linux # taskset -p 2 18509
pid 18509's current affinity mask: 3
pid 18509's new affinity mask: 2
gentux linux # tc qdisc add dev br0 ingress
gentux linux # tc filter add dev br0 parent ffff: protocol ip basic
action mirred egress redirect dev ifb0

This patch also introduces an ip link option "numtxqs" for specifying the
number of TX queues, so you can add a new ifb with the command:

ip link add numtxqs 4 type ifb

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
drivers/net/ifb.c | 491 ++++++++++++++++++++++++++++++++++--------------
include/linux/if_link.h | 1
net/core/rtnetlink.c | 3
3 files changed, 360 insertions(+), 135 deletions(-)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 69c2566..96ab20a 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -33,161 +33,162 @@
#include <linux/etherdevice.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
#include <net/pkt_sched.h>
#include <net/net_namespace.h>

-#define TX_TIMEOUT (2*HZ)
-
#define TX_Q_LIMIT 32
+
+struct ifb_private_q {
+ struct net_device *dev;
+ struct sk_buff_head rq;
+ wait_queue_head_t wq;
+ struct task_struct *task;
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long rx_dropped;
+} ____cacheline_aligned_in_smp;
+
struct ifb_private {
- struct tasklet_struct ifb_tasklet;
- int tasklet_pending;
- /* mostly debug stats leave in for now */
- unsigned long st_task_enter; /* tasklet entered */
- unsigned long st_txq_refl_try; /* transmit queue refill attempt */
- unsigned long st_rxq_enter; /* receive queue entered */
- unsigned long st_rx2tx_tran; /* receive to trasmit transfers */
- unsigned long st_rxq_notenter; /*receiveQ not entered, resched */
- unsigned long st_rx_frm_egr; /* received from egress path */
- unsigned long st_rx_frm_ing; /* received from ingress path */
- unsigned long st_rxq_check;
- unsigned long st_rxq_rsch;
- struct sk_buff_head rq;
- struct sk_buff_head tq;
+ struct ifb_private_q *pq;
};

+/* Number of ifb devices to be set up by this module. */
static int numifbs = 2;
+module_param(numifbs, int, 0444);
+MODULE_PARM_DESC(numifbs, "Number of ifb devices");

-static void ri_tasklet(unsigned long dev);
-static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
-static int ifb_open(struct net_device *dev);
-static int ifb_close(struct net_device *dev);
+/* Number of TX queues per ifb */
+static unsigned int numtxqs = 1;
+module_param(numtxqs, uint, 0444);
+MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");

-static void ri_tasklet(unsigned long dev)
+static int ifb_thread(void *priv)
{
-
- struct net_device *_dev = (struct net_device *)dev;
- struct ifb_private *dp = netdev_priv(_dev);
- struct net_device_stats *stats = &_dev->stats;
- struct netdev_queue *txq;
+ struct ifb_private_q *pq = priv;
+ struct net_device *dev = pq->dev, *_dev;
+ int num = pq - ((struct ifb_private *)netdev_priv(dev))->pq;
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
struct sk_buff *skb;
-
- txq = netdev_get_tx_queue(_dev, 0);
- dp->st_task_enter++;
- if ((skb = skb_peek(&dp->tq)) == NULL) {
- dp->st_txq_refl_try++;
- if (__netif_tx_trylock(txq)) {
- dp->st_rxq_enter++;
- while ((skb = skb_dequeue(&dp->rq)) != NULL) {
- skb_queue_tail(&dp->tq, skb);
- dp->st_rx2tx_tran++;
- }
- __netif_tx_unlock(txq);
- } else {
- /* reschedule */
- dp->st_rxq_notenter++;
- goto resched;
+ DEFINE_WAIT(wait);
+ struct sk_buff_head tq;
+
+ __skb_queue_head_init(&tq);
+ while (1) {
+ /* move skb from rq to tq */
+ while (1) {
+ prepare_to_wait(&pq->wq, &wait, TASK_UNINTERRUPTIBLE);
+ __netif_tx_lock_bh(txq);
+ skb_queue_splice_tail_init(&pq->rq, &tq);
+ if (netif_queue_stopped(dev))
+ netif_wake_queue(dev);
+ __netif_tx_unlock_bh(txq);
+ if (kthread_should_stop() || !skb_queue_empty(&tq))
+ break;
+ schedule();
}
- }
-
- while ((skb = skb_dequeue(&dp->tq)) != NULL) {
- u32 from = G_TC_FROM(skb->tc_verd);
-
- skb->tc_verd = 0;
- skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
- stats->tx_packets++;
- stats->tx_bytes +=skb->len;
-
- rcu_read_lock();
- skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
- if (!skb->dev) {
- rcu_read_unlock();
- dev_kfree_skb(skb);
- stats->tx_dropped++;
+ finish_wait(&pq->wq, &wait);
+ if (kthread_should_stop()) {
+ __skb_queue_purge(&tq);
break;
}
- rcu_read_unlock();
- skb->iif = _dev->ifindex;
-
- if (from & AT_EGRESS) {
- dp->st_rx_frm_egr++;
- dev_queue_xmit(skb);
- } else if (from & AT_INGRESS) {
- dp->st_rx_frm_ing++;
- skb_pull(skb, skb->dev->hard_header_len);
- netif_rx(skb);
- } else
- BUG();
- }
-
- if (__netif_tx_trylock(txq)) {
- dp->st_rxq_check++;
- if ((skb = skb_peek(&dp->rq)) == NULL) {
- dp->tasklet_pending = 0;
- if (netif_queue_stopped(_dev))
- netif_wake_queue(_dev);
- } else {
- dp->st_rxq_rsch++;
- __netif_tx_unlock(txq);
- goto resched;
+ if (need_resched())
+ schedule();
+
+ /* transfer packets */
+ while ((skb = __skb_dequeue(&tq)) != NULL) {
+ u32 from = G_TC_FROM(skb->tc_verd);
+
+ skb->tc_verd = 0;
+ skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
+ txq->tx_packets++;
+ txq->tx_bytes +=skb->len;
+
+ rcu_read_lock();
+ _dev = dev_get_by_index_rcu(&init_net, skb->iif);
+ if (_dev != NULL) {
+ skb->dev = _dev;
+ skb->iif = dev->ifindex;
+ if (from & AT_EGRESS) {
+ dev_queue_xmit(skb);
+ } else if (from & AT_INGRESS) {
+ skb_pull(skb, _dev->hard_header_len);
+ netif_rx_ni(skb);
+ } else {
+ BUG();
+ }
+ } else {
+ dev_kfree_skb(skb);
+ txq->tx_dropped++;
+ }
+ rcu_read_unlock();
}
- __netif_tx_unlock(txq);
- } else {
-resched:
- dp->tasklet_pending = 1;
- tasklet_schedule(&dp->ifb_tasklet);
}

+ return 0;
}

-static const struct net_device_ops ifb_netdev_ops = {
- .ndo_open = ifb_open,
- .ndo_stop = ifb_close,
- .ndo_start_xmit = ifb_xmit,
- .ndo_validate_addr = eth_validate_addr,
-};
-
-static void ifb_setup(struct net_device *dev)
+struct net_device_stats* ifb_get_stats(struct net_device *dev)
{
- /* Initialize the device structure. */
- dev->destructor = free_netdev;
- dev->netdev_ops = &ifb_netdev_ops;
+ struct net_device_stats *stats = &dev->stats;
+ struct ifb_private *dp = netdev_priv(dev);
+ struct ifb_private_q *pq = dp->pq;
+ struct netdev_queue *txq;
+ int i;
+ unsigned long rx_packets = 0, rx_bytes = 0, rx_dropped = 0;
+ unsigned long tx_packets = 0, tx_bytes = 0, tx_dropped = 0;
+
+ for (i = 0; i < dev->real_num_tx_queues; i++) {
+ rx_packets += pq[i].rx_packets;
+ rx_bytes += pq[i].rx_bytes;
+ rx_dropped += pq[i].rx_dropped;
+ txq = netdev_get_tx_queue(dev, i);
+ tx_packets += txq->tx_packets;
+ tx_bytes += txq->tx_bytes;
+ tx_dropped += txq->tx_dropped;
+ }

- /* Fill in device structure with ethernet-generic values. */
- ether_setup(dev);
- dev->tx_queue_len = TX_Q_LIMIT;
+ stats->rx_packets = rx_packets;
+ stats->rx_bytes = rx_bytes;
+ stats->rx_dropped = rx_dropped;
+ stats->tx_packets = tx_packets;
+ stats->tx_bytes = tx_bytes;
+ stats->tx_dropped = tx_dropped;

- dev->flags |= IFF_NOARP;
- dev->flags &= ~IFF_MULTICAST;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
- random_ether_addr(dev->dev_addr);
+ return stats;
}

static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
{
- struct ifb_private *dp = netdev_priv(dev);
- struct net_device_stats *stats = &dev->stats;
u32 from = G_TC_FROM(skb->tc_verd);
+ int num = skb_get_queue_mapping(skb);
+ struct ifb_private *dp = netdev_priv(dev);
+ struct ifb_private_q *pq = dp->pq + num;
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, num);

- stats->rx_packets++;
- stats->rx_bytes+=skb->len;
+ pq->rx_packets++;
+ pq->rx_bytes += skb->len;

if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->iif) {
dev_kfree_skb(skb);
- stats->rx_dropped++;
+ pq->rx_dropped++;
return NETDEV_TX_OK;
}

- if (skb_queue_len(&dp->rq) >= dev->tx_queue_len) {
+ txq->trans_start = jiffies;
+ __skb_queue_tail(&pq->rq, skb);
+ if (skb_queue_len(&pq->rq) >= dev->tx_queue_len)
netif_stop_queue(dev);
- }
-
- dev->trans_start = jiffies;
- skb_queue_tail(&dp->rq, skb);
- if (!dp->tasklet_pending) {
- dp->tasklet_pending = 1;
- tasklet_schedule(&dp->ifb_tasklet);
- }
+ if (skb_queue_len(&pq->rq) == 1)
+ wake_up(&pq->wq);

return NETDEV_TX_OK;
}
@@ -195,26 +196,230 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb,
struct net_device *dev)
static int ifb_close(struct net_device *dev)
{
struct ifb_private *dp = netdev_priv(dev);
+ struct ifb_private_q *pq = dp->pq;
+ int i;

- tasklet_kill(&dp->ifb_tasklet);
netif_stop_queue(dev);
- skb_queue_purge(&dp->rq);
- skb_queue_purge(&dp->tq);
+ for (i = 0; i < dev->real_num_tx_queues; i++) {
+ kthread_stop(pq[i].task);
+ __skb_queue_purge(&pq[i].rq);
+ }
+
return 0;
}

static int ifb_open(struct net_device *dev)
{
struct ifb_private *dp = netdev_priv(dev);
-
- tasklet_init(&dp->ifb_tasklet, ri_tasklet, (unsigned long)dev);
- skb_queue_head_init(&dp->rq);
- skb_queue_head_init(&dp->tq);
+ struct ifb_private_q *pq = dp->pq;
+ int i;
+
+ for (i = 0; i < dev->real_num_tx_queues; i++) {
+ pq[i].task = kthread_run(ifb_thread, &pq[i], "%s/%d", dev->name,
+ i);
+ if (IS_ERR(pq[i].task)) {
+ int err = PTR_ERR(pq[i].task);
+ while (--i >= 0)
+ kthread_stop(pq[i].task);
+ return err;
+ }
+ }
netif_start_queue(dev);

return 0;
}

+static u32 simple_tx_hashrnd;
+
+static inline __be16 pppoe_proto(const struct sk_buff *skb)
+{
+ return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
+ sizeof(struct pppoe_hdr)));
+}
+
+static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+ u32 addr1, addr2;
+ u32 hash;
+ union {
+ u16 in16[2];
+ u32 in32;
+ } ports;
+ u8 ip_proto;
+ unsigned int pull_len, proto_pull_len;
+ int ihl;
+
+ if ((hash = skb_rx_queue_recorded(skb))) {
+ while (hash >= dev->real_num_tx_queues)
+ hash -= dev->real_num_tx_queues;
+ return hash;
+ }
+
+ /* act_mirred pushed the skb before calling dev_queue_xmit() */
+ proto_pull_len = 0;
+ pull_len = skb_network_header(skb) - skb->data;
+ if (unlikely(skb_pull(skb, pull_len) == NULL)) {
+ pull_len = 0;
+ goto process_other;
+ }
+
+ ihl = 0;
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_8021Q):
+ if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL))
+ goto process_other;
+ pull_len += VLAN_HLEN;
+ skb->network_header += VLAN_HLEN;
+ proto_pull_len = VLAN_HLEN;
+ switch (vlan_eth_hdr(skb)->h_vlan_encapsulated_proto) {
+ case __constant_htons(ETH_P_IP):
+ goto process_ip;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ case __constant_htons(ETH_P_IPV6):
+ goto process_ipv6;
+#endif
+ default:
+ goto process_other;
+ }
+ break;
+ case __constant_htons(ETH_P_PPP_SES):
+ if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL))
+ goto process_other;
+ pull_len += PPPOE_SES_HLEN;
+ skb->network_header += PPPOE_SES_HLEN;
+ proto_pull_len = PPPOE_SES_HLEN;
+ switch (pppoe_proto(skb)) {
+ case __constant_htons(ETH_P_IP):
+ goto process_ip;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ case __constant_htons(ETH_P_IPV6):
+ goto process_ipv6;
+#endif
+ default:
+ goto process_other;
+ }
+ break;
+ case __constant_htons(ETH_P_IP):
+process_ip:
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
+ goto process_other;
+ if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
+ ip_proto = ip_hdr(skb)->protocol;
+ else
+ ip_proto = 0;
+ addr1 = ip_hdr(skb)->saddr;
+ addr2 = ip_hdr(skb)->daddr;
+ ihl = ip_hdrlen(skb);
+ break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ case __constant_htons(ETH_P_IPV6):
+process_ipv6:
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr))))
+ goto process_other;
+ addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
+ addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
+ ihl = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &ip_proto);
+ if (unlikely(ihl < 0))
+ goto process_other_trans;
+ break;
+#endif
+ default:
+process_other:
+ if (pull_len != 0) {
+ skb_push(skb, pull_len);
+ if (proto_pull_len != 0)
+ skb->network_header -= proto_pull_len;
+ }
+ return skb->protocol % dev->real_num_tx_queues;
+ }
+ if (addr1 > addr2)
+ swap(addr1, addr2);
+
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ if (unlikely(skb_copy_bits(skb, ihl, &ports.in32, 4) < 0))
+ goto process_other_trans;
+ if (ports.in16[0] > ports.in16[1])
+ swap(ports.in16[0], ports.in16[1]);
+ break;
+ default:
+process_other_trans:
+ ports.in32 = 0;
+ break;
+ }
+
+ if (pull_len != 0) {
+ skb_push(skb, pull_len);
+ if (proto_pull_len != 0)
+ skb->network_header -= proto_pull_len;
+ }
+
+ hash = jhash_3words(addr1, addr2, ports.in32,
+ simple_tx_hashrnd ^ ip_proto);
+
+ return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
+}
+
+static int ifb_init(struct net_device *dev)
+{
+ struct ifb_private *dp = netdev_priv(dev);
+ struct ifb_private_q *pq = dp->pq;
+ int i;
+
+ pq = kcalloc(dev->real_num_tx_queues, sizeof(*pq), GFP_KERNEL);
+ if (pq == NULL)
+ return -ENOMEM;
+ dp->pq = pq;
+
+ for (i = 0; i < dev->real_num_tx_queues; i++) {
+ pq[i].dev = dev;
+ __skb_queue_head_init(&pq[i].rq);
+ init_waitqueue_head(&pq[i].wq);
+ }
+
+ return 0;
+}
+
+static void ifb_uninit(struct net_device *dev)
+{
+ struct ifb_private *dp = netdev_priv(dev);
+
+ kfree(dp->pq);
+}
+
+static const struct net_device_ops ifb_netdev_ops = {
+ .ndo_init = ifb_init,
+ .ndo_uninit = ifb_uninit,
+ .ndo_open = ifb_open,
+ .ndo_stop = ifb_close,
+ .ndo_start_xmit = ifb_xmit,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_select_queue = ifb_select_queue,
+ .ndo_get_stats = ifb_get_stats,
+};
+
+static void ifb_setup(struct net_device *dev)
+{
+ /* Initialize the device structure. */
+ dev->destructor = free_netdev;
+ dev->netdev_ops = &ifb_netdev_ops;
+
+ /* Fill in device structure with ethernet-generic values. */
+ ether_setup(dev);
+ dev->tx_queue_len = TX_Q_LIMIT;
+
+ dev->flags |= IFF_NOARP;
+ dev->flags &= ~IFF_MULTICAST;
+ dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ random_ether_addr(dev->dev_addr);
+}
+
static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
{
if (tb[IFLA_ADDRESS]) {
@@ -226,25 +431,38 @@ static int ifb_validate(struct nlattr *tb[],
struct nlattr *data[])
return 0;
}

+static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
+ unsigned int *num_tx_queues,
+ unsigned int *real_num_tx_queues)
+{
+ if (tb[IFLA_NTXQ]) {
+ *num_tx_queues = nla_get_u32(tb[IFLA_NTXQ]);
+ if (*num_tx_queues < 1)
+ return -EINVAL;
+ *real_num_tx_queues = *num_tx_queues;
+ } else {
+ *num_tx_queues = numtxqs;
+ *real_num_tx_queues = numtxqs;
+ }
+
+ return 0;
+}
+
static struct rtnl_link_ops ifb_link_ops __read_mostly = {
.kind = "ifb",
- .priv_size = sizeof(struct ifb_private),
.setup = ifb_setup,
.validate = ifb_validate,
+ .get_tx_queues = ifb_get_tx_queues,
+ .priv_size = sizeof(struct ifb_private),
};

-/* Number of ifb devices to be set up by this module. */
-module_param(numifbs, int, 0);
-MODULE_PARM_DESC(numifbs, "Number of ifb devices");
-
static int __init ifb_init_one(int index)
{
struct net_device *dev_ifb;
int err;

- dev_ifb = alloc_netdev(sizeof(struct ifb_private),
- "ifb%d", ifb_setup);
-
+ dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private), "ifb%d",
+ ifb_setup, numtxqs);
if (!dev_ifb)
return -ENOMEM;

@@ -268,9 +486,12 @@ static int __init ifb_init_module(void)
{
int i, err;

+ if (numtxqs < 1)
+ return -EINVAL;
+
+ get_random_bytes(&simple_tx_hashrnd, 4);
rtnl_lock();
err = __rtnl_link_register(&ifb_link_ops);
-
for (i = 0; i < numifbs && !err; i++)
err = ifb_init_one(i);
if (err)
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 1d3b242..99da411 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -78,6 +78,7 @@ enum {
#define IFLA_LINKINFO IFLA_LINKINFO
IFLA_NET_NS_PID,
IFLA_IFALIAS,
+ IFLA_NTXQ,
__IFLA_MAX
};

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 33148a5..1cbe555 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -597,6 +597,7 @@ static inline size_t if_nlmsg_size(const struct
net_device *dev)
+ nla_total_size(4) /* IFLA_MASTER */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ + nla_total_size(4) /* IFLA_NTXQ */
+ rtnl_link_get_size(dev); /* IFLA_LINKINFO */
}

@@ -627,6 +628,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
struct net_device *dev,
netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+ NLA_PUT_U32(skb, IFLA_NTXQ, dev->real_num_tx_queues);

if (dev->ifindex != dev->iflink)
NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
@@ -725,6 +727,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_LINKINFO] = { .type = NLA_NESTED },
[IFLA_NET_NS_PID] = { .type = NLA_U32 },
[IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 },
+ [IFLA_NTXQ] = { .type = NLA_U32 },
};
EXPORT_SYMBOL(ifla_policy);


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-12 15:11     ` Patrick McHardy
@ 2009-11-13  1:32       ` Changli Gao
  2009-11-13  7:18         ` Patrick McHardy
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-13  1:32 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert, netdev

On Thu, Nov 12, 2009 at 11:11 PM, Patrick McHardy <kaber@trash.net> wrote:
> Changli Gao wrote:
>> The corresponding iproute2 patch is attached.
>
> Printing the value during dumps is missing from this patch.
>

I don't know the rules about adding more dumps. Can I add it at the
end of the corresponding line?

ifb0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN qlen 32 ntxqs 4

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-12 15:10     ` Patrick McHardy
@ 2009-11-13  1:28       ` Changli Gao
  0 siblings, 0 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-13  1:28 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert, netdev

On Thu, Nov 12, 2009 at 11:10 PM, Patrick McHardy <kaber@trash.net> wrote:
> Changli Gao wrote:
>>>> +                     rcu_read_lock();
>>>> +                     skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
>>>> +                     if (!skb->dev) {
>>>> +                             rcu_read_unlock();
>>>> +                             dev_kfree_skb(skb);
>>>> +                             txq->tx_dropped++;
>>>> +                             break;
>>>> +                     }
>>>> +                     rcu_read_unlock();
>>>> +                     skb->iif = dev->ifindex;
>>> What protects the device from disappearing here and below during
>>> dev_queue_xmit() and netif_rx_ni()?
>>
>> For dev_queue_xmit(), dev is held by skb->_dst, so there is no
>> problem. But for netif_rx_ni(), I don't know how to prevent the device
>> from disappearing, and it seems that all the NIC drivers have this problem.
>> Maybe there was an assumption about the execution context of
>> netif_rx() before. Now softirq can't be executed by softirqd, so the
>> packet receiving path may be interleaved. I don't know how to prevent
>> it from happening.
>
> You can either take a reference or move the rcu_read_unlock()
> after the skb submission.

good idea!

>
>>>> +             break;
>>>> +     case __constant_htons(ETH_P_IPV6):
>>>> +             ip_proto = ipv6_hdr(skb)->nexthdr;
>>>> +             addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
>>>> +             addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
>>>> +             ihl = 10;
>>> Where does 10 come from?
>>
>> It should be 40. After reviewing IPv6, I found that I need to loop
>> until I find the right protocol value.
>
> There is a helper for this (ipv6_skip_exthdr).
>

It is a little late. Thanks all the same, and I'll use it.

>>>> +static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
>>>> +                          unsigned int *num_tx_queues,
>>>> +                          unsigned int *real_num_tx_queues)
>>>> +{
>>>> +     if (tb[IFLA_NTXQ]) {
>>>> +             *num_tx_queues = nla_get_u16(tb[IFLA_NTXQ]);
>>> We currently use unsigned ints for the queue number, so please
>>> use an u32 for the attribute as well.
>>>
>>>> +             *real_num_tx_queues = *num_tx_queues;
>>>> +     } else {
>>>> +             *num_tx_queues = numtxqs;
>>>> +             *real_num_tx_queues = numtxqs;
>>>> +     }
>>>> +
>>>> +     return 0;
>>>> +}
>>>> +
>>         u16                     (*ndo_select_queue)(struct net_device *dev,
>>                                                     struct sk_buff *skb);
>> use u16 as the return value so ..., and I think u16 is big enough. If
>> you insist on this, I'll use u32 instead.
>
> Well, get_tx_queues() uses unsigned int, as does struct net_device.
> I agree that it probably won't ever be needed, but there's no harm
> in using the correct type, the attribute encoding doesn't even need
> more room.
>

So, in the last patch, I use it instead. :)

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-12 12:48   ` Eric Dumazet
@ 2009-11-13  1:26     ` Changli Gao
  2009-11-13  5:56       ` Eric Dumazet
  0 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-13  1:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Stephen Hemminger, Patrick McHardy, Tom Herbert, netdev

2009/11/12 Eric Dumazet <eric.dumazet@gmail.com>:
>
> I believe this patch is fine, but maybe Jarek concern about workqueue
> vs tasklet should be addressed...
>
> We could use the previous handling in case numtxqs==1 , ie use a tasklet
> instead of a work queue ?

I don't think it is a good idea. If we do so, the code will be messy,
and lose the flexibility of process scheduling. In fact, latency isn't a
problem when system load isn't high, and when system load is high (due to
too many NIC IRQs), throughput and interactivity are more important, and
the current Linux networking subsystem already does so through the
ksoftirqd mechanism.

>
>
> Sorry for this late idea...
>



-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-12  9:48   ` Changli Gao
@ 2009-11-12 15:11     ` Patrick McHardy
  2009-11-13  1:32       ` Changli Gao
  0 siblings, 1 reply; 62+ messages in thread
From: Patrick McHardy @ 2009-11-12 15:11 UTC (permalink / raw)
  To: Changli Gao
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert, netdev

Changli Gao wrote:
> The corresponding iproute2 patch is attached.

Printing the value during dumps is missing from this patch.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-12  3:12   ` Changli Gao
  2009-11-12  8:52     ` Jarek Poplawski
@ 2009-11-12 15:10     ` Patrick McHardy
  2009-11-13  1:28       ` Changli Gao
  1 sibling, 1 reply; 62+ messages in thread
From: Patrick McHardy @ 2009-11-12 15:10 UTC (permalink / raw)
  To: Changli Gao
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert, netdev

Changli Gao wrote:
> On Wed, Nov 11, 2009 at 11:59 PM, Patrick McHardy <kaber@trash.net> wrote:
>>> +static int numtxqs = 1;
>>> +module_param(numtxqs, int, 0444);
>>> +MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
>> unsigned?
> 
> Yea, unsigned is better, and I need to check whether its value is
> smaller than 1 somewhere. The same will be done with IFLA_NTXQ.

Good point.

>>> +                     rcu_read_lock();
>>> +                     skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
>>> +                     if (!skb->dev) {
>>> +                             rcu_read_unlock();
>>> +                             dev_kfree_skb(skb);
>>> +                             txq->tx_dropped++;
>>> +                             break;
>>> +                     }
>>> +                     rcu_read_unlock();
>>> +                     skb->iif = dev->ifindex;
>> What protects the device from disappearing here and below during
>> dev_queue_xmit() and netif_rx_ni()?
> 
> For dev_queue_xmit(), dev is held by skb->_dst, so there is no
> problem. But for netif_rx_ni(), I don't know how to prevent the device
> from disappearing, and it seems that all the NIC drivers have this problem.
> Maybe there was an assumption about the execution context of
> netif_rx() before. Now softirq can't be executed by softirqd, so the
> packet receiving path may be interleaved. I don't know how to prevent
> it from happening.

You can either take a reference or move the rcu_read_unlock()
after the skb submission.
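
A minimal sketch of the second option, assuming the ifb_thread() context
from the patch (illustrative, not a tested change):

	rcu_read_lock();
	_dev = dev_get_by_index_rcu(&init_net, skb->iif);
	if (_dev != NULL) {
		skb->dev = _dev;
		skb->iif = dev->ifindex;
		if (from & AT_EGRESS) {
			dev_queue_xmit(skb);
		} else {
			skb_pull(skb, _dev->hard_header_len);
			netif_rx_ni(skb);
		}
	} else {
		dev_kfree_skb(skb);
		txq->tx_dropped++;
	}
	rcu_read_unlock();	/* _dev is guaranteed alive up to here */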

>>> +             break;
>>> +     case __constant_htons(ETH_P_IPV6):
>>> +             ip_proto = ipv6_hdr(skb)->nexthdr;
>>> +             addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
>>> +             addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
>>> +             ihl = 10;
>> Where does 10 come from?
> 
> It should be 40. After reviewing IPv6, I found that I need to loop
> until I find the right protocol value.

There is a helper for this (ipv6_skip_exthdr).
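
For example, a sketch of how it could be used here (initializing the
in/out nexthdr argument first):

	ip_proto = ipv6_hdr(skb)->nexthdr;
	ihl = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &ip_proto);
	if (ihl < 0)
		goto process_other_trans;	/* no parsable L4 header */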

>>> +static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
>>> +                          unsigned int *num_tx_queues,
>>> +                          unsigned int *real_num_tx_queues)
>>> +{
>>> +     if (tb[IFLA_NTXQ]) {
>>> +             *num_tx_queues = nla_get_u16(tb[IFLA_NTXQ]);
>> We currently use unsigned ints for the queue number, so please
>> use an u32 for the attribute as well.
>>
>>> +             *real_num_tx_queues = *num_tx_queues;
>>> +     } else {
>>> +             *num_tx_queues = numtxqs;
>>> +             *real_num_tx_queues = numtxqs;
>>> +     }
>>> +
>>> +     return 0;
>>> +}
>>> +
>         u16                     (*ndo_select_queue)(struct net_device *dev,
>                                                     struct sk_buff *skb);
> use u16 as the return value so ..., and I think u16 is big enough. If
> you insist on this, I'll use u32 instead.

Well, get_tx_queues() uses unsigned int, as does struct net_device.
I agree that it probably won't ever be needed, but there's no harm
in using the correct type; the attribute encoding doesn't even need
more room.
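
Concretely, the attribute stays 4 bytes on the wire either way. From the
patches in this thread:

	/* iproute2 side */
	addattr_l(&req->n, sizeof(*req), IFLA_NTXQ, &numtxqs, 4);

	/* kernel side */
	*num_tx_queues = nla_get_u32(tb[IFLA_NTXQ]);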

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-12  9:44 ` Changli Gao
  2009-11-12  9:48   ` Changli Gao
@ 2009-11-12 12:48   ` Eric Dumazet
  2009-11-13  1:26     ` Changli Gao
  2009-11-13  4:37   ` Changli Gao
  2 siblings, 1 reply; 62+ messages in thread
From: Eric Dumazet @ 2009-11-12 12:48 UTC (permalink / raw)
  To: xiaosuo
  Cc: David S. Miller, Stephen Hemminger, Patrick McHardy, Tom Herbert, netdev

Changli Gao wrote:
> ifb: add multi-queue support
> 
> Add multi-queue support; one kernel thread is created per queue.
> It can be used to emulate a multi-queue NIC in software and distribute
> work among CPUs.
> gentux linux # modprobe ifb numtxqs=2
> gentux linux # ifconfig ifb0 up
> gentux linux # pgrep ifb0
> 18508
> 18509
> gentux linux # taskset -p 1 18508
> pid 18508's current affinity mask: 3
> pid 18508's new affinity mask: 1
> gentux linux # taskset -p 2 18509
> pid 18509's current affinity mask: 3
> pid 18509's new affinity mask: 2
> gentux linux # tc qdisc add dev br0 ingress
> gentux linux # tc filter add dev br0 parent ffff: protocol ip basic
> action mirred egress redirect dev ifb0
> 
> This patch also introduces an ip link option "numtxqs" for specifying the
> number of TX queues, so you can add a new ifb with the command:
> 
> ip link add numtxqs 4 type ifb
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----

I believe this patch is fine, but maybe Jarek's concern about workqueue
vs tasklet should be addressed...

We could use the previous handling in case numtxqs==1, i.e. use a tasklet
instead of a work queue?
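
A rough sketch of that idea (hypothetical shape, not code from any posted
patch):

	/* in ifb_xmit(), after queueing the skb on the receive queue: */
	if (dev->real_num_tx_queues == 1)
		tasklet_schedule(&dp->ifb_tasklet);	/* legacy path */
	else
		wake_up(&pq->wq);			/* kthread path */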


Sorry for this late idea...

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-12  9:44 ` Changli Gao
@ 2009-11-12  9:48   ` Changli Gao
  2009-11-12 15:11     ` Patrick McHardy
  2009-11-12 12:48   ` Eric Dumazet
  2009-11-13  4:37   ` Changli Gao
  2 siblings, 1 reply; 62+ messages in thread
From: Changli Gao @ 2009-11-12  9:48 UTC (permalink / raw)
  To: David S. Miller, Stephen Hemminger, Patrick McHardy
  Cc: xiaosuo, Eric Dumazet, Tom Herbert, netdev

[-- Attachment #1: Type: text/plain, Size: 96 bytes --]

The corresponding iproute2 patch is attached.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

[-- Attachment #2: iproute2-support-numtxqs.diff --]
[-- Type: application/octet-stream, Size: 1214 bytes --]

--- ip/iplink.c.orig	2009-11-11 16:56:41.000000000 +0800
+++ ip/iplink.c	2009-11-12 14:57:25.000000000 +0800
@@ -48,6 +48,7 @@
 		fprintf(stderr, "                   [ address LLADDR ]\n");
 		fprintf(stderr, "                   [ broadcast LLADDR ]\n");
 		fprintf(stderr, "                   [ mtu MTU ]\n");
+		fprintf(stderr, "                   [ numtxqs NUMTXQS ]\n");
 		fprintf(stderr, "                   type TYPE [ ARGS ]\n");
 		fprintf(stderr, "       ip link delete DEV type TYPE [ ARGS ]\n");
 		fprintf(stderr, "\n");
@@ -180,6 +181,7 @@
 	int qlen = -1;
 	int mtu = -1;
 	int netns = -1;
+	int numtxqs = -1;
 
 	ret = argc;
 
@@ -221,6 +223,13 @@
 			if (get_integer(&mtu, *argv, 0))
 				invarg("Invalid \"mtu\" value\n", *argv);
 			addattr_l(&req->n, sizeof(*req), IFLA_MTU, &mtu, 4);
+		} else if (strcmp(*argv, "numtxqs") == 0) {
+			NEXT_ARG();
+			if (numtxqs != -1)
+				duparg("numtxqs", *argv);
+			if (get_integer(&numtxqs, *argv, 0))
+				invarg("Invalid \"numtxqs\" value\n", *argv);
+			addattr_l(&req->n, sizeof(*req), IFLA_NTXQ, &numtxqs, 4);
                 } else if (strcmp(*argv, "netns") == 0) {
                         NEXT_ARG();
                         if (netns != -1)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH] ifb: add multi-queue support
  2009-11-11  9:51 Changli Gao
                   ` (2 preceding siblings ...)
  2009-11-11 15:59 ` Patrick McHardy
@ 2009-11-12  9:44 ` Changli Gao
  2009-11-12  9:48   ` Changli Gao
                     ` (2 more replies)
  3 siblings, 3 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-12  9:44 UTC (permalink / raw)
  To: David S. Miller, Stephen Hemminger, Patrick McHardy
  Cc: xiaosuo, Eric Dumazet, Tom Herbert, netdev

ifb: add multi-queue support

Add multi-queue support; one kernel thread is created per queue.
It can be used to emulate a multi-queue NIC in software and distribute
work among CPUs.
gentux linux # modprobe ifb numtxqs=2
gentux linux # ifconfig ifb0 up
gentux linux # pgrep ifb0
18508
18509
gentux linux # taskset -p 1 18508
pid 18508's current affinity mask: 3
pid 18508's new affinity mask: 1
gentux linux # taskset -p 2 18509
pid 18509's current affinity mask: 3
pid 18509's new affinity mask: 2
gentux linux # tc qdisc add dev br0 ingress
gentux linux # tc filter add dev br0 parent ffff: protocol ip basic
action mirred egress redirect dev ifb0

This patch also introduces an ip link option "numtxqs" for specifying
the number of TX queues, so you can add a new ifb with the command:

ip link add numtxqs 4 type ifb

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
drivers/net/ifb.c | 506 +++++++++++++++++++++++++++++++++++-------------
include/linux/if_link.h | 1
net/core/rtnetlink.c | 3
3 files changed, 375 insertions(+), 135 deletions(-)

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 69c2566..7116953 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -33,161 +33,166 @@
 #include <linux/etherdevice.h>
 #include <linux/init.h>
 #include <linux/moduleparam.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
 #include <net/pkt_sched.h>
 #include <net/net_namespace.h>
 
-#define TX_TIMEOUT  (2*HZ)
-
 #define TX_Q_LIMIT    32
+
+struct ifb_private_q {
+	struct net_device	*dev;
+	struct sk_buff_head	rq;
+	wait_queue_head_t	wq;
+	struct task_struct	*task;
+	unsigned long		rx_packets;
+	unsigned long		rx_bytes;
+	unsigned long		rx_dropped;
+} ____cacheline_aligned_in_smp;
+
 struct ifb_private {
-	struct tasklet_struct   ifb_tasklet;
-	int     tasklet_pending;
-	/* mostly debug stats leave in for now */
-	unsigned long   st_task_enter; /* tasklet entered */
-	unsigned long   st_txq_refl_try; /* transmit queue refill attempt */
-	unsigned long   st_rxq_enter; /* receive queue entered */
-	unsigned long   st_rx2tx_tran; /* receive to trasmit transfers */
-	unsigned long   st_rxq_notenter; /*receiveQ not entered, resched */
-	unsigned long   st_rx_frm_egr; /* received from egress path */
-	unsigned long   st_rx_frm_ing; /* received from ingress path */
-	unsigned long   st_rxq_check;
-	unsigned long   st_rxq_rsch;
-	struct sk_buff_head     rq;
-	struct sk_buff_head     tq;
+	struct ifb_private_q	*pq;
 };
 
+/* Number of ifb devices to be set up by this module. */
 static int numifbs = 2;
+module_param(numifbs, int, 0444);
+MODULE_PARM_DESC(numifbs, "Number of ifb devices");
 
-static void ri_tasklet(unsigned long dev);
-static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
-static int ifb_open(struct net_device *dev);
-static int ifb_close(struct net_device *dev);
+/* Number of TX queues per ifb */
+static unsigned int numtxqs = 1;
+module_param(numtxqs, uint, 0444);
+MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
 
-static void ri_tasklet(unsigned long dev)
+static int ifb_thread(void *priv)
 {
-
-	struct net_device *_dev = (struct net_device *)dev;
-	struct ifb_private *dp = netdev_priv(_dev);
-	struct net_device_stats *stats = &_dev->stats;
-	struct netdev_queue *txq;
+	struct ifb_private_q *pq = priv;
+	struct net_device *dev = pq->dev, *_dev;
+	int num = pq - ((struct ifb_private *)netdev_priv(dev))->pq;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
 	struct sk_buff *skb;
-
-	txq = netdev_get_tx_queue(_dev, 0);
-	dp->st_task_enter++;
-	if ((skb = skb_peek(&dp->tq)) == NULL) {
-		dp->st_txq_refl_try++;
-		if (__netif_tx_trylock(txq)) {
-			dp->st_rxq_enter++;
-			while ((skb = skb_dequeue(&dp->rq)) != NULL) {
-				skb_queue_tail(&dp->tq, skb);
-				dp->st_rx2tx_tran++;
-			}
-			__netif_tx_unlock(txq);
-		} else {
-			/* reschedule */
-			dp->st_rxq_notenter++;
-			goto resched;
+	DEFINE_WAIT(wait);
+	struct sk_buff_head tq;
+
+	__skb_queue_head_init(&tq);
+	while (1) {
+		/* move skb from rq to tq */
+		while (1) {
+			prepare_to_wait(&pq->wq, &wait, TASK_UNINTERRUPTIBLE);
+			__netif_tx_lock_bh(txq);
+			skb_queue_splice_tail_init(&pq->rq, &tq);
+			if (netif_queue_stopped(dev))
+				netif_wake_queue(dev);
+			__netif_tx_unlock_bh(txq);
+			if (kthread_should_stop() || !skb_queue_empty(&tq))
+				break;
+			schedule();
 		}
-	}
-
-	while ((skb = skb_dequeue(&dp->tq)) != NULL) {
-		u32 from = G_TC_FROM(skb->tc_verd);
-
-		skb->tc_verd = 0;
-		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
-		stats->tx_packets++;
-		stats->tx_bytes +=skb->len;
-
-		rcu_read_lock();
-		skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
-		if (!skb->dev) {
-			rcu_read_unlock();
-			dev_kfree_skb(skb);
-			stats->tx_dropped++;
+		finish_wait(&pq->wq, &wait);
+		if (kthread_should_stop()) {
+			__skb_queue_purge(&tq);
 			break;
 		}
-		rcu_read_unlock();
-		skb->iif = _dev->ifindex;
-
-		if (from & AT_EGRESS) {
-			dp->st_rx_frm_egr++;
-			dev_queue_xmit(skb);
-		} else if (from & AT_INGRESS) {
-			dp->st_rx_frm_ing++;
-			skb_pull(skb, skb->dev->hard_header_len);
-			netif_rx(skb);
-		} else
-			BUG();
-	}
-
-	if (__netif_tx_trylock(txq)) {
-		dp->st_rxq_check++;
-		if ((skb = skb_peek(&dp->rq)) == NULL) {
-			dp->tasklet_pending = 0;
-			if (netif_queue_stopped(_dev))
-				netif_wake_queue(_dev);
-		} else {
-			dp->st_rxq_rsch++;
-			__netif_tx_unlock(txq);
-			goto resched;
+		if (need_resched())
+			schedule();
+
+		/* transfer packets */
+		while ((skb = __skb_dequeue(&tq)) != NULL) {
+			u32 from = G_TC_FROM(skb->tc_verd);
+	
+			skb->tc_verd = 0;
+			skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
+			txq->tx_packets++;
+			txq->tx_bytes +=skb->len;
+
+			rcu_read_lock();
+			_dev = dev_get_by_index_rcu(&init_net, skb->iif);
+			if (!_dev) {
+				rcu_read_unlock();
+				dev_kfree_skb(skb);
+				txq->tx_dropped++;
+				continue;
+			}
+			if (from & AT_INGRESS)
+				dev_hold(_dev);
+			rcu_read_unlock();
+			skb->dev = _dev;
+			skb->iif = dev->ifindex;
+	
+			if (from & AT_EGRESS) {
+				dev_queue_xmit(skb);
+			} else if (from & AT_INGRESS) {
+				skb_pull(skb, skb->dev->hard_header_len);
+				netif_rx_ni(skb);
+				dev_put(_dev);
+			} else
+				BUG();
 		}
-		__netif_tx_unlock(txq);
-	} else {
-resched:
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
 	}
 
+	return 0;
 }
 
-static const struct net_device_ops ifb_netdev_ops = {
-	.ndo_open	= ifb_open,
-	.ndo_stop	= ifb_close,
-	.ndo_start_xmit	= ifb_xmit,
-	.ndo_validate_addr = eth_validate_addr,
-};
-
-static void ifb_setup(struct net_device *dev)
+struct net_device_stats* ifb_get_stats(struct net_device *dev)
 {
-	/* Initialize the device structure. */
-	dev->destructor = free_netdev;
-	dev->netdev_ops = &ifb_netdev_ops;
+	struct net_device_stats *stats = &dev->stats;
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	struct netdev_queue *txq;
+	int i;
+	unsigned long rx_packets = 0, rx_bytes = 0, rx_dropped = 0;
+	unsigned long tx_packets = 0, tx_bytes = 0, tx_dropped = 0;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		rx_packets += pq[i].rx_packets;
+		rx_bytes += pq[i].rx_bytes;
+		rx_dropped += pq[i].rx_dropped;
+		txq = netdev_get_tx_queue(dev, i);
+		tx_packets += txq->tx_packets;
+		tx_bytes += txq->tx_bytes;
+		tx_dropped += txq->tx_dropped;
+	}
 
-	/* Fill in device structure with ethernet-generic values. */
-	ether_setup(dev);
-	dev->tx_queue_len = TX_Q_LIMIT;
+	stats->rx_packets = rx_packets;
+	stats->rx_bytes = rx_bytes;
+	stats->rx_dropped = rx_dropped;
+	stats->tx_packets = tx_packets;
+	stats->tx_bytes = tx_bytes;
+	stats->tx_dropped = tx_dropped;
 
-	dev->flags |= IFF_NOARP;
-	dev->flags &= ~IFF_MULTICAST;
-	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
-	random_ether_addr(dev->dev_addr);
+	return stats;
 }
 
 static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct ifb_private *dp = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
 	u32 from = G_TC_FROM(skb->tc_verd);
+	int num = skb_get_queue_mapping(skb);
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq + num;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
 
-	stats->rx_packets++;
-	stats->rx_bytes+=skb->len;
+	pq->rx_packets++;
+	pq->rx_bytes += skb->len;
 
 	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->iif) {
 		dev_kfree_skb(skb);
-		stats->rx_dropped++;
+		pq->rx_dropped++;
 		return NETDEV_TX_OK;
 	}
 
-	if (skb_queue_len(&dp->rq) >= dev->tx_queue_len) {
+	txq->trans_start = jiffies;
+	__skb_queue_tail(&pq->rq, skb);
+	if (skb_queue_len(&pq->rq) >= dev->tx_queue_len)
 		netif_stop_queue(dev);
-	}
-
-	dev->trans_start = jiffies;
-	skb_queue_tail(&dp->rq, skb);
-	if (!dp->tasklet_pending) {
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
-	}
+	if (skb_queue_len(&pq->rq) == 1)
+		wake_up(&pq->wq);
 
 	return NETDEV_TX_OK;
 }
@@ -195,26 +200,241 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 static int ifb_close(struct net_device *dev)
 {
 	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	int i;
 
-	tasklet_kill(&dp->ifb_tasklet);
 	netif_stop_queue(dev);
-	skb_queue_purge(&dp->rq);
-	skb_queue_purge(&dp->tq);
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		kthread_stop(pq[i].task);
+		__skb_queue_purge(&pq[i].rq);
+	}
+
 	return 0;
 }
 
 static int ifb_open(struct net_device *dev)
 {
 	struct ifb_private *dp = netdev_priv(dev);
-
-	tasklet_init(&dp->ifb_tasklet, ri_tasklet, (unsigned long)dev);
-	skb_queue_head_init(&dp->rq);
-	skb_queue_head_init(&dp->tq);
+	struct ifb_private_q *pq = dp->pq;
+	int i;
+	
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		pq[i].task = kthread_run(ifb_thread, &pq[i], "%s/%d", dev->name,
+					 i);
+		if (IS_ERR(pq[i].task)) {
+			int err = PTR_ERR(pq[i].task);
+			while (--i >= 0)
+				kthread_stop(pq[i].task);
+			return err;
+		}
+	}
 	netif_start_queue(dev);
 
 	return 0;
 }
 
+static u32 simple_tx_hashrnd;
+
+static inline __be16 pppoe_proto(const struct sk_buff *skb)
+{
+	return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
+			sizeof(struct pppoe_hdr)));
+}
+
+static u8 ifb_ipv6_ihl(struct sk_buff *skb, u32 *pihl)
+{
+	struct ipv6hdr *hdr = ipv6_hdr(skb);
+	u32 ihl = sizeof(*hdr);
+	u8 nexthdr = hdr->nexthdr;
+
+	while (1) {
+		switch (nexthdr) {
+		case NEXTHDR_HOP:
+		case NEXTHDR_ROUTING:
+		case NEXTHDR_DEST:
+			if (!pskb_may_pull(skb, ihl + 8) ||
+			    !pskb_may_pull(skb, ihl +
+			    	((skb_network_header(skb)[ihl + 1] + 1) << 3)))
+			return 0;
+			nexthdr = skb_network_header(skb)[ihl];
+			ihl += (skb_network_header(skb)[ihl + 1] + 1) << 3;
+			break;
+		case NEXTHDR_AUTH:
+		case NEXTHDR_ESP:
+		case NEXTHDR_NONE:
+		case NEXTHDR_FRAGMENT:
+			return 0;
+		default:
+			*pihl = ihl;
+			return nexthdr;
+		}
+	}
+}
+
+static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	u32 addr1, addr2;
+	u32 hash, ihl = 0;
+	union {
+		u16 in16[2];
+		u32 in32;
+	} ports;
+	u8 ip_proto;
+	unsigned int pull_len;
+
+	if ((hash = skb_rx_queue_recorded(skb))) {
+		while (hash >= dev->real_num_tx_queues)
+			hash -= dev->real_num_tx_queues;
+		return hash;
+	}
+
+	pull_len = 0;
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_8021Q):
+		if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL))
+			goto process_other;
+		pull_len = VLAN_HLEN;
+		skb->network_header += pull_len;
+		switch (vlan_eth_hdr(skb)->h_vlan_encapsulated_proto) {
+		case __constant_htons(ETH_P_IP):
+			goto process_ip;
+		case __constant_htons(ETH_P_IPV6):
+			goto process_ipv6;
+		default:
+			goto process_other;
+		}
+		break;
+	case __constant_htons(ETH_P_PPP_SES):
+		if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL))
+			goto process_other;
+		pull_len = PPPOE_SES_HLEN;
+		skb->network_header += pull_len;
+		switch (pppoe_proto(skb)) {
+		case __constant_htons(ETH_P_IP):
+			goto process_ip;
+		case __constant_htons(ETH_P_IPV6):
+			goto process_ipv6;
+		default:
+			goto process_other;
+		}
+		break;
+	case __constant_htons(ETH_P_IP):
+process_ip:
+		if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
+			goto process_other;
+		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
+			ip_proto = ip_hdr(skb)->protocol;
+		else
+			ip_proto = 0;
+		addr1 = ip_hdr(skb)->saddr;
+		addr2 = ip_hdr(skb)->daddr;
+		ihl = ip_hdrlen(skb);
+		break;
+	case __constant_htons(ETH_P_IPV6):
+process_ipv6:
+		if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr))))
+			goto process_other;
+		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
+		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
+		ip_proto = ifb_ipv6_ihl(skb, &ihl);
+		break;
+	default:
+process_other:
+		if (pull_len != 0) {
+			skb_push(skb, pull_len);
+			skb->network_header -= pull_len;
+		}
+		return skb->protocol % dev->real_num_tx_queues;
+	}
+	if (addr1 > addr2)
+		swap(addr1, addr2);
+
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE:
+		if (unlikely(!pskb_may_pull(skb, ihl + 4)))
+			goto process_other_trans;
+		ports.in32 = *((u32 *) (skb_network_header(skb) + ihl));
+		if (ports.in16[0] > ports.in16[1])
+			swap(ports.in16[0], ports.in16[1]);
+		break;
+
+	default:
+process_other_trans:
+		ports.in32 = 0;
+		break;
+	}
+
+	if (pull_len != 0) {
+		skb_push(skb, pull_len);
+		skb->network_header -= pull_len;
+	}
+
+	hash = jhash_3words(addr1, addr2, ports.in32,
+			    simple_tx_hashrnd ^ ip_proto);
+
+	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
+}
+
+static int ifb_init(struct net_device *dev)
+{
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	int i;
+
+	pq = kcalloc(dev->real_num_tx_queues, sizeof(*pq), GFP_KERNEL);
+	if (pq == NULL)
+		return -ENOMEM;
+	dp->pq = pq;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		pq[i].dev = dev;
+		__skb_queue_head_init(&pq[i].rq);
+		init_waitqueue_head(&pq[i].wq);
+	}
+
+	return 0;
+}
+
+static void ifb_uninit(struct net_device *dev)
+{
+	struct ifb_private *dp = netdev_priv(dev);
+
+	kfree(dp->pq);
+}
+
+static const struct net_device_ops ifb_netdev_ops = {
+	.ndo_init		= ifb_init,
+	.ndo_uninit		= ifb_uninit,
+	.ndo_open		= ifb_open,
+	.ndo_stop		= ifb_close,
+	.ndo_start_xmit		= ifb_xmit,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_select_queue	= ifb_select_queue,
+	.ndo_get_stats		= ifb_get_stats,
+};
+
+static void ifb_setup(struct net_device *dev)
+{
+	/* Initialize the device structure. */
+	dev->destructor = free_netdev;
+	dev->netdev_ops = &ifb_netdev_ops;
+
+	/* Fill in device structure with ethernet-generic values. */
+	ether_setup(dev);
+	dev->tx_queue_len = TX_Q_LIMIT;
+
+	dev->flags |= IFF_NOARP;
+	dev->flags &= ~IFF_MULTICAST;
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	random_ether_addr(dev->dev_addr);
+}
+
 static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 {
 	if (tb[IFLA_ADDRESS]) {
@@ -226,25 +446,38 @@ static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
+static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
+			     unsigned int *num_tx_queues,
+			     unsigned int *real_num_tx_queues)
+{
+	if (tb[IFLA_NTXQ]) {
+		*num_tx_queues = nla_get_u32(tb[IFLA_NTXQ]);
+		if (*num_tx_queues < 1)
+			return -EINVAL;
+		*real_num_tx_queues = *num_tx_queues;
+	} else {
+		*num_tx_queues = numtxqs;
+		*real_num_tx_queues = numtxqs;
+	}
+
+	return 0;
+}
+
 static struct rtnl_link_ops ifb_link_ops __read_mostly = {
 	.kind		= "ifb",
-	.priv_size	= sizeof(struct ifb_private),
 	.setup		= ifb_setup,
 	.validate	= ifb_validate,
+	.get_tx_queues	= ifb_get_tx_queues,
+	.priv_size	= sizeof(struct ifb_private),
 };
 
-/* Number of ifb devices to be set up by this module. */
-module_param(numifbs, int, 0);
-MODULE_PARM_DESC(numifbs, "Number of ifb devices");
-
 static int __init ifb_init_one(int index)
 {
 	struct net_device *dev_ifb;
 	int err;
 
-	dev_ifb = alloc_netdev(sizeof(struct ifb_private),
-				 "ifb%d", ifb_setup);
-
+	dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private), "ifb%d",
+				  ifb_setup, numtxqs);
 	if (!dev_ifb)
 		return -ENOMEM;
 
@@ -268,9 +501,12 @@ static int __init ifb_init_module(void)
 {
 	int i, err;
 
+	if (numtxqs < 1)
+		return -EINVAL;
+
+	get_random_bytes(&simple_tx_hashrnd, 4);
 	rtnl_lock();
 	err = __rtnl_link_register(&ifb_link_ops);
-
 	for (i = 0; i < numifbs && !err; i++)
 		err = ifb_init_one(i);
 	if (err)
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 1d3b242..99da411 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -78,6 +78,7 @@ enum {
 #define IFLA_LINKINFO IFLA_LINKINFO
 	IFLA_NET_NS_PID,
 	IFLA_IFALIAS,
+	IFLA_NTXQ,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 33148a5..1cbe555 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -597,6 +597,7 @@ static inline size_t if_nlmsg_size(const struct net_device *dev)
 	       + nla_total_size(4) /* IFLA_MASTER */
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
 	       + nla_total_size(1) /* IFLA_LINKMODE */
+	       + nla_total_size(4) /* IFLA_NTXQ */
 	       + rtnl_link_get_size(dev); /* IFLA_LINKINFO */
 }
 
@@ -627,6 +628,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 		   netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
 	NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
 	NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+	NLA_PUT_U32(skb, IFLA_NTXQ, dev->real_num_tx_queues);
 
 	if (dev->ifindex != dev->iflink)
 		NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
@@ -725,6 +727,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
 	[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },
+	[IFLA_NTXQ]		= { .type = NLA_U32 },
 };
 EXPORT_SYMBOL(ifla_policy);
 



^ permalink raw reply related	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-12  8:52     ` Jarek Poplawski
@ 2009-11-12  9:32       ` Changli Gao
  0 siblings, 0 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-12  9:32 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Patrick McHardy, David S. Miller, Stephen Hemminger,
	Eric Dumazet, Tom Herbert, netdev

On Thu, Nov 12, 2009 at 4:52 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> On 12-11-2009 04:12, Changli Gao wrote:
>> On Wed, Nov 11, 2009 at 11:59 PM, Patrick McHardy <kaber@trash.net> wrote:
> ...
>>> What protects the device from disappearing here and below during
>>> dev_queue_xmit() and netif_rx_ni()?
>>
>> For dev_queue_xmit(), dev is held by skb->_dst, so there is no
>> problem.

Oh, I'm wrong: not all the skbs going through dev_queue_xmit() have a
valid dst, for example those from AF_PACKET. Since skb->dev doesn't hold
a reference on the device, I think it is AF_PACKET's fault. Too bad!

>> But for netif_rx_ni(), I don't know how to prevent the device from
>> disappearing, and it seems that all the NIC drivers have this problem.
>> Maybe there was an assumption about the execution context of
>> netif_rx() before. Now the softirq can't be executed by softirqd, so
>> the packet receiving path may be interleaved. I don't know how to
>> prevent that from happening.
>
> Btw, maybe I'm missing something, but ifb with act_mirred is on the
> fast path for many users. I'm not sure you proved enough that moving it
> to process context, plus the additional multiqueue functionality (which
> seems "pretty cool", but probably mostly for testing purposes), doesn't
> harm its main use?

Because a process is more flexible than other deferral mechanisms: you
can specify its priority, bind it to a particular CPU, and so on.
Suppose you have an SMP system and a NIC which doesn't support MQ. When
you use that box as a firewall, you'll find that nearly all the work
lands on one CPU. As throughput increases, that CPU saturates and no
more throughput can be gained, even though the other CPUs are still
idle. With IFB-MQ, you can distribute the packets among all the CPUs and
achieve good throughput.

I know you worry about the latency. From my experiments, it is OK: not
much extra latency is added.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-12  3:12   ` Changli Gao
@ 2009-11-12  8:52     ` Jarek Poplawski
  2009-11-12  9:32       ` Changli Gao
  2009-11-12 15:10     ` Patrick McHardy
  1 sibling, 1 reply; 62+ messages in thread
From: Jarek Poplawski @ 2009-11-12  8:52 UTC (permalink / raw)
  To: Changli Gao
  Cc: Patrick McHardy, David S. Miller, Stephen Hemminger,
	Eric Dumazet, Tom Herbert, netdev

On 12-11-2009 04:12, Changli Gao wrote:
> On Wed, Nov 11, 2009 at 11:59 PM, Patrick McHardy <kaber@trash.net> wrote:
...
>> What protects the device from disappearing here and below during
>> dev_queue_xmit() and netif_rx_ni()?
> 
> For dev_queue_xmit(), dev is holded by skb->_dst, so there is no
> problem. But for netif_rx_ni(), I don't know how to prevent the device
> disappearing, and it seems that all the NIC drivers have this problem.
> Maybe there was the assumption about the execution context of
> netif_rx() before. Now softirq can't be executed by softirqd, so the
> packet receiving path maybe interleaved. I don't know how to prevent
> it happening.

Btw, maybe I'm missing something, but ifb with act_mirred is on the
fast path for many users. I'm not sure you proved enough that moving it
to process context, plus the additional multiqueue functionality (which
seems "pretty cool", but probably mostly for testing purposes), doesn't
harm its main use?

Regards,
Jarek P.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-11 15:59 ` Patrick McHardy
@ 2009-11-12  3:12   ` Changli Gao
  2009-11-12  8:52     ` Jarek Poplawski
  2009-11-12 15:10     ` Patrick McHardy
  0 siblings, 2 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-12  3:12 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert, netdev

On Wed, Nov 11, 2009 at 11:59 PM, Patrick McHardy <kaber@trash.net> wrote:
> Changli Gao wrote:
>> diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
>> index 69c2566..ac04e85 100644
>> --- a/drivers/net/ifb.c
>> +++ b/drivers/net/ifb.c
>> ...
>> +/* Number of ifb devices to be set up by this module. */
>>  static int numifbs = 2;
>> +module_param(numifbs, int, 0444);
>> +MODULE_PARM_DESC(numifbs, "Number of ifb devices");
>>
>> -static void ri_tasklet(unsigned long dev);
>> -static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
>> -static int ifb_open(struct net_device *dev);
>> -static int ifb_close(struct net_device *dev);
>> +/* Number of TX queues per ifb */
>> +static int numtxqs = 1;
>> +module_param(numtxqs, int, 0444);
>> +MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
>
> unsigned?

Yea, unsigned is better, and I need to check somewhere that its value
isn't smaller than 1. The same will be done for IFLA_NTXQ.
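
A minimal sketch of such checks (the reposted patch in this thread adds
essentially this in ifb_init_module() and ifb_get_tx_queues()):

	/* at module load time */
	if (numtxqs < 1)
		return -EINVAL;

	/* and on the netlink path */
	if (tb[IFLA_NTXQ]) {
		*num_tx_queues = nla_get_u32(tb[IFLA_NTXQ]);
		if (*num_tx_queues < 1)
			return -EINVAL;
		*real_num_tx_queues = *num_tx_queues;
	}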

>
>> +             while ((skb = skb_dequeue(&pq->tq)) != NULL) {
>> +                     u32 from = G_TC_FROM(skb->tc_verd);
>> +
>> +                     skb->tc_verd = 0;
>> +                     skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
>> +                     txq->tx_packets++;
>> +                     txq->tx_bytes +=skb->len;
>> +
>> +                     rcu_read_lock();
>> +                     skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
>> +                     if (!skb->dev) {
>> +                             rcu_read_unlock();
>> +                             dev_kfree_skb(skb);
>> +                             txq->tx_dropped++;
>> +                             break;
>> +                     }
>> +                     rcu_read_unlock();
>> +                     skb->iif = dev->ifindex;
>
> What protects the device from disappearing here and below during
> dev_queue_xmit() and netif_rx_ni()?

For dev_queue_xmit(), dev is held by skb->_dst, so there is no
problem. But for netif_rx_ni(), I don't know how to prevent the device
from disappearing, and it seems that all the NIC drivers have this
problem. Maybe there was an assumption about the execution context of
netif_rx() before. Now the softirq can't be executed by softirqd, so the
packet receiving path may be interleaved. I don't know how to prevent
that from happening.
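
For reference, the reposted patch in this thread handles the ingress
case by pinning the device with dev_hold()/dev_put(); in outline (this
is the body of the per-queue thread loop):

	struct net_device *_dev;

	rcu_read_lock();
	_dev = dev_get_by_index_rcu(&init_net, skb->iif);
	if (!_dev) {
		rcu_read_unlock();
		dev_kfree_skb(skb);
		continue;	/* the original device is gone; drop */
	}
	if (from & AT_INGRESS)
		dev_hold(_dev);	/* pin the device across netif_rx_ni() */
	rcu_read_unlock();
	skb->dev = _dev;
	skb->iif = dev->ifindex;

	if (from & AT_EGRESS) {
		dev_queue_xmit(skb);
	} else if (from & AT_INGRESS) {
		skb_pull(skb, skb->dev->hard_header_len);
		netif_rx_ni(skb);
		dev_put(_dev);	/* release the reference */
	}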


>> +
>> +static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
>> +{
>> +     u32 addr1, addr2;
>> +     u32 hash, ihl;
>> +     union {
>> +             u16 in16[2];
>> +             u32 in32;
>> +     } ports;
>> +     u8 ip_proto;
>> +
>> +     if ((hash = skb_rx_queue_recorded(skb))) {
>> +             while (hash >= dev->real_num_tx_queues)
>> +                     hash -= dev->real_num_tx_queues;
>> +             return hash;
>> +     }
>> +
>> +     switch (skb->protocol) {
>> +     case __constant_htons(ETH_P_IP):
>> +             if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
>> +                     ip_proto = ip_hdr(skb)->protocol;
>> +             else
>> +                     ip_proto = 0;
>
> So fragments will get reordered?

Yea, it is OK, as the number of fragments isn't large. If we decide to
avoid this, we have to reassemble them as netfilter does, and that isn't
efficient.

>
>> +             addr1 = ip_hdr(skb)->saddr;
>> +             addr2 = ip_hdr(skb)->daddr;
>> +             ihl = ip_hdr(skb)->ihl << 2;
>
> ip_hdrlen()?

Yea, I'll use ip_hdrlen() instead.

>
>> +             break;
>> +     case __constant_htons(ETH_P_IPV6):
>> +             ip_proto = ipv6_hdr(skb)->nexthdr;
>> +             addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
>> +             addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
>> +             ihl = 10;
>
> Where does 10 come from?

It should be 40. After reviewing IPv6, I found that I need to loop
through the extension headers until I find the right protocol value.
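
In outline, such a walk over the IPv6 extension headers could look like
this (a sketch only; the pskb_may_pull() validation the real code needs
is omitted):

	/* Skip HOP/ROUTING/DEST options until a transport protocol is
	 * found; extension header lengths are in 8-byte units. */
	u32 ihl = sizeof(struct ipv6hdr);
	u8 nexthdr = ipv6_hdr(skb)->nexthdr;

	while (nexthdr == NEXTHDR_HOP || nexthdr == NEXTHDR_ROUTING ||
	       nexthdr == NEXTHDR_DEST) {
		const u8 *nh = skb_network_header(skb);

		nexthdr = nh[ihl];
		ihl += (nh[ihl + 1] + 1) << 3;
	}
	/* nexthdr now holds the transport protocol, at offset ihl */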

>
>> +             break;
>> +     default:
>> +             return 0;
>
> Perhaps hash on skb->protocol here.

I'll use return skb->protocol % dev->real_num_tx_queues; instead.

>
>> +     }
>> +     if (addr1 > addr2)
>> +             swap(addr1, addr2);
>> +
>> +     switch (ip_proto) {
>> +     case IPPROTO_TCP:
>> +     case IPPROTO_UDP:
>> +     case IPPROTO_DCCP:
>> +     case IPPROTO_ESP:
>> +     case IPPROTO_AH:
>> +     case IPPROTO_SCTP:
>> +     case IPPROTO_UDPLITE:
>> +             ports.in32 = *((u32 *) (skb_network_header(skb) + ihl));
>> +             if (ports.in16[0] > ports.in16[1])
>> +                     swap(ports.in16[0], ports.in16[1]);
>> +             break;
>> +
>> +     default:
>> +             ports.in32 = 0;
>> +             break;
>> +     }
>> +
>> +     hash = jhash_3words(addr1, addr2, ports.in32,
>> +                         simple_tx_hashrnd ^ ip_proto);
>> +
>> +     return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
>> +}
>> +
>> +static int ifb_init(struct net_device *dev)
>> +{
>> +     struct ifb_private *dp = netdev_priv(dev);
>> +     struct ifb_private_q *pq = dp->pq;
>> +     int i;
>> +
>> +     pq = kmalloc(sizeof(*pq) * dev->real_num_tx_queues, GFP_KERNEL);
>
> kcalloc()?

OK.

>> +     if (pq == NULL)
>> +             return -ENOMEM;
>> +     dp->pq = pq;
>> +
>> +     for (i = 0; i < dev->real_num_tx_queues; i++) {
>> +             pq[i].dev = dev;
>> +             skb_queue_head_init(&pq[i].rq);
>> +             skb_queue_head_init(&pq[i].tq);
>> +             init_waitqueue_head(&pq[i].wq);
>> +             pq[i].rx_packets = 0;
>> +             pq[i].rx_bytes = 0;
>> +             pq[i].rx_dropped = 0;
>> +     }
>> +
>> +     return 0;
>> +}
>
>> +static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
>> +                          unsigned int *num_tx_queues,
>> +                          unsigned int *real_num_tx_queues)
>> +{
>> +     if (tb[IFLA_NTXQ]) {
>> +             *num_tx_queues = nla_get_u16(tb[IFLA_NTXQ]);
>
> We currently use unsigned ints for the queue number, so please
> use a u32 for the attribute as well.
>
>> +             *real_num_tx_queues = *num_tx_queues;
>> +     } else {
>> +             *num_tx_queues = numtxqs;
>> +             *real_num_tx_queues = numtxqs;
>> +     }
>> +
>> +     return 0;
>> +}
>> +
>
        u16                     (*ndo_select_queue)(struct net_device *dev,
                                                    struct sk_buff *skb);
ndo_select_queue() uses u16 as its return value, so I used u16 for the
attribute, and I think u16 is big enough. If you insist on this, I'll
use u32 instead.


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-11  9:51 Changli Gao
  2009-11-11  9:56 ` Changli Gao
  2009-11-11 10:30 ` Eric Dumazet
@ 2009-11-11 15:59 ` Patrick McHardy
  2009-11-12  3:12   ` Changli Gao
  2009-11-12  9:44 ` Changli Gao
  3 siblings, 1 reply; 62+ messages in thread
From: Patrick McHardy @ 2009-11-11 15:59 UTC (permalink / raw)
  To: xiaosuo
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert, netdev

Changli Gao wrote:
> diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
> index 69c2566..ac04e85 100644
> --- a/drivers/net/ifb.c
> +++ b/drivers/net/ifb.c
> ...
> +/* Number of ifb devices to be set up by this module. */
>  static int numifbs = 2;
> +module_param(numifbs, int, 0444);
> +MODULE_PARM_DESC(numifbs, "Number of ifb devices");
>  
> -static void ri_tasklet(unsigned long dev);
> -static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
> -static int ifb_open(struct net_device *dev);
> -static int ifb_close(struct net_device *dev);
> +/* Number of TX queues per ifb */
> +static int numtxqs = 1;
> +module_param(numtxqs, int, 0444);
> +MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");

unsigned?

> +		while ((skb = skb_dequeue(&pq->tq)) != NULL) {
> +			u32 from = G_TC_FROM(skb->tc_verd);
> +	
> +			skb->tc_verd = 0;
> +			skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
> +			txq->tx_packets++;
> +			txq->tx_bytes +=skb->len;
> +
> +			rcu_read_lock();
> +			skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
> +			if (!skb->dev) {
> +				rcu_read_unlock();
> +				dev_kfree_skb(skb);
> +				txq->tx_dropped++;
> +				break;
> +			}
> +			rcu_read_unlock();
> +			skb->iif = dev->ifindex;

What protects the device from disappearing here and below during
dev_queue_xmit() and netif_rx_ni()?

> +	
> +			if (from & AT_EGRESS) {
> +				dev_queue_xmit(skb);
> +			} else if (from & AT_INGRESS) {
> +				skb_pull(skb, skb->dev->hard_header_len);
> +				netif_rx_ni(skb);
> +			} else
> +				BUG();
>  		}

> +static u32 simple_tx_hashrnd;
> +
> +static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
> +{
> +	u32 addr1, addr2;
> +	u32 hash, ihl;
> +	union {
> +		u16 in16[2];
> +		u32 in32;
> +	} ports;
> +	u8 ip_proto;
> +
> +	if ((hash = skb_rx_queue_recorded(skb))) {
> +		while (hash >= dev->real_num_tx_queues)
> +			hash -= dev->real_num_tx_queues;
> +		return hash;
> +	}
> +
> +	switch (skb->protocol) {
> +	case __constant_htons(ETH_P_IP):
> +		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
> +			ip_proto = ip_hdr(skb)->protocol;
> +		else
> +			ip_proto = 0;

So fragments will get reordered?

> +		addr1 = ip_hdr(skb)->saddr;
> +		addr2 = ip_hdr(skb)->daddr;
> +		ihl = ip_hdr(skb)->ihl << 2;

ip_hdrlen()?

> +		break;
> +	case __constant_htons(ETH_P_IPV6):
> +		ip_proto = ipv6_hdr(skb)->nexthdr;
> +		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
> +		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
> +		ihl = 10;

Where does 10 come from?

> +		break;
> +	default:
> +		return 0;

Perhaps hash on skb->protocol here.

> +	}
> +	if (addr1 > addr2)
> +		swap(addr1, addr2);
> +
> +	switch (ip_proto) {
> +	case IPPROTO_TCP:
> +	case IPPROTO_UDP:
> +	case IPPROTO_DCCP:
> +	case IPPROTO_ESP:
> +	case IPPROTO_AH:
> +	case IPPROTO_SCTP:
> +	case IPPROTO_UDPLITE:
> +		ports.in32 = *((u32 *) (skb_network_header(skb) + ihl));
> +		if (ports.in16[0] > ports.in16[1])
> +			swap(ports.in16[0], ports.in16[1]);
> +		break;
> +
> +	default:
> +		ports.in32 = 0;
> +		break;
> +	}
> +
> +	hash = jhash_3words(addr1, addr2, ports.in32,
> +			    simple_tx_hashrnd ^ ip_proto);
> +
> +	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
> +}
> +
> +static int ifb_init(struct net_device *dev)
> +{
> +	struct ifb_private *dp = netdev_priv(dev);
> +	struct ifb_private_q *pq = dp->pq;
> +	int i;
> +
> +	pq = kmalloc(sizeof(*pq) * dev->real_num_tx_queues, GFP_KERNEL);

kcalloc()?

> +	if (pq == NULL)
> +		return -ENOMEM;
> +	dp->pq = pq;
> +
> +	for (i = 0; i < dev->real_num_tx_queues; i++) {
> +		pq[i].dev = dev;
> +		skb_queue_head_init(&pq[i].rq);
> +		skb_queue_head_init(&pq[i].tq);
> +		init_waitqueue_head(&pq[i].wq);
> +		pq[i].rx_packets = 0;
> +		pq[i].rx_bytes = 0;
> +		pq[i].rx_dropped = 0;
> +	}
> +
> +	return 0;
> +}

> +static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
> +			     unsigned int *num_tx_queues,
> +			     unsigned int *real_num_tx_queues)
> +{
> +	if (tb[IFLA_NTXQ]) {
> +		*num_tx_queues = nla_get_u16(tb[IFLA_NTXQ]);

We currently use unsigned ints for the queue number, so please
use a u32 for the attribute as well.

> +		*real_num_tx_queues = *num_tx_queues;
> +	} else {
> +		*num_tx_queues = numtxqs;
> +		*real_num_tx_queues = numtxqs;
> +	}
> +
> +	return 0;
> +}
> +

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-11 10:30 ` Eric Dumazet
@ 2009-11-11 10:57   ` Changli Gao
  0 siblings, 0 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-11 10:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Stephen Hemminger, Patrick McHardy, Tom Herbert, netdev

2009/11/11 Eric Dumazet <eric.dumazet@gmail.com>:
>
> Or even better, tq could be local to ifb_thread().
>
> ifb_thread() would purge it itself, and could use __skb_dequeue() and
> other lockless primitives.
>
> The whole:
>
> while ((skb = skb_dequeue(&pq->rq)) != NULL)
>        skb_queue_tail(&pq->tq, skb);
>
> could be optimized a lot IMHO; it's almost a skb_queue_splice() thing...
>

Thanks, it is a good idea. Any others?


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-11  9:51 Changli Gao
  2009-11-11  9:56 ` Changli Gao
@ 2009-11-11 10:30 ` Eric Dumazet
  2009-11-11 10:57   ` Changli Gao
  2009-11-11 15:59 ` Patrick McHardy
  2009-11-12  9:44 ` Changli Gao
  3 siblings, 1 reply; 62+ messages in thread
From: Eric Dumazet @ 2009-11-11 10:30 UTC (permalink / raw)
  To: xiaosuo
  Cc: David S. Miller, Stephen Hemminger, Patrick McHardy, Tom Herbert, netdev

Changli Gao wrote:
> ifb: add multi-queue support
> 
> Add multi-queue support; one kernel thread is created per queue. It
> can be used to emulate a multi-queue NIC in software, and to
> distribute work among CPUs.
> gentux linux # modprobe ifb numtxqs=2
> gentux linux # ifconfig ifb0 up
> gentux linux # pgrep ifb0
> 18508
> 18509
> gentux linux # taskset -p 1 18508
> pid 18508's current affinity mask: 3
> pid 18508's new affinity mask: 1
> gentux linux # taskset -p 2 18509
> pid 18509's current affinity mask: 3
> pid 18509's new affinity mask: 2
> gentux linux # tc qdisc add dev br0 ingress
> gentux linux # tc filter add dev br0 parent ffff: protocol ip basic
> action mirred egress redirect dev ifb0
> 
> This patch also introduces an ip link option "numtxqs" for specifying
> the number of TX queues, so you can add a new ifb with the command:
> 
> ip link add numtxqs 4 type ifb
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----
> drivers/net/ifb.c | 414 ++++++++++++++++++++++++++++++++----------------
> include/linux/if_link.h | 1
> net/core/rtnetlink.c | 3
> 3 files changed, 283 insertions(+), 135 deletions(-)
> 
> diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
> index 69c2566..ac04e85 100644
> --- a/drivers/net/ifb.c
> +++ b/drivers/net/ifb.c
> @@ -33,161 +33,157 @@
>  #include <linux/etherdevice.h>
>  #include <linux/init.h>
>  #include <linux/moduleparam.h>
> +#include <linux/wait.h>
> +#include <linux/sched.h>
> +#include <linux/kthread.h>
> +#include <linux/ip.h>
> +#include <linux/ipv6.h>
> +#include <net/ip.h>
>  #include <net/pkt_sched.h>
>  #include <net/net_namespace.h>
>  
> -#define TX_TIMEOUT  (2*HZ)
> -
>  #define TX_Q_LIMIT    32
> +
> +struct ifb_private_q {
> +	struct net_device	*dev;
> +	struct sk_buff_head	rq;
> +	struct sk_buff_head	tq;
> +	wait_queue_head_t	wq;
> +	struct task_struct	*task;
> +	unsigned long		rx_packets;
> +	unsigned long		rx_bytes;
> +	unsigned long		rx_dropped;
> +} ____cacheline_aligned_in_smp;
> +

Could you split this struct in two parts, one used by ifb_xmit() and
one used by ifb_thread()?
This would reduce the number of cache line ping-pongs...


struct ifb_private_q {

	struct net_device	*dev;
	struct task_struct	*task;

/* used by ifb_xmit() */

	struct sk_buff_head	rq;
	unsigned long		rx_packets;
	unsigned long		rx_bytes;
	unsigned long		rx_dropped;
	wait_queue_head_t	wq;

/* used by ifb_thread() */

	struct sk_buff_head	tq ____cacheline_aligned_in_smp;
} ____cacheline_aligned_in_smp;


Or even better, tq could be local to ifb_thread().

ifb_thread() would purge it itself, and could use __skb_dequeue() and
other lockless primitives.

The whole:

while ((skb = skb_dequeue(&pq->rq)) != NULL)
	skb_queue_tail(&pq->tq, skb);

could be optimized a lot IMHO; it's almost a skb_queue_splice() thing...
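
A minimal sketch of that, with tq private to the consumer thread
(assuming the surrounding ifb_thread() context, with pq and txq as in
the patch; this is roughly what the follow-up version adopts):

	struct sk_buff_head tq;		/* thread-local, no lock needed */
	struct sk_buff *skb;

	__skb_queue_head_init(&tq);

	__netif_tx_lock_bh(txq);
	/* one O(1) splice instead of the dequeue/enqueue loop */
	skb_queue_splice_tail_init(&pq->rq, &tq);
	__netif_tx_unlock_bh(txq);

	/* tq is private now, so lockless primitives are safe */
	while ((skb = __skb_dequeue(&tq)) != NULL) {
		/* ... process skb ... */
	}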


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH] ifb: add multi-queue support
  2009-11-11  9:51 Changli Gao
@ 2009-11-11  9:56 ` Changli Gao
  2009-11-11 10:30 ` Eric Dumazet
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-11  9:56 UTC (permalink / raw)
  To: David S. Miller, Stephen Hemminger
  Cc: Patrick McHardy, Eric Dumazet, Tom Herbert, netdev

[-- Attachment #1: Type: text/plain, Size: 1106 bytes --]

2009/11/11 Changli Gao <xiaosuo@gmail.com>:
> ifb: add multi-queue support
>
> Add multi-queue support; one kernel thread is created per queue. It
> can be used to emulate a multi-queue NIC in software, and to
> distribute work among CPUs.
> gentux linux # modprobe ifb numtxqs=2
> gentux linux # ifconfig ifb0 up
> gentux linux # pgrep ifb0
> 18508
> 18509
> gentux linux # taskset -p 1 18508
> pid 18508's current affinity mask: 3
> pid 18508's new affinity mask: 1
> gentux linux # taskset -p 2 18509
> pid 18509's current affinity mask: 3
> pid 18509's new affinity mask: 2
> gentux linux # tc qdisc add dev br0 ingress
> gentux linux # tc filter add dev br0 parent ffff: protocol ip basic
> action mirred egress redirect dev ifb0
>
> This patch also introduces an ip link option "numtxqs" for specifying
> the number of TX queues, so you can add a new ifb with the command:
>
> ip link add numtxqs 4 type ifb
>

The corresponding patch for iproute2 is attached too. You also need to
update the Linux header files.
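
That is, iproute2's copy of include/linux/if_link.h needs the same
IFLA_NTXQ value that the kernel patch adds:

	enum {
		/* ... existing IFLA_* values ... */
		IFLA_NET_NS_PID,
		IFLA_IFALIAS,
		IFLA_NTXQ,	/* new: number of TX queues */
		__IFLA_MAX
	};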


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

[-- Attachment #2: iproute2-support-numtxqs.diff --]
[-- Type: application/octet-stream, Size: 1214 bytes --]

--- ip/iplink.c.orig	2009-11-11 16:56:41.000000000 +0800
+++ ip/iplink.c	2009-11-11 17:32:52.000000000 +0800
@@ -48,6 +48,7 @@
 		fprintf(stderr, "                   [ address LLADDR ]\n");
 		fprintf(stderr, "                   [ broadcast LLADDR ]\n");
 		fprintf(stderr, "                   [ mtu MTU ]\n");
+		fprintf(stderr, "                   [ numtxqs NUMTXQS ]\n");
 		fprintf(stderr, "                   type TYPE [ ARGS ]\n");
 		fprintf(stderr, "       ip link delete DEV type TYPE [ ARGS ]\n");
 		fprintf(stderr, "\n");
@@ -180,6 +181,7 @@
 	int qlen = -1;
 	int mtu = -1;
 	int netns = -1;
+	int numtxqs = -1;
 
 	ret = argc;
 
@@ -221,6 +223,13 @@
 			if (get_integer(&mtu, *argv, 0))
 				invarg("Invalid \"mtu\" value\n", *argv);
 			addattr_l(&req->n, sizeof(*req), IFLA_MTU, &mtu, 4);
+		} else if (strcmp(*argv, "numtxqs") == 0) {
+			NEXT_ARG();
+			if (numtxqs != -1)
+				duparg("numtxqs", *argv);
+			if (get_integer(&numtxqs, *argv, 0))
+				invarg("Invalid \"numtxqs\" value\n", *argv);
+			addattr_l(&req->n, sizeof(*req), IFLA_NTXQ, &numtxqs, 2);
                 } else if (strcmp(*argv, "netns") == 0) {
                         NEXT_ARG();
                         if (netns != -1)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [PATCH] ifb: add multi-queue support
@ 2009-11-11  9:51 Changli Gao
  2009-11-11  9:56 ` Changli Gao
                   ` (3 more replies)
  0 siblings, 4 replies; 62+ messages in thread
From: Changli Gao @ 2009-11-11  9:51 UTC (permalink / raw)
  To: David S. Miller
  Cc: Stephen Hemminger, Patrick McHardy, Eric Dumazet, Tom Herbert, netdev

ifb: add multi-queue support

Add multi-queue support; one kernel thread is created per queue. It can
be used to emulate a multi-queue NIC in software, and to distribute work
among CPUs.
gentux linux # modprobe ifb numtxqs=2
gentux linux # ifconfig ifb0 up
gentux linux # pgrep ifb0
18508
18509
gentux linux # taskset -p 1 18508
pid 18508's current affinity mask: 3
pid 18508's new affinity mask: 1
gentux linux # taskset -p 2 18509
pid 18509's current affinity mask: 3
pid 18509's new affinity mask: 2
gentux linux # tc qdisc add dev br0 ingress
gentux linux # tc filter add dev br0 parent ffff: protocol ip basic
action mirred egress redirect dev ifb0

This patch also introduces an ip link option "numtxqs" for specifying
the number of TX queues, so you can add a new ifb with the command:

ip link add numtxqs 4 type ifb

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
drivers/net/ifb.c | 414 ++++++++++++++++++++++++++++++++----------------
include/linux/if_link.h | 1
net/core/rtnetlink.c | 3
3 files changed, 283 insertions(+), 135 deletions(-)

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 69c2566..ac04e85 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -33,161 +33,157 @@
 #include <linux/etherdevice.h>
 #include <linux/init.h>
 #include <linux/moduleparam.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
 #include <net/pkt_sched.h>
 #include <net/net_namespace.h>
 
-#define TX_TIMEOUT  (2*HZ)
-
 #define TX_Q_LIMIT    32
+
+struct ifb_private_q {
+	struct net_device	*dev;
+	struct sk_buff_head	rq;
+	struct sk_buff_head	tq;
+	wait_queue_head_t	wq;
+	struct task_struct	*task;
+	unsigned long		rx_packets;
+	unsigned long		rx_bytes;
+	unsigned long		rx_dropped;
+} ____cacheline_aligned_in_smp;
+
 struct ifb_private {
-	struct tasklet_struct   ifb_tasklet;
-	int     tasklet_pending;
-	/* mostly debug stats leave in for now */
-	unsigned long   st_task_enter; /* tasklet entered */
-	unsigned long   st_txq_refl_try; /* transmit queue refill attempt */
-	unsigned long   st_rxq_enter; /* receive queue entered */
-	unsigned long   st_rx2tx_tran; /* receive to trasmit transfers */
-	unsigned long   st_rxq_notenter; /*receiveQ not entered, resched */
-	unsigned long   st_rx_frm_egr; /* received from egress path */
-	unsigned long   st_rx_frm_ing; /* received from ingress path */
-	unsigned long   st_rxq_check;
-	unsigned long   st_rxq_rsch;
-	struct sk_buff_head     rq;
-	struct sk_buff_head     tq;
+	struct ifb_private_q	*pq;
 };
 
+/* Number of ifb devices to be set up by this module. */
 static int numifbs = 2;
+module_param(numifbs, int, 0444);
+MODULE_PARM_DESC(numifbs, "Number of ifb devices");
 
-static void ri_tasklet(unsigned long dev);
-static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
-static int ifb_open(struct net_device *dev);
-static int ifb_close(struct net_device *dev);
+/* Number of TX queues per ifb */
+static int numtxqs = 1;
+module_param(numtxqs, int, 0444);
+MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
 
-static void ri_tasklet(unsigned long dev)
+static int ifb_thread(void *priv)
 {
-
-	struct net_device *_dev = (struct net_device *)dev;
-	struct ifb_private *dp = netdev_priv(_dev);
-	struct net_device_stats *stats = &_dev->stats;
-	struct netdev_queue *txq;
+	struct ifb_private_q *pq = priv;
+	struct net_device *dev = pq->dev;
+	int num = pq - ((struct ifb_private *)netdev_priv(dev))->pq;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
 	struct sk_buff *skb;
-
-	txq = netdev_get_tx_queue(_dev, 0);
-	dp->st_task_enter++;
-	if ((skb = skb_peek(&dp->tq)) == NULL) {
-		dp->st_txq_refl_try++;
-		if (__netif_tx_trylock(txq)) {
-			dp->st_rxq_enter++;
-			while ((skb = skb_dequeue(&dp->rq)) != NULL) {
-				skb_queue_tail(&dp->tq, skb);
-				dp->st_rx2tx_tran++;
-			}
-			__netif_tx_unlock(txq);
-		} else {
-			/* reschedule */
-			dp->st_rxq_notenter++;
-			goto resched;
+	DEFINE_WAIT(wait);
+
+	while (1) {
+		/* move skb from rq to tq */
+		while (1) {
+			prepare_to_wait(&pq->wq, &wait, TASK_UNINTERRUPTIBLE);
+			__netif_tx_lock_bh(txq);
+			while ((skb = skb_dequeue(&pq->rq)) != NULL)
+				skb_queue_tail(&pq->tq, skb);
+			if (netif_queue_stopped(dev))
+				netif_wake_queue(dev);
+			__netif_tx_unlock_bh(txq);
+			if (kthread_should_stop() || !skb_queue_empty(&pq->tq))
+				break;
+			schedule();
 		}
-	}
-
-	while ((skb = skb_dequeue(&dp->tq)) != NULL) {
-		u32 from = G_TC_FROM(skb->tc_verd);
-
-		skb->tc_verd = 0;
-		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
-		stats->tx_packets++;
-		stats->tx_bytes +=skb->len;
-
-		rcu_read_lock();
-		skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
-		if (!skb->dev) {
-			rcu_read_unlock();
-			dev_kfree_skb(skb);
-			stats->tx_dropped++;
+		finish_wait(&pq->wq, &wait);
+		if (kthread_should_stop())
 			break;
+		if (need_resched())
+			schedule();
+
+		/* transfer packets */
+		while ((skb = skb_dequeue(&pq->tq)) != NULL) {
+			u32 from = G_TC_FROM(skb->tc_verd);
+	
+			skb->tc_verd = 0;
+			skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
+			txq->tx_packets++;
+			txq->tx_bytes +=skb->len;
+
+			rcu_read_lock();
+			skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
+			if (!skb->dev) {
+				rcu_read_unlock();
+				dev_kfree_skb(skb);
+				txq->tx_dropped++;
+				break;
+			}
+			rcu_read_unlock();
+			skb->iif = dev->ifindex;
+	
+			if (from & AT_EGRESS) {
+				dev_queue_xmit(skb);
+			} else if (from & AT_INGRESS) {
+				skb_pull(skb, skb->dev->hard_header_len);
+				netif_rx_ni(skb);
+			} else
+				BUG();
 		}
-		rcu_read_unlock();
-		skb->iif = _dev->ifindex;
-
-		if (from & AT_EGRESS) {
-			dp->st_rx_frm_egr++;
-			dev_queue_xmit(skb);
-		} else if (from & AT_INGRESS) {
-			dp->st_rx_frm_ing++;
-			skb_pull(skb, skb->dev->hard_header_len);
-			netif_rx(skb);
-		} else
-			BUG();
-	}
-
-	if (__netif_tx_trylock(txq)) {
-		dp->st_rxq_check++;
-		if ((skb = skb_peek(&dp->rq)) == NULL) {
-			dp->tasklet_pending = 0;
-			if (netif_queue_stopped(_dev))
-				netif_wake_queue(_dev);
-		} else {
-			dp->st_rxq_rsch++;
-			__netif_tx_unlock(txq);
-			goto resched;
-		}
-		__netif_tx_unlock(txq);
-	} else {
-resched:
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
 	}
 
+	return 0;
 }
 
-static const struct net_device_ops ifb_netdev_ops = {
-	.ndo_open	= ifb_open,
-	.ndo_stop	= ifb_close,
-	.ndo_start_xmit	= ifb_xmit,
-	.ndo_validate_addr = eth_validate_addr,
-};
-
-static void ifb_setup(struct net_device *dev)
+struct net_device_stats* ifb_get_stats(struct net_device *dev)
 {
-	/* Initialize the device structure. */
-	dev->destructor = free_netdev;
-	dev->netdev_ops = &ifb_netdev_ops;
+	struct net_device_stats *stats = &dev->stats;
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	struct netdev_queue *txq;
+	int i;
+	unsigned long rx_packets = 0, rx_bytes = 0, rx_dropped = 0;
+	unsigned long tx_packets = 0, tx_bytes = 0, tx_dropped = 0;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		rx_packets += pq[i].rx_packets;
+		rx_bytes += pq[i].rx_bytes;
+		rx_dropped += pq[i].rx_dropped;
+		txq = netdev_get_tx_queue(dev, i);
+		tx_packets += txq->tx_packets;
+		tx_bytes += txq->tx_bytes;
+		tx_dropped += txq->tx_dropped;
+	}
 
-	/* Fill in device structure with ethernet-generic values. */
-	ether_setup(dev);
-	dev->tx_queue_len = TX_Q_LIMIT;
+	stats->rx_packets = rx_packets;
+	stats->rx_bytes = rx_bytes;
+	stats->rx_dropped = rx_dropped;
+	stats->tx_packets = tx_packets;
+	stats->tx_bytes = tx_bytes;
+	stats->tx_dropped = tx_dropped;
 
-	dev->flags |= IFF_NOARP;
-	dev->flags &= ~IFF_MULTICAST;
-	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
-	random_ether_addr(dev->dev_addr);
+	return stats;
 }
 
 static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct ifb_private *dp = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
 	u32 from = G_TC_FROM(skb->tc_verd);
+	int num = skb_get_queue_mapping(skb);
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq + num;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
 
-	stats->rx_packets++;
-	stats->rx_bytes+=skb->len;
+	pq->rx_packets++;
+	pq->rx_bytes += skb->len;
 
 	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->iif) {
 		dev_kfree_skb(skb);
-		stats->rx_dropped++;
+		pq->rx_dropped++;
 		return NETDEV_TX_OK;
 	}
 
-	if (skb_queue_len(&dp->rq) >= dev->tx_queue_len) {
+	txq->trans_start = jiffies;
+	skb_queue_tail(&pq->rq, skb);
+	if (skb_queue_len(&pq->rq) >= dev->tx_queue_len)
 		netif_stop_queue(dev);
-	}
-
-	dev->trans_start = jiffies;
-	skb_queue_tail(&dp->rq, skb);
-	if (!dp->tasklet_pending) {
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
-	}
+	if (skb_queue_len(&pq->rq) == 1)
+		wake_up(&pq->wq);
 
 	return NETDEV_TX_OK;
 }
@@ -195,26 +191,162 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 static int ifb_close(struct net_device *dev)
 {
 	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	int i;
 
-	tasklet_kill(&dp->ifb_tasklet);
 	netif_stop_queue(dev);
-	skb_queue_purge(&dp->rq);
-	skb_queue_purge(&dp->tq);
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		kthread_stop(pq[i].task);
+		skb_queue_purge(&pq[i].tq);
+		skb_queue_purge(&pq[i].rq);
+	}
+
 	return 0;
 }
 
 static int ifb_open(struct net_device *dev)
 {
 	struct ifb_private *dp = netdev_priv(dev);
-
-	tasklet_init(&dp->ifb_tasklet, ri_tasklet, (unsigned long)dev);
-	skb_queue_head_init(&dp->rq);
-	skb_queue_head_init(&dp->tq);
+	struct ifb_private_q *pq = dp->pq;
+	int i;
+	
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		pq[i].task = kthread_run(ifb_thread, &pq[i], "%s/%d", dev->name,
+					 i);
+		if (IS_ERR(pq[i].task)) {
+			int err = PTR_ERR(pq[i].task);
+			while (--i >= 0)
+				kthread_stop(pq[i].task);
+			return err;
+		}
+	}
 	netif_start_queue(dev);
 
 	return 0;
 }
 
+static u32 simple_tx_hashrnd;
+
+static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	u32 addr1, addr2;
+	u32 hash, ihl;
+	union {
+		u16 in16[2];
+		u32 in32;
+	} ports;
+	u8 ip_proto;
+
+	if ((hash = skb_rx_queue_recorded(skb))) {
+		while (hash >= dev->real_num_tx_queues)
+			hash -= dev->real_num_tx_queues;
+		return hash;
+	}
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
+			ip_proto = ip_hdr(skb)->protocol;
+		else
+			ip_proto = 0;
+		addr1 = ip_hdr(skb)->saddr;
+		addr2 = ip_hdr(skb)->daddr;
+		ihl = ip_hdr(skb)->ihl << 2;
+		break;
+	case __constant_htons(ETH_P_IPV6):
+		ip_proto = ipv6_hdr(skb)->nexthdr;
+		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
+		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
+		ihl = 10;
+		break;
+	default:
+		return 0;
+	}
+	if (addr1 > addr2)
+		swap(addr1, addr2);
+
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE:
+		ports.in32 = *((u32 *) (skb_network_header(skb) + ihl));
+		if (ports.in16[0] > ports.in16[1])
+			swap(ports.in16[0], ports.in16[1]);
+		break;
+
+	default:
+		ports.in32 = 0;
+		break;
+	}
+
+	hash = jhash_3words(addr1, addr2, ports.in32,
+			    simple_tx_hashrnd ^ ip_proto);
+
+	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
+}
+
+static int ifb_init(struct net_device *dev)
+{
+	struct ifb_private *dp = netdev_priv(dev);
+	struct ifb_private_q *pq = dp->pq;
+	int i;
+
+	pq = kmalloc(sizeof(*pq) * dev->real_num_tx_queues, GFP_KERNEL);
+	if (pq == NULL)
+		return -ENOMEM;
+	dp->pq = pq;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		pq[i].dev = dev;
+		skb_queue_head_init(&pq[i].rq);
+		skb_queue_head_init(&pq[i].tq);
+		init_waitqueue_head(&pq[i].wq);
+		pq[i].rx_packets = 0;
+		pq[i].rx_bytes = 0;
+		pq[i].rx_dropped = 0;
+	}
+
+	return 0;
+}
+
+static void ifb_uninit(struct net_device *dev)
+{
+	struct ifb_private *dp = netdev_priv(dev);
+
+	kfree(dp->pq);
+}
+
+static const struct net_device_ops ifb_netdev_ops = {
+	.ndo_init		= ifb_init,
+	.ndo_uninit		= ifb_uninit,
+	.ndo_open		= ifb_open,
+	.ndo_stop		= ifb_close,
+	.ndo_start_xmit		= ifb_xmit,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_select_queue	= ifb_select_queue,
+	.ndo_get_stats		= ifb_get_stats,
+};
+
+static void ifb_setup(struct net_device *dev)
+{
+	/* Initialize the device structure. */
+	dev->destructor = free_netdev;
+	dev->netdev_ops = &ifb_netdev_ops;
+
+	/* Fill in device structure with ethernet-generic values. */
+	ether_setup(dev);
+	dev->tx_queue_len = TX_Q_LIMIT;
+
+	dev->flags |= IFF_NOARP;
+	dev->flags &= ~IFF_MULTICAST;
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	random_ether_addr(dev->dev_addr);
+}
+
 static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 {
 	if (tb[IFLA_ADDRESS]) {
@@ -226,24 +358,36 @@ static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
+static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
+			     unsigned int *num_tx_queues,
+			     unsigned int *real_num_tx_queues)
+{
+	if (tb[IFLA_NTXQ]) {
+		*num_tx_queues = nla_get_u16(tb[IFLA_NTXQ]);
+		*real_num_tx_queues = *num_tx_queues;
+	} else {
+		*num_tx_queues = numtxqs;
+		*real_num_tx_queues = numtxqs;
+	}
+
+	return 0;
+}
+
 static struct rtnl_link_ops ifb_link_ops __read_mostly = {
 	.kind		= "ifb",
-	.priv_size	= sizeof(struct ifb_private),
 	.setup		= ifb_setup,
 	.validate	= ifb_validate,
+	.get_tx_queues	= ifb_get_tx_queues,
+	.priv_size	= sizeof(struct ifb_private),
 };
 
-/* Number of ifb devices to be set up by this module. */
-module_param(numifbs, int, 0);
-MODULE_PARM_DESC(numifbs, "Number of ifb devices");
-
 static int __init ifb_init_one(int index)
 {
 	struct net_device *dev_ifb;
 	int err;
 
-	dev_ifb = alloc_netdev(sizeof(struct ifb_private),
-				 "ifb%d", ifb_setup);
+	dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private), "ifb%d",
+				  ifb_setup, numtxqs);
 
 	if (!dev_ifb)
 		return -ENOMEM;
@@ -268,9 +412,9 @@ static int __init ifb_init_module(void)
 {
 	int i, err;
 
+	get_random_bytes(&simple_tx_hashrnd, 4);
 	rtnl_lock();
 	err = __rtnl_link_register(&ifb_link_ops);
-
 	for (i = 0; i < numifbs && !err; i++)
 		err = ifb_init_one(i);
 	if (err)
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 1d3b242..99da411 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -78,6 +78,7 @@ enum {
 #define IFLA_LINKINFO IFLA_LINKINFO
 	IFLA_NET_NS_PID,
 	IFLA_IFALIAS,
+	IFLA_NTXQ,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 33148a5..7b94fd7 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -597,6 +597,7 @@ static inline size_t if_nlmsg_size(const struct net_device *dev)
 	       + nla_total_size(4) /* IFLA_MASTER */
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
 	       + nla_total_size(1) /* IFLA_LINKMODE */
+	       + nla_total_size(2) /* IFLA_NTXQ */
 	       + rtnl_link_get_size(dev); /* IFLA_LINKINFO */
 }
 
@@ -627,6 +628,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 		   netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
 	NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
 	NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+	NLA_PUT_U16(skb, IFLA_NTXQ, dev->real_num_tx_queues);
 
 	if (dev->ifindex != dev->iflink)
 		NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
@@ -725,6 +727,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
 	[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },
+	[IFLA_NTXQ]		= { .type = NLA_U16 },
 };
 EXPORT_SYMBOL(ifla_policy);
 



^ permalink raw reply related	[flat|nested] 62+ messages in thread

end of thread, other threads:[~2009-11-17  6:02 UTC | newest]

Thread overview: 62+ messages
2009-11-10  8:30 [PATCH] ifb: add multi-queue support Changli Gao
2009-11-10  9:07 ` Eric Dumazet
2009-11-10  9:43   ` Changli Gao
2009-11-10 10:57     ` Eric Dumazet
2009-11-10 11:14       ` Changli Gao
2009-11-10 11:41         ` Patrick McHardy
2009-11-10 12:14           ` Changli Gao
2009-11-10 12:19             ` Patrick McHardy
2009-11-10 12:37               ` Changli Gao
2009-11-10 12:45                 ` Patrick McHardy
2009-11-10 13:06                   ` Changli Gao
2009-11-10 13:34                     ` Eric Dumazet
2009-11-10 13:49                       ` Changli Gao
2009-11-10 16:45                         ` Stephen Hemminger
2009-11-11  6:30                           ` Changli Gao
2009-11-10 10:29 ` Patrick McHardy
2009-11-10 10:48   ` Changli Gao
2009-11-10 10:55     ` Eric Dumazet
2009-11-11  9:51 Changli Gao
2009-11-11  9:56 ` Changli Gao
2009-11-11 10:30 ` Eric Dumazet
2009-11-11 10:57   ` Changli Gao
2009-11-11 15:59 ` Patrick McHardy
2009-11-12  3:12   ` Changli Gao
2009-11-12  8:52     ` Jarek Poplawski
2009-11-12  9:32       ` Changli Gao
2009-11-12 15:10     ` Patrick McHardy
2009-11-13  1:28       ` Changli Gao
2009-11-12  9:44 ` Changli Gao
2009-11-12  9:48   ` Changli Gao
2009-11-12 15:11     ` Patrick McHardy
2009-11-13  1:32       ` Changli Gao
2009-11-13  7:18         ` Patrick McHardy
2009-11-12 12:48   ` Eric Dumazet
2009-11-13  1:26     ` Changli Gao
2009-11-13  5:56       ` Eric Dumazet
2009-11-13  6:16         ` Changli Gao
2009-11-13  7:45           ` Jarek Poplawski
2009-11-13  8:54             ` Changli Gao
2009-11-13  9:18               ` Jarek Poplawski
2009-11-13  9:38                 ` Changli Gao
2009-11-13  9:57                   ` Jarek Poplawski
2009-11-13 11:25                     ` Changli Gao
2009-11-13 12:32                       ` Jarek Poplawski
2009-11-13 13:10                       ` Eric Dumazet
2009-11-13 16:15                   ` Stephen Hemminger
2009-11-13 23:28                     ` Changli Gao
2009-11-13 23:32                       ` Stephen Hemminger
2009-11-13 23:42                         ` Changli Gao
2009-11-14 12:53                           ` Eric Dumazet
2009-11-14 13:30                             ` Changli Gao
2009-11-13 13:55               ` Eric Dumazet
2009-11-13  4:37   ` Changli Gao
2009-11-16 16:39     ` Stephen Hemminger
2009-11-17  3:10       ` David Miller
2009-11-17  5:38         ` Changli Gao
2009-11-17  6:02           ` Stephen Hemminger
2009-11-13  4:42 Changli Gao
2009-11-13  4:46 ` Changli Gao
2009-11-16  7:31 Changli Gao
2009-11-16  8:19 ` Eric Dumazet
2009-11-16  8:43   ` Changli Gao
