netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue
@ 2015-04-23 21:29 Mahesh Bandewar
  2015-04-24  0:32 ` Eric Dumazet
  2015-04-24 20:15 ` Dan Williams
  0 siblings, 2 replies; 10+ messages in thread
From: Mahesh Bandewar @ 2015-04-23 21:29 UTC (permalink / raw)
  To: netdev, Eric Dumazet; +Cc: Dan Williams, David Miller, Mahesh Bandewar

Processing multicast / broadcast in the fast path is performance draining,
and having more links means more cloning, bringing performance
down further.

Broadcast, in particular, needs to be given to all the virtual links.
The earlier trick of enabling the broadcast bit for IPv4-only interfaces does
not really work since it breaks autoconf. This means broadcast has to be
enabled for all the links if protocol-specific hacks are not to be added into
the driver.

This patch defers all (incoming as well as outgoing) multicast traffic to
a work-queue leaving only the unicast traffic in the fast-path. Now if we
need to apply any additional tricks to further reduce the impact of this
(multicast / broadcast) type of traffic, it can be implemented while
processing this work without affecting the fast-path.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
---
 drivers/net/ipvlan/ipvlan.h      |   5 ++
 drivers/net/ipvlan/ipvlan_core.c | 134 +++++++++++++++++++++++++--------------
 drivers/net/ipvlan/ipvlan_main.c |   5 ++
 3 files changed, 96 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 54549a6223dd..953a97492fab 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -39,6 +39,8 @@
 #define IPVLAN_MAC_FILTER_SIZE	(1 << IPVLAN_MAC_FILTER_BITS)
 #define IPVLAN_MAC_FILTER_MASK	(IPVLAN_MAC_FILTER_SIZE - 1)
 
+#define IPVLAN_QBACKLOG_LIMIT	1000
+
 typedef enum {
 	IPVL_IPV6 = 0,
 	IPVL_ICMPV6,
@@ -93,6 +95,8 @@ struct ipvl_port {
 	struct hlist_head	hlhead[IPVLAN_HASH_SIZE];
 	struct list_head	ipvlans;
 	struct rcu_head		rcu;
+	struct work_struct	wq;
+	struct sk_buff_head	backlog;
 	int			count;
 	u16			mode;
 };
@@ -112,6 +116,7 @@ void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval);
 void ipvlan_init_secret(void);
 unsigned int ipvlan_mac_hash(const unsigned char *addr);
 rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
+void ipvlan_process_multicast(struct work_struct *work);
 int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev);
 void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr);
 struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index c30b5c300c05..58891666088c 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -189,64 +189,85 @@ unsigned int ipvlan_mac_hash(const unsigned char *addr)
 	return hash & IPVLAN_MAC_FILTER_MASK;
 }
 
-static void ipvlan_multicast_frame(struct ipvl_port *port, struct sk_buff *skb,
-				   const struct ipvl_dev *in_dev, bool local)
+void ipvlan_process_multicast(struct work_struct *work)
 {
-	struct ethhdr *eth = eth_hdr(skb);
+	struct ipvl_port *port = container_of(work, struct ipvl_port, wq);
+	struct ethhdr *ethh;
 	struct ipvl_dev *ipvlan;
-	struct sk_buff *nskb;
+	struct sk_buff *skb, *nskb;
+	struct sk_buff_head list;
 	unsigned int len;
 	unsigned int mac_hash;
 	int ret;
+	u8 pkt_type;
+	bool hlocal, dlocal;
 
-	if (skb->protocol == htons(ETH_P_PAUSE))
-		return;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) {
-		if (local && (ipvlan == in_dev))
-			continue;
+	__skb_queue_head_init(&list);
 
-		mac_hash = ipvlan_mac_hash(eth->h_dest);
-		if (!test_bit(mac_hash, ipvlan->mac_filters))
-			continue;
+	spin_lock_bh(&port->backlog.lock);
+	skb_queue_splice_tail_init(&port->backlog, &list);
+	spin_unlock_bh(&port->backlog.lock);
 
-		ret = NET_RX_DROP;
-		len = skb->len + ETH_HLEN;
-		nskb = skb_clone(skb, GFP_ATOMIC);
-		if (!nskb)
-			goto mcast_acct;
+	while ((skb = __skb_dequeue(&list)) != NULL) {
+		ethh = eth_hdr(skb);
+		hlocal = ether_addr_equal(ethh->h_source, port->dev->dev_addr);
+		mac_hash = ipvlan_mac_hash(ethh->h_dest);
 
-		if (ether_addr_equal(eth->h_dest, ipvlan->phy_dev->broadcast))
-			nskb->pkt_type = PACKET_BROADCAST;
+		if (ether_addr_equal(ethh->h_dest, port->dev->broadcast))
+			pkt_type = PACKET_BROADCAST;
 		else
-			nskb->pkt_type = PACKET_MULTICAST;
+			pkt_type = PACKET_MULTICAST;
+
+		dlocal = false;
+		rcu_read_lock();
+		list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) {
+			if (hlocal && (ipvlan->dev == skb->dev)) {
+				dlocal = true;
+				continue;
+			}
+			if (!test_bit(mac_hash, ipvlan->mac_filters))
+				continue;
+
+			ret = NET_RX_DROP;
+			len = skb->len + ETH_HLEN;
+			nskb = skb_clone(skb, GFP_ATOMIC);
+			if (!nskb)
+				goto acct;
+
+			nskb->pkt_type = pkt_type;
+			nskb->dev = ipvlan->dev;
+			if (hlocal)
+				ret = dev_forward_skb(ipvlan->dev, nskb);
+			else
+				ret = netif_rx(nskb);
+acct:
+			ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true);
+		}
+		rcu_read_unlock();
 
-		nskb->dev = ipvlan->dev;
-		if (local)
-			ret = dev_forward_skb(ipvlan->dev, nskb);
+		if (!dlocal)
+			nskb = skb;
 		else
-			ret = netif_rx(nskb);
-mcast_acct:
-		ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true);
-	}
-	rcu_read_unlock();
+			nskb = skb_clone(skb, GFP_ATOMIC);
 
-	/* Locally generated? ...Forward a copy to the main-device as
-	 * well. On the RX side we'll ignore it (wont give it to any
-	 * of the virtual devices.
-	 */
-	if (local) {
-		nskb = skb_clone(skb, GFP_ATOMIC);
 		if (nskb) {
-			if (ether_addr_equal(eth->h_dest, port->dev->broadcast))
-				nskb->pkt_type = PACKET_BROADCAST;
-			else
-				nskb->pkt_type = PACKET_MULTICAST;
+			/* Always forward a copy to the master device. */
+			if (hlocal) {
+				dev_forward_skb(port->dev, nskb);
+			} else {
+				nskb->dev = port->dev;
+				netif_rx(nskb);
+			}
+		}
 
-			dev_forward_skb(port->dev, nskb);
+		if (dlocal) {
+			/* If the packet originated here, send it out. */
+			skb->dev = port->dev;
+			skb->pkt_type = pkt_type;
+			dev_queue_xmit(skb);
 		}
 	}
+	return;
 }
 
 static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff *skb,
@@ -446,6 +467,24 @@ out:
 	return ret;
 }
 
+static void ipvlan_multicast_enqueue(struct ipvl_port *port,
+				     struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_PAUSE))
+		return;
+
+	spin_lock(&port->backlog.lock);
+	if (skb_queue_len(&port->backlog) < IPVLAN_QBACKLOG_LIMIT) {
+		__skb_queue_tail(&port->backlog, skb);
+		spin_unlock(&port->backlog.lock);
+	} else {
+		spin_unlock(&port->backlog.lock);
+		atomic_long_inc(&skb->dev->rx_dropped);
+		kfree_skb(skb);
+	}
+	schedule_work(&port->wq);
+}
+
 static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
 {
 	const struct ipvl_dev *ipvlan = netdev_priv(dev);
@@ -493,11 +532,8 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
 		return dev_forward_skb(ipvlan->phy_dev, skb);
 
 	} else if (is_multicast_ether_addr(eth->h_dest)) {
-		u8 ip_summed = skb->ip_summed;
-
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-		ipvlan_multicast_frame(ipvlan->port, skb, ipvlan, true);
-		skb->ip_summed = ip_summed;
+		ipvlan_multicast_enqueue(ipvlan->port, skb);
+		return NET_XMIT_SUCCESS;
 	}
 
 	skb->dev = ipvlan->phy_dev;
@@ -581,8 +617,10 @@ static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb,
 	int addr_type;
 
 	if (is_multicast_ether_addr(eth->h_dest)) {
-		if (ipvlan_external_frame(skb, port))
-			ipvlan_multicast_frame(port, skb, NULL, false);
+		if (ipvlan_external_frame(skb, port)) {
+			ipvlan_multicast_enqueue(port, skb);
+			return RX_HANDLER_CONSUMED;
+		}
 	} else {
 		struct ipvl_addr *addr;
 
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 77b92a0fe557..a16d3017fdc3 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -54,6 +54,9 @@ static int ipvlan_port_create(struct net_device *dev)
 	for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++)
 		INIT_HLIST_HEAD(&port->hlhead[idx]);
 
+	skb_queue_head_init(&port->backlog);
+	INIT_WORK(&port->wq, ipvlan_process_multicast);
+
 	err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
 	if (err)
 		goto err;
@@ -72,6 +75,8 @@ static void ipvlan_port_destroy(struct net_device *dev)
 
 	dev->priv_flags &= ~IFF_IPVLAN_MASTER;
 	netdev_rx_handler_unregister(dev);
+	cancel_work_sync(&port->wq);
+	__skb_queue_purge(&port->backlog);
 	kfree_rcu(port, rcu);
 }
 
-- 
2.2.0.rc0.207.ga3a616c

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue
  2015-04-23 21:29 [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue Mahesh Bandewar
@ 2015-04-24  0:32 ` Eric Dumazet
  2015-04-24  2:54   ` Mahesh Bandewar
  2015-04-24 20:15 ` Dan Williams
  1 sibling, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2015-04-24  0:32 UTC (permalink / raw)
  To: Mahesh Bandewar; +Cc: netdev, Eric Dumazet, Dan Willems, David Miller

On Thu, 2015-04-23 at 14:29 -0700, Mahesh Bandewar wrote:

> +static void ipvlan_multicast_enqueue(struct ipvl_port *port,
> +				     struct sk_buff *skb)
> +{
> +	if (skb->protocol == htons(ETH_P_PAUSE))
> +		return;

But what happens to this packet ? It seems leaked ?

> +
> +	spin_lock(&port->backlog.lock);
> +	if (skb_queue_len(&port->backlog) < IPVLAN_QBACKLOG_LIMIT) {
> +		__skb_queue_tail(&port->backlog, skb);
> +		spin_unlock(&port->backlog.lock);
> +	} else {
> +		spin_unlock(&port->backlog.lock);
> +		atomic_long_inc(&skb->dev->rx_dropped);
> +		kfree_skb(skb);
> +	}
> +	schedule_work(&port->wq);

No point calling schedule_work(&port->wq); if packet was dropped.

We are under pressure, so don't add extra cpu cycles ;)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue
  2015-04-24  0:32 ` Eric Dumazet
@ 2015-04-24  2:54   ` Mahesh Bandewar
  2015-04-24  3:04     ` Eric Dumazet
  2015-04-24  4:28     ` David Miller
  0 siblings, 2 replies; 10+ messages in thread
From: Mahesh Bandewar @ 2015-04-24  2:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Eric Dumazet, Dan Willems, David Miller

On Thu, Apr 23, 2015 at 5:32 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2015-04-23 at 14:29 -0700, Mahesh Bandewar wrote:
>
>> +static void ipvlan_multicast_enqueue(struct ipvl_port *port,
>> +                                  struct sk_buff *skb)
>> +{
>> +     if (skb->protocol == htons(ETH_P_PAUSE))
>> +             return;
>
> But what happens to this packet ? It seems leaked ?
>
Hmm, will take care of it in v2.
>> +
>> +     spin_lock(&port->backlog.lock);
>> +     if (skb_queue_len(&port->backlog) < IPVLAN_QBACKLOG_LIMIT) {
>> +             __skb_queue_tail(&port->backlog, skb);
>> +             spin_unlock(&port->backlog.lock);
>> +     } else {
>> +             spin_unlock(&port->backlog.lock);
>> +             atomic_long_inc(&skb->dev->rx_dropped);
>> +             kfree_skb(skb);
>> +     }
>> +     schedule_work(&port->wq);
>
> No point calling schedule_work(&port->wq); if packet was dropped.
>
> We are under pressure, so don't add extra cpu cycles ;)
>
The only possibility of schedule_work() while doing kfree_skb() is
when the queue limit is reached. How can the queue be reduced if not
scheduled? May be I'm missing something..
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue
  2015-04-24  2:54   ` Mahesh Bandewar
@ 2015-04-24  3:04     ` Eric Dumazet
  2015-04-24  4:28     ` David Miller
  1 sibling, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2015-04-24  3:04 UTC (permalink / raw)
  To: Mahesh Bandewar; +Cc: netdev, Eric Dumazet, Dan Willems, David Miller

On Thu, 2015-04-23 at 19:54 -0700, Mahesh Bandewar wrote:

> The only possibility of schedule_work() while doing kfree_skb() is
> when the queue limit is reached. How can the queue be reduced if not
> scheduled? May be I'm missing something..


What is the point calling schedule_work() if you queued no packet at
this round ? 

Do you really think it will expedite the queue drain ?

Just write instead :


     spin_lock(&port->backlog.lock);
     if (skb_queue_len(&port->backlog) < IPVLAN_QBACKLOG_LIMIT) {
             __skb_queue_tail(&port->backlog, skb);
             spin_unlock(&port->backlog.lock);
     	     schedule_work(&port->wq);
     } else {
             spin_unlock(&port->backlog.lock);
             atomic_long_inc(&skb->dev->rx_dropped);
             kfree_skb(skb);
     }

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue
  2015-04-24  2:54   ` Mahesh Bandewar
  2015-04-24  3:04     ` Eric Dumazet
@ 2015-04-24  4:28     ` David Miller
  2015-04-24  4:40       ` Mahesh Bandewar
  1 sibling, 1 reply; 10+ messages in thread
From: David Miller @ 2015-04-24  4:28 UTC (permalink / raw)
  To: maheshb; +Cc: eric.dumazet, netdev, edumazet, dcbw

From: Mahesh Bandewar <maheshb@google.com>
Date: Thu, 23 Apr 2015 19:54:29 -0700

> On Thu, Apr 23, 2015 at 5:32 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> On Thu, 2015-04-23 at 14:29 -0700, Mahesh Bandewar wrote:
>>
>>> +static void ipvlan_multicast_enqueue(struct ipvl_port *port,
>>> +                                  struct sk_buff *skb)
>>> +{
>>> +     if (skb->protocol == htons(ETH_P_PAUSE))
>>> +             return;
>>
>> But what happens to this packet ? It seems leaked ?
>>
> Hmm, will take care of it in v2.

Please also provide a proper "[PATCH 0/3] " introduction posting
stating what the patch series is doing, and why.

Thanks.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue
  2015-04-24  4:28     ` David Miller
@ 2015-04-24  4:40       ` Mahesh Bandewar
  0 siblings, 0 replies; 10+ messages in thread
From: Mahesh Bandewar @ 2015-04-24  4:40 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, linux-netdev, Eric Dumazet, dcbw

On Thu, Apr 23, 2015 at 9:28 PM, David Miller <davem@davemloft.net> wrote:
> From: Mahesh Bandewar <maheshb@google.com>
> Date: Thu, 23 Apr 2015 19:54:29 -0700
>
>> On Thu, Apr 23, 2015 at 5:32 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>> On Thu, 2015-04-23 at 14:29 -0700, Mahesh Bandewar wrote:
>>>
>>>> +static void ipvlan_multicast_enqueue(struct ipvl_port *port,
>>>> +                                  struct sk_buff *skb)
>>>> +{
>>>> +     if (skb->protocol == htons(ETH_P_PAUSE))
>>>> +             return;
>>>
>>> But what happens to this packet ? It seems leaked ?
>>>
>> Hmm, will take care of it in v2.
>
> Please also provide a proper "[PATCH 0/3] " introduction posting
> stating what the patch series is doing, and why.
>
Will do.

> Thanks.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue
  2015-04-23 21:29 [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue Mahesh Bandewar
  2015-04-24  0:32 ` Eric Dumazet
@ 2015-04-24 20:15 ` Dan Williams
  2015-04-24 22:40   ` Mahesh Bandewar
  1 sibling, 1 reply; 10+ messages in thread
From: Dan Williams @ 2015-04-24 20:15 UTC (permalink / raw)
  To: Mahesh Bandewar; +Cc: netdev, Eric Dumazet, David Miller

On Thu, 2015-04-23 at 14:29 -0700, Mahesh Bandewar wrote:
> Processing multicast / broadcast in fast path is performance draining
> and having more links means more clonning and bringing performance
> down further.
> 
> Broadcast; in particular, need to be given to all the virtual links.
> Earlier tricks of enabling broadcast bit for IPv4 only interfaces are not
> really working since it fails autoconf. Which means enabling braodcast
> for all the links if protocol specific hacks do not have to be added into
> the driver.
> 
> This patch defers all (incoming as well as outgoing) multicast traffic to
> a work-queue leaving only the unicast traffic in the fast-path. Now if we
> need to apply any additional tricks to further reduce the impact of this
> (multicast / broadcast) type of traffic, it can be implemented while
> processing this work without affecting the fast-path.

These patches appear to work for me for the L2 + DHCP use-case, however
I experienced some quite odd behavior when pinging the ipvlan interface
from another machine.  I did this:

ip link add link eno1 type ipvlan mode l2
ip netns add ipv
ip link set dev ipvlan0 netns ipv
ip netns exec ipv /sbin/dhclient -B -4 -1 -v
-pf /run/dhclient-ipvlan0.pid -C adafdasdfasf ipvlan0
ip netns exec ipv ping 4.2.2.1 <success>

However, when pinging from another machine, I got very inconsistent ping
replies:

64 bytes from 192.168.1.38: icmp_seq=1 ttl=64 time=11.4 ms
64 bytes from 192.168.1.38: icmp_seq=16 ttl=64 time=64.9 ms
64 bytes from 192.168.1.38: icmp_seq=25 ttl=64 time=87.9 ms
64 bytes from 192.168.1.38: icmp_seq=30 ttl=64 time=242 ms
64 bytes from 192.168.1.38: icmp_seq=35 ttl=64 time=40.1 ms
64 bytes from 192.168.1.38: icmp_seq=36 ttl=64 time=60.9 ms

But I cannot reproduce that in a second run (though I haven't rebooted
to test cleanly again).

And oddly, the dhclient process takes a consistent 5% CPU and wireshark
running on eno1 (not even the ipvlan interface) jumps to 100% CPU along
with the dumpcap process taking another 25%, none of which are normal.
This is a 4-core i4790 box, so something is wrong here; is something
holding onto a spinlock for way too long?

But at least it handles the packets ok, so I say progress!  Happy to
help track down the CPU usage issue if you want to give me patches to
test.

Dan

> Signed-off-by: Mahesh Bandewar <maheshb@google.com>
> ---
>  drivers/net/ipvlan/ipvlan.h      |   5 ++
>  drivers/net/ipvlan/ipvlan_core.c | 134 +++++++++++++++++++++++++--------------
>  drivers/net/ipvlan/ipvlan_main.c |   5 ++
>  3 files changed, 96 insertions(+), 48 deletions(-)
> 
> diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
> index 54549a6223dd..953a97492fab 100644
> --- a/drivers/net/ipvlan/ipvlan.h
> +++ b/drivers/net/ipvlan/ipvlan.h
> @@ -39,6 +39,8 @@
>  #define IPVLAN_MAC_FILTER_SIZE	(1 << IPVLAN_MAC_FILTER_BITS)
>  #define IPVLAN_MAC_FILTER_MASK	(IPVLAN_MAC_FILTER_SIZE - 1)
>  
> +#define IPVLAN_QBACKLOG_LIMIT	1000
> +
>  typedef enum {
>  	IPVL_IPV6 = 0,
>  	IPVL_ICMPV6,
> @@ -93,6 +95,8 @@ struct ipvl_port {
>  	struct hlist_head	hlhead[IPVLAN_HASH_SIZE];
>  	struct list_head	ipvlans;
>  	struct rcu_head		rcu;
> +	struct work_struct	wq;
> +	struct sk_buff_head	backlog;
>  	int			count;
>  	u16			mode;
>  };
> @@ -112,6 +116,7 @@ void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval);
>  void ipvlan_init_secret(void);
>  unsigned int ipvlan_mac_hash(const unsigned char *addr);
>  rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
> +void ipvlan_process_multicast(struct work_struct *work);
>  int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev);
>  void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr);
>  struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
> diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
> index c30b5c300c05..58891666088c 100644
> --- a/drivers/net/ipvlan/ipvlan_core.c
> +++ b/drivers/net/ipvlan/ipvlan_core.c
> @@ -189,64 +189,85 @@ unsigned int ipvlan_mac_hash(const unsigned char *addr)
>  	return hash & IPVLAN_MAC_FILTER_MASK;
>  }
>  
> -static void ipvlan_multicast_frame(struct ipvl_port *port, struct sk_buff *skb,
> -				   const struct ipvl_dev *in_dev, bool local)
> +void ipvlan_process_multicast(struct work_struct *work)
>  {
> -	struct ethhdr *eth = eth_hdr(skb);
> +	struct ipvl_port *port = container_of(work, struct ipvl_port, wq);
> +	struct ethhdr *ethh;
>  	struct ipvl_dev *ipvlan;
> -	struct sk_buff *nskb;
> +	struct sk_buff *skb, *nskb;
> +	struct sk_buff_head list;
>  	unsigned int len;
>  	unsigned int mac_hash;
>  	int ret;
> +	u8 pkt_type;
> +	bool hlocal, dlocal;
>  
> -	if (skb->protocol == htons(ETH_P_PAUSE))
> -		return;
> -
> -	rcu_read_lock();
> -	list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) {
> -		if (local && (ipvlan == in_dev))
> -			continue;
> +	__skb_queue_head_init(&list);
>  
> -		mac_hash = ipvlan_mac_hash(eth->h_dest);
> -		if (!test_bit(mac_hash, ipvlan->mac_filters))
> -			continue;
> +	spin_lock_bh(&port->backlog.lock);
> +	skb_queue_splice_tail_init(&port->backlog, &list);
> +	spin_unlock_bh(&port->backlog.lock);
>  
> -		ret = NET_RX_DROP;
> -		len = skb->len + ETH_HLEN;
> -		nskb = skb_clone(skb, GFP_ATOMIC);
> -		if (!nskb)
> -			goto mcast_acct;
> +	while ((skb = __skb_dequeue(&list)) != NULL) {
> +		ethh = eth_hdr(skb);
> +		hlocal = ether_addr_equal(ethh->h_source, port->dev->dev_addr);
> +		mac_hash = ipvlan_mac_hash(ethh->h_dest);
>  
> -		if (ether_addr_equal(eth->h_dest, ipvlan->phy_dev->broadcast))
> -			nskb->pkt_type = PACKET_BROADCAST;
> +		if (ether_addr_equal(ethh->h_dest, port->dev->broadcast))
> +			pkt_type = PACKET_BROADCAST;
>  		else
> -			nskb->pkt_type = PACKET_MULTICAST;
> +			pkt_type = PACKET_MULTICAST;
> +
> +		dlocal = false;
> +		rcu_read_lock();
> +		list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) {
> +			if (hlocal && (ipvlan->dev == skb->dev)) {
> +				dlocal = true;
> +				continue;
> +			}
> +			if (!test_bit(mac_hash, ipvlan->mac_filters))
> +				continue;
> +
> +			ret = NET_RX_DROP;
> +			len = skb->len + ETH_HLEN;
> +			nskb = skb_clone(skb, GFP_ATOMIC);
> +			if (!nskb)
> +				goto acct;
> +
> +			nskb->pkt_type = pkt_type;
> +			nskb->dev = ipvlan->dev;
> +			if (hlocal)
> +				ret = dev_forward_skb(ipvlan->dev, nskb);
> +			else
> +				ret = netif_rx(nskb);
> +acct:
> +			ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true);
> +		}
> +		rcu_read_unlock();
>  
> -		nskb->dev = ipvlan->dev;
> -		if (local)
> -			ret = dev_forward_skb(ipvlan->dev, nskb);
> +		if (!dlocal)
> +			nskb = skb;
>  		else
> -			ret = netif_rx(nskb);
> -mcast_acct:
> -		ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true);
> -	}
> -	rcu_read_unlock();
> +			nskb = skb_clone(skb, GFP_ATOMIC);
>  
> -	/* Locally generated? ...Forward a copy to the main-device as
> -	 * well. On the RX side we'll ignore it (wont give it to any
> -	 * of the virtual devices.
> -	 */
> -	if (local) {
> -		nskb = skb_clone(skb, GFP_ATOMIC);
>  		if (nskb) {
> -			if (ether_addr_equal(eth->h_dest, port->dev->broadcast))
> -				nskb->pkt_type = PACKET_BROADCAST;
> -			else
> -				nskb->pkt_type = PACKET_MULTICAST;
> +			/* Always forward a copy to the master device. */
> +			if (hlocal) {
> +				dev_forward_skb(port->dev, nskb);
> +			} else {
> +				nskb->dev = port->dev;
> +				netif_rx(nskb);
> +			}
> +		}
>  
> -			dev_forward_skb(port->dev, nskb);
> +		if (dlocal) {
> +			/* If the packet originated here, send it out. */
> +			skb->dev = port->dev;
> +			skb->pkt_type = pkt_type;
> +			dev_queue_xmit(skb);
>  		}
>  	}
> +	return;
>  }
>  
>  static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff *skb,
> @@ -446,6 +467,24 @@ out:
>  	return ret;
>  }
>  
> +static void ipvlan_multicast_enqueue(struct ipvl_port *port,
> +				     struct sk_buff *skb)
> +{
> +	if (skb->protocol == htons(ETH_P_PAUSE))
> +		return;
> +
> +	spin_lock(&port->backlog.lock);
> +	if (skb_queue_len(&port->backlog) < IPVLAN_QBACKLOG_LIMIT) {
> +		__skb_queue_tail(&port->backlog, skb);
> +		spin_unlock(&port->backlog.lock);
> +	} else {
> +		spin_unlock(&port->backlog.lock);
> +		atomic_long_inc(&skb->dev->rx_dropped);
> +		kfree_skb(skb);
> +	}
> +	schedule_work(&port->wq);
> +}
> +
>  static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
>  {
>  	const struct ipvl_dev *ipvlan = netdev_priv(dev);
> @@ -493,11 +532,8 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
>  		return dev_forward_skb(ipvlan->phy_dev, skb);
>  
>  	} else if (is_multicast_ether_addr(eth->h_dest)) {
> -		u8 ip_summed = skb->ip_summed;
> -
> -		skb->ip_summed = CHECKSUM_UNNECESSARY;
> -		ipvlan_multicast_frame(ipvlan->port, skb, ipvlan, true);
> -		skb->ip_summed = ip_summed;
> +		ipvlan_multicast_enqueue(ipvlan->port, skb);
> +		return NET_XMIT_SUCCESS;
>  	}
>  
>  	skb->dev = ipvlan->phy_dev;
> @@ -581,8 +617,10 @@ static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb,
>  	int addr_type;
>  
>  	if (is_multicast_ether_addr(eth->h_dest)) {
> -		if (ipvlan_external_frame(skb, port))
> -			ipvlan_multicast_frame(port, skb, NULL, false);
> +		if (ipvlan_external_frame(skb, port)) {
> +			ipvlan_multicast_enqueue(port, skb);
> +			return RX_HANDLER_CONSUMED;
> +		}
>  	} else {
>  		struct ipvl_addr *addr;
>  
> diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
> index 77b92a0fe557..a16d3017fdc3 100644
> --- a/drivers/net/ipvlan/ipvlan_main.c
> +++ b/drivers/net/ipvlan/ipvlan_main.c
> @@ -54,6 +54,9 @@ static int ipvlan_port_create(struct net_device *dev)
>  	for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++)
>  		INIT_HLIST_HEAD(&port->hlhead[idx]);
>  
> +	skb_queue_head_init(&port->backlog);
> +	INIT_WORK(&port->wq, ipvlan_process_multicast);
> +
>  	err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
>  	if (err)
>  		goto err;
> @@ -72,6 +75,8 @@ static void ipvlan_port_destroy(struct net_device *dev)
>  
>  	dev->priv_flags &= ~IFF_IPVLAN_MASTER;
>  	netdev_rx_handler_unregister(dev);
> +	cancel_work_sync(&port->wq);
> +	__skb_queue_purge(&port->backlog);
>  	kfree_rcu(port, rcu);
>  }
>  

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue
  2015-04-24 20:15 ` Dan Williams
@ 2015-04-24 22:40   ` Mahesh Bandewar
  2015-04-24 22:59     ` Dan Williams
  0 siblings, 1 reply; 10+ messages in thread
From: Mahesh Bandewar @ 2015-04-24 22:40 UTC (permalink / raw)
  To: Dan Williams; +Cc: netdev, Eric Dumazet, David Miller

On Fri, Apr 24, 2015 at 1:15 PM, Dan Williams <dcbw@redhat.com> wrote:
> On Thu, 2015-04-23 at 14:29 -0700, Mahesh Bandewar wrote:
>> Processing multicast / broadcast in fast path is performance draining
>> and having more links means more clonning and bringing performance
>> down further.
>>
>> Broadcast; in particular, need to be given to all the virtual links.
>> Earlier tricks of enabling broadcast bit for IPv4 only interfaces are not
>> really working since it fails autoconf. Which means enabling braodcast
>> for all the links if protocol specific hacks do not have to be added into
>> the driver.
>>
>> This patch defers all (incoming as well as outgoing) multicast traffic to
>> a work-queue leaving only the unicast traffic in the fast-path. Now if we
>> need to apply any additional tricks to further reduce the impact of this
>> (multicast / broadcast) type of traffic, it can be implemented while
>> processing this work without affecting the fast-path.
>
> These patches appear to work for me for the L2 + DHCP use-case, however
> I experienced some quite odd behavior when pinging the ipvlan interface
> from another machine.  I did this:
>
> ip link add link eno1 type ipvlan mode l2
> ip netns add ipv
> ip link set dev ipvlan0 netns ipv
> ip netns exec ipv /sbin/dhclient -B -4 -1 -v
> -pf /run/dhclient-ipvlan0.pid -C adafdasdfasf ipvlan0
> ip netns exec ping 4.2.2.1 <success>
>
> However, when pinging from another machine, I got very inconsistent ping
> replies:
>
> 64 bytes from 192.168.1.38: icmp_seq=1 ttl=64 time=11.4 ms
> 64 bytes from 192.168.1.38: icmp_seq=16 ttl=64 time=64.9 ms
> 64 bytes from 192.168.1.38: icmp_seq=25 ttl=64 time=87.9 ms
> 64 bytes from 192.168.1.38: icmp_seq=30 ttl=64 time=242 ms
> 64 bytes from 192.168.1.38: icmp_seq=35 ttl=64 time=40.1 ms
> 64 bytes from 192.168.1.38: icmp_seq=36 ttl=64 time=60.9 ms
>
We know that there is that PAUSE frame leak but that should not cause
this behavior if those are present in your network. The sched_work()
which is wrong (as pointed by Eric) especially when the machine is
busy and that might trigger something like this. Almost every 10th -
15th ping packet seems to be processed correctly.

I did get consistent results as opposed what you have shown here, but
will dig some more to see if something obviously wrong here.

> But I cannot reproduce that in a second run (though I haven't rebooted
> to test cleanly again).
>
> And oddly, the dhclient process takes a consistent 5% CPU and wireshark
> running on eno1 (not even the ipvlan interface) jumps to 100% CPU along
> with the dumpcap process taking another 25%, none of which are normal.
> This is a 4-core i4790 box, so something is wrong here; is something
> holding onto a spinlock for way too long?
>
> But at least it handles the packets ok, so I say progress!  Happy to
> help track down the CPU usage issue if you want to give me patches to
> test.
>
Which patch(es) you are referring to?

> Dan
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue
  2015-04-24 22:40   ` Mahesh Bandewar
@ 2015-04-24 22:59     ` Dan Williams
  2015-04-27 18:14       ` Mahesh Bandewar
  0 siblings, 1 reply; 10+ messages in thread
From: Dan Williams @ 2015-04-24 22:59 UTC (permalink / raw)
  To: Mahesh Bandewar; +Cc: netdev, Eric Dumazet, David Miller

On Fri, 2015-04-24 at 15:40 -0700, Mahesh Bandewar wrote:
> On Fri, Apr 24, 2015 at 1:15 PM, Dan Williams <dcbw@redhat.com> wrote:
> > On Thu, 2015-04-23 at 14:29 -0700, Mahesh Bandewar wrote:
> >> Processing multicast / broadcast in fast path is performance draining
> >> and having more links means more clonning and bringing performance
> >> down further.
> >>
> >> Broadcast; in particular, need to be given to all the virtual links.
> >> Earlier tricks of enabling broadcast bit for IPv4 only interfaces are not
> >> really working since it fails autoconf. Which means enabling braodcast
> >> for all the links if protocol specific hacks do not have to be added into
> >> the driver.
> >>
> >> This patch defers all (incoming as well as outgoing) multicast traffic to
> >> a work-queue leaving only the unicast traffic in the fast-path. Now if we
> >> need to apply any additional tricks to further reduce the impact of this
> >> (multicast / broadcast) type of traffic, it can be implemented while
> >> processing this work without affecting the fast-path.
> >
> > These patches appear to work for me for the L2 + DHCP use-case, however
> > I experienced some quite odd behavior when pinging the ipvlan interface
> > from another machine.  I did this:
> >
> > ip link add link eno1 type ipvlan mode l2
> > ip netns add ipv
> > ip link set dev ipvlan0 netns ipv
> > ip netns exec ipv /sbin/dhclient -B -4 -1 -v
> > -pf /run/dhclient-ipvlan0.pid -C adafdasdfasf ipvlan0
> > ip netns exec ping 4.2.2.1 <success>
> >
> > However, when pinging from another machine, I got very inconsistent ping
> > replies:
> >
> > 64 bytes from 192.168.1.38: icmp_seq=1 ttl=64 time=11.4 ms
> > 64 bytes from 192.168.1.38: icmp_seq=16 ttl=64 time=64.9 ms
> > 64 bytes from 192.168.1.38: icmp_seq=25 ttl=64 time=87.9 ms
> > 64 bytes from 192.168.1.38: icmp_seq=30 ttl=64 time=242 ms
> > 64 bytes from 192.168.1.38: icmp_seq=35 ttl=64 time=40.1 ms
> > 64 bytes from 192.168.1.38: icmp_seq=36 ttl=64 time=60.9 ms
> >
> We know that there is that PAUSE frame leak but that should not cause
> this behavior if those are present in your network. The sched_work()
> which is wrong (as pointed by Eric) especially when the machine is
> busy and that might trigger something like this. Almost every 10th -
> 15th ping packet seems to be processed correctly.
> 
> I did get consistent results as opposed what you have shown here, but
> will dig some more to see if something obviously wrong here.
> 
> > But I cannot reproduce that in a second run (though I haven't rebooted
> > to test cleanly again).
> >
> > And oddly, the dhclient process takes a consistent 5% CPU and wireshark
> > running on eno1 (not even the ipvlan interface) jumps to 100% CPU along
> > with the dumpcap process taking another 25%, none of which are normal.
> > This is a 4-core i4790 box, so something is wrong here; is something
> > holding onto a spinlock for way too long?
> >
> > But at least it handles the packets ok, so I say progress!  Happy to
> > help track down the CPU usage issue if you want to give me patches to
> > test.
> >
> Which patch(es) you are referring to?

None that yet exist; simply that if any of the issues I described
triggered thoughts or patches on your end, I'm happy to test them.  I
will try to characterize the issues I have seen more next week and
report back.

Dan

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue
  2015-04-24 22:59     ` Dan Williams
@ 2015-04-27 18:14       ` Mahesh Bandewar
  0 siblings, 0 replies; 10+ messages in thread
From: Mahesh Bandewar @ 2015-04-27 18:14 UTC (permalink / raw)
  To: Dan Williams; +Cc: netdev, Eric Dumazet, David Miller

On Fri, Apr 24, 2015 at 3:59 PM, Dan Williams <dcbw@redhat.com> wrote:
> On Fri, 2015-04-24 at 15:40 -0700, Mahesh Bandewar wrote:
>> On Fri, Apr 24, 2015 at 1:15 PM, Dan Williams <dcbw@redhat.com> wrote:
>> > On Thu, 2015-04-23 at 14:29 -0700, Mahesh Bandewar wrote:
>> >> Processing multicast / broadcast in fast path is performance draining
>> >> and having more links means more cloning, bringing performance
>> >> down further.
>> >>
>> >> Broadcast, in particular, needs to be given to all the virtual links.
>> >> Earlier tricks of enabling the broadcast bit for IPv4-only interfaces are not
>> >> really working since it fails autoconf. This means enabling broadcast
>> >> for all the links if protocol-specific hacks are not to be added into
>> >> the driver.
>> >>
>> >> This patch defers all (incoming as well as outgoing) multicast traffic to
>> >> a work-queue leaving only the unicast traffic in the fast-path. Now if we
>> >> need to apply any additional tricks to further reduce the impact of this
>> >> (multicast / broadcast) type of traffic, it can be implemented while
>> >> processing this work without affecting the fast-path.
>> >
>> > These patches appear to work for me for the L2 + DHCP use-case, however
>> > I experienced some quite odd behavior when pinging the ipvlan interface
>> > from another machine.  I did this:
>> >
>> > ip link add link eno1 type ipvlan mode l2
>> > ip netns add ipv
>> > ip link set dev ipvlan0 netns ipv
>> > ip netns exec ipv /sbin/dhclient -B -4 -1 -v
>> > -pf /run/dhclient-ipvlan0.pid -C adafdasdfasf ipvlan0
>> > ip netns exec ping 4.2.2.1 <success>
>> >
>> > However, when pinging from another machine, I got very inconsistent ping
>> > replies:
>> >
>> > 64 bytes from 192.168.1.38: icmp_seq=1 ttl=64 time=11.4 ms
>> > 64 bytes from 192.168.1.38: icmp_seq=16 ttl=64 time=64.9 ms
>> > 64 bytes from 192.168.1.38: icmp_seq=25 ttl=64 time=87.9 ms
>> > 64 bytes from 192.168.1.38: icmp_seq=30 ttl=64 time=242 ms
>> > 64 bytes from 192.168.1.38: icmp_seq=35 ttl=64 time=40.1 ms
>> > 64 bytes from 192.168.1.38: icmp_seq=36 ttl=64 time=60.9 ms
>> >
>> We know that there is that PAUSE frame leak, but that should not cause
>> this behavior if those frames are present in your network. The sched_work()
>> usage is wrong (as pointed out by Eric), especially when the machine is
>> busy, and that might trigger something like this. Almost every 10th -
>> 15th ping packet seems to be processed correctly.
>>
>> I did get consistent results as opposed to what you have shown here, but
>> will dig some more to see if something is obviously wrong here.
>>
>> > But I cannot reproduce that in a second run (though I haven't rebooted
>> > to test cleanly again).
>> >
>> > And oddly, the dhclient process takes a consistent 5% CPU and wireshark
>> > running on eno1 (not even the ipvlan interface) jumps to 100% CPU along
>> > with the dumpcap process taking another 25%, none of which are normal.
>> > This is a 4-core i4790 box, so something is wrong here; is something
>> > holding onto a spinlock for way too long?
>> >
>> > But at least it handles the packets ok, so I say progress!  Happy to
>> > help track down the CPU usage issue if you want to give me patches to
>> > test.
>> >
>> Which patch(es) you are referring to?
>
> None that yet exist; simply that if any of the issues I described
> triggered thoughts or patches on your end, I'm happy to test them.  I
> will try to characterize the issues I have seen more next week and
> report back.
>
OK, found the issue! RX frames are never drained from the queue and
are processed again and again, causing the CPU-usage spike you have
observed. I'll integrate the fix into v2.

--mahesh..

> Dan
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2015-04-27 18:14 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-04-23 21:29 [PATCH next 1/3] ipvlan: Defer multicast / broadcast processing to a work-queue Mahesh Bandewar
2015-04-24  0:32 ` Eric Dumazet
2015-04-24  2:54   ` Mahesh Bandewar
2015-04-24  3:04     ` Eric Dumazet
2015-04-24  4:28     ` David Miller
2015-04-24  4:40       ` Mahesh Bandewar
2015-04-24 20:15 ` Dan Williams
2015-04-24 22:40   ` Mahesh Bandewar
2015-04-24 22:59     ` Dan Williams
2015-04-27 18:14       ` Mahesh Bandewar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).