All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH,RFC] macvlan: Handle broadcasts inline if we have only a few macvlans.
@ 2016-05-26 23:44 Lennert Buytenhek
  2016-05-27 17:56 ` Cong Wang
  2016-05-30  8:17 ` [PATCH 0/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
  0 siblings, 2 replies; 17+ messages in thread
From: Lennert Buytenhek @ 2016-05-26 23:44 UTC (permalink / raw)
  To: Patrick McHardy, Herbert Xu; +Cc: netdev

Commit 412ca1550cbecb2c ("macvlan: Move broadcasts into a work queue")
moved processing of all macvlan multicasts into a work queue.  This
causes a noticable performance regression when there is heavy multicast
traffic on the underlying interface for multicast groups that the
macvlan subinterfaces are not members of, in which case we end up
cloning all those packets and then freeing them again from a work queue
without really doing any useful work with them in between.

The commit message for commit 412ca1550cbecb2c says:

|   Fundamentally, we need to ensure that the amount of work handled
|   in each netif_rx backlog run is constrained.  As broadcasts are
|   anything but constrained, it either needs to be limited per run
|   or moved to process context.

This patch moves multicast handling back into macvlan_handle_frame()
context if there are 100 or fewer macvlan subinterfaces, while keeping
the work queue for if there are more macvlan subinterfaces than that.

I played around with keeping track of the number of macvlan
subinterfaces that have each multicast filter bit set, but that ended
up being more complicated than I liked.  Conditionalising the work
queue deferring on the total number of macvlan subinterfaces seems
like a fair compromise.

On a quickly whipped together test program that creates an ethertap
interface with a single macvlan subinterface and then blasts 16 Mi
multicast packets through the ethertap interface for a multicast
group that the macvlan subinterface is not a member of, run time goes
from (vanilla kernel):

        # time ./stress
        real    0m41.864s
        user    0m0.622s
        sys     0m20.754s

to (with this patch):

        # time ./stress
        real    0m16.539s
        user    0m0.519s
        sys     0m15.949s

Reported-by: Grant Zhang <gzhang@fastly.com>
Signed-off-by: Lennert Buytenhek <lbuytenhek@fastly.com>
---
 drivers/net/macvlan.c | 71 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 26 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index cb01023..02934a5 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -231,7 +231,8 @@ static unsigned int mc_hash(const struct macvlan_dev *vlan,
 static void macvlan_broadcast(struct sk_buff *skb,
 			      const struct macvlan_port *port,
 			      struct net_device *src,
-			      enum macvlan_mode mode)
+			      enum macvlan_mode mode,
+			      bool do_rx_softirq)
 {
 	const struct ethhdr *eth = eth_hdr(skb);
 	const struct macvlan_dev *vlan;
@@ -254,17 +255,49 @@ static void macvlan_broadcast(struct sk_buff *skb,
 
 			err = NET_RX_DROP;
 			nskb = skb_clone(skb, GFP_ATOMIC);
-			if (likely(nskb))
+			if (likely(nskb)) {
 				err = macvlan_broadcast_one(
 					nskb, vlan, eth,
-					mode == MACVLAN_MODE_BRIDGE) ?:
-				      netif_rx_ni(nskb);
+					mode == MACVLAN_MODE_BRIDGE);
+				if (err == 0) {
+					if (do_rx_softirq)
+						err = netif_rx_ni(nskb);
+					else
+						err = netif_rx(nskb);
+				}
+			}
 			macvlan_count_rx(vlan, skb->len + ETH_HLEN,
 					 err == NET_RX_SUCCESS, true);
 		}
 	}
 }
 
+static void macvlan_process_one(struct sk_buff *skb,
+				struct macvlan_port *port,
+				const struct macvlan_dev *src,
+				bool do_rx_softirq)
+{
+	if (!src)
+		/* frame comes from an external address */
+		macvlan_broadcast(skb, port, NULL,
+				  MACVLAN_MODE_PRIVATE |
+				  MACVLAN_MODE_VEPA    |
+				  MACVLAN_MODE_PASSTHRU|
+				  MACVLAN_MODE_BRIDGE, do_rx_softirq);
+	else if (src->mode == MACVLAN_MODE_VEPA)
+		/* flood to everyone except source */
+		macvlan_broadcast(skb, port, src->dev,
+				  MACVLAN_MODE_VEPA |
+				  MACVLAN_MODE_BRIDGE, do_rx_softirq);
+	else
+		/*
+		 * flood only to VEPA ports, bridge ports
+		 * already saw the frame on the way out.
+		 */
+		macvlan_broadcast(skb, port, src->dev,
+				  MACVLAN_MODE_VEPA, do_rx_softirq);
+}
+
 static void macvlan_process_broadcast(struct work_struct *w)
 {
 	struct macvlan_port *port = container_of(w, struct macvlan_port,
@@ -282,27 +315,7 @@ static void macvlan_process_broadcast(struct work_struct *w)
 		const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src;
 
 		rcu_read_lock();
-
-		if (!src)
-			/* frame comes from an external address */
-			macvlan_broadcast(skb, port, NULL,
-					  MACVLAN_MODE_PRIVATE |
-					  MACVLAN_MODE_VEPA    |
-					  MACVLAN_MODE_PASSTHRU|
-					  MACVLAN_MODE_BRIDGE);
-		else if (src->mode == MACVLAN_MODE_VEPA)
-			/* flood to everyone except source */
-			macvlan_broadcast(skb, port, src->dev,
-					  MACVLAN_MODE_VEPA |
-					  MACVLAN_MODE_BRIDGE);
-		else
-			/*
-			 * flood only to VEPA ports, bridge ports
-			 * already saw the frame on the way out.
-			 */
-			macvlan_broadcast(skb, port, src->dev,
-					  MACVLAN_MODE_VEPA);
-
+		macvlan_process_one(skb, port, src, true);
 		rcu_read_unlock();
 
 		kfree_skb(skb);
@@ -429,6 +442,11 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
 			goto out;
 		}
 
+		if (port->count <= 100) {
+			macvlan_process_one(skb, port, src, false);
+			return RX_HANDLER_PASS;
+		}
+
 		MACVLAN_SKB_CB(skb)->src = src;
 		macvlan_broadcast_enqueue(port, skb);
 
@@ -479,7 +497,8 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
 
 		/* send to other bridge ports directly */
 		if (is_multicast_ether_addr(eth->h_dest)) {
-			macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE);
+			macvlan_broadcast(skb, port, dev,
+					  MACVLAN_MODE_BRIDGE, true);
 			goto xmit_world;
 		}
 
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH,RFC] macvlan: Handle broadcasts inline if we have only a few macvlans.
  2016-05-26 23:44 [PATCH,RFC] macvlan: Handle broadcasts inline if we have only a few macvlans Lennert Buytenhek
@ 2016-05-27 17:56 ` Cong Wang
  2016-05-27 18:03   ` Lennert Buytenhek
  2016-05-30  8:17 ` [PATCH 0/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
  1 sibling, 1 reply; 17+ messages in thread
From: Cong Wang @ 2016-05-27 17:56 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: Patrick McHardy, Herbert Xu, Linux Kernel Network Developers

On Thu, May 26, 2016 at 4:44 PM, Lennert Buytenhek
<buytenh@wantstofly.org> wrote:
> Commit 412ca1550cbecb2c ("macvlan: Move broadcasts into a work queue")
> moved processing of all macvlan multicasts into a work queue.  This
> causes a noticable performance regression when there is heavy multicast
> traffic on the underlying interface for multicast groups that the
> macvlan subinterfaces are not members of, in which case we end up
> cloning all those packets and then freeing them again from a work queue
> without really doing any useful work with them in between.

But we only queue up to 1000 packets in our backlog.

How about adding a quick check before cloning it?

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index cb01023..1c73d0f 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -315,6 +315,9 @@ static void macvlan_broadcast_enqueue(struct
macvlan_port *port,
        struct sk_buff *nskb;
        int err = -ENOMEM;

+       if (skb_queue_len(&port->bc_queue) >= MACVLAN_BC_QUEUE_LEN)
+               return;
+
        nskb = skb_clone(skb, GFP_ATOMIC);
        if (!nskb)
                goto err;

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH,RFC] macvlan: Handle broadcasts inline if we have only a few macvlans.
  2016-05-27 17:56 ` Cong Wang
@ 2016-05-27 18:03   ` Lennert Buytenhek
  0 siblings, 0 replies; 17+ messages in thread
From: Lennert Buytenhek @ 2016-05-27 18:03 UTC (permalink / raw)
  To: Cong Wang; +Cc: Patrick McHardy, Herbert Xu, Linux Kernel Network Developers

On Fri, May 27, 2016 at 10:56:44AM -0700, Cong Wang wrote:

> > Commit 412ca1550cbecb2c ("macvlan: Move broadcasts into a work queue")
> > moved processing of all macvlan multicasts into a work queue.  This
> > causes a noticable performance regression when there is heavy multicast
> > traffic on the underlying interface for multicast groups that the
> > macvlan subinterfaces are not members of, in which case we end up
> > cloning all those packets and then freeing them again from a work queue
> > without really doing any useful work with them in between.
> 
> But we only queue up to 1000 packets in our backlog.
> 
> How about adding a quick check before cloning it?
> 
> diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
> index cb01023..1c73d0f 100644
> --- a/drivers/net/macvlan.c
> +++ b/drivers/net/macvlan.c
> @@ -315,6 +315,9 @@ static void macvlan_broadcast_enqueue(struct
> macvlan_port *port,
>         struct sk_buff *nskb;
>         int err = -ENOMEM;
> 
> +       if (skb_queue_len(&port->bc_queue) >= MACVLAN_BC_QUEUE_LEN)
> +               return;
> +
>         nskb = skb_clone(skb, GFP_ATOMIC);
>         if (!nskb)
>                 goto err;

We're not hitting the bc_queue skb limit in our environment, as the
machine can keep up with the traffic -- it's just that taking an
extra clone of the skb and queueing and running the work queue item
to free it again is eating up a lot of cycles.

But doing the queue length check before the clone might not be a bad
idea?  (You'd probably want to atomic_long_inc(&skb->dev->rx_dropped)
before returning, though?)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 0/2] macvlan: Avoid unnecessary multicast cloning
  2016-05-26 23:44 [PATCH,RFC] macvlan: Handle broadcasts inline if we have only a few macvlans Lennert Buytenhek
  2016-05-27 17:56 ` Cong Wang
@ 2016-05-30  8:17 ` Herbert Xu
  2016-05-30  8:23   ` [PATCH 1/2] macvlan: Fix potential use-after free for broadcasts Herbert Xu
                     ` (2 more replies)
  1 sibling, 3 replies; 17+ messages in thread
From: Herbert Xu @ 2016-05-30  8:17 UTC (permalink / raw)
  To: Lennert Buytenhek; +Cc: Patrick McHardy, netdev

On Fri, May 27, 2016 at 02:44:33AM +0300, Lennert Buytenhek wrote:
> Commit 412ca1550cbecb2c ("macvlan: Move broadcasts into a work queue")
> moved processing of all macvlan multicasts into a work queue.  This
> causes a noticable performance regression when there is heavy multicast
> traffic on the underlying interface for multicast groups that the
> macvlan subinterfaces are not members of, in which case we end up
> cloning all those packets and then freeing them again from a work queue
> without really doing any useful work with them in between.

OK so your motivation is to get rid of the unnecessary memory
allocation, right?

Here's my totally untested patch, it tries to resolve your problem
by maintaining a filter hash at the macvlan_port level so that we
can quickly determine whether a given packet is needed or not.

It is preceded by a patch that fixes a potential use-after-free
bug that I discovered while looking over this.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/2] macvlan: Fix potential use-after free for broadcasts
  2016-05-30  8:17 ` [PATCH 0/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
@ 2016-05-30  8:23   ` Herbert Xu
  2016-05-30  8:28   ` [PATCH 2/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
  2016-05-30 16:27   ` [PATCH " Lennert Buytenhek
  2 siblings, 0 replies; 17+ messages in thread
From: Herbert Xu @ 2016-05-30  8:23 UTC (permalink / raw)
  To: Lennert Buytenhek; +Cc: Patrick McHardy, netdev

When we postpone a broadcast packet we save the source port in
the skb if it is local.  However, the source port can disappear
before we get a chance to process the packet.

This patch fixes this by holding a ref count on the netdev.

It also delays the skb->cb modification until after we allocate
the new skb as you should not modify shared skbs.

Fixes: 412ca1550cbe ("macvlan: Move broadcasts into a work queue")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 2bcf1f3..78a00e3 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -305,11 +305,14 @@ static void macvlan_process_broadcast(struct work_struct *w)
 
 		rcu_read_unlock();
 
+		if (src)
+			dev_put(src->dev);
 		kfree_skb(skb);
 	}
 }
 
 static void macvlan_broadcast_enqueue(struct macvlan_port *port,
+				      const struct macvlan_dev *src,
 				      struct sk_buff *skb)
 {
 	struct sk_buff *nskb;
@@ -319,8 +322,12 @@ static void macvlan_broadcast_enqueue(struct macvlan_port *port,
 	if (!nskb)
 		goto err;
 
+	MACVLAN_SKB_CB(nskb)->src = src;
+
 	spin_lock(&port->bc_queue.lock);
 	if (skb_queue_len(&port->bc_queue) < MACVLAN_BC_QUEUE_LEN) {
+		if (src)
+			dev_hold(src->dev);
 		__skb_queue_tail(&port->bc_queue, nskb);
 		err = 0;
 	}
@@ -429,8 +436,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
 			goto out;
 		}
 
-		MACVLAN_SKB_CB(skb)->src = src;
-		macvlan_broadcast_enqueue(port, skb);
+		macvlan_broadcast_enqueue(port, src, skb);
 
 		return RX_HANDLER_PASS;
 	}
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/2] macvlan: Avoid unnecessary multicast cloning
  2016-05-30  8:17 ` [PATCH 0/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
  2016-05-30  8:23   ` [PATCH 1/2] macvlan: Fix potential use-after free for broadcasts Herbert Xu
@ 2016-05-30  8:28   ` Herbert Xu
  2016-05-31 21:07     ` David Miller
  2016-05-30 16:27   ` [PATCH " Lennert Buytenhek
  2 siblings, 1 reply; 17+ messages in thread
From: Herbert Xu @ 2016-05-30  8:28 UTC (permalink / raw)
  To: Lennert Buytenhek; +Cc: Patrick McHardy, netdev, Jiri Pirko

Currently we always queue a multicast packet for further processing,
even if none of the macvlan devices are subscribed to the address.

This patch optimises this by adding a global multicast filter for
a macvlan_port.

Note that this patch doesn't handle the broadcast addresses of the
individual macvlan devices correctly, if they are not all identical.
However, this is already broken because there is no mechanism in
place to update the individual multicast filters when you change
the broadcast address.

If someone cares enough they should fix this by collecting all
broadcast addresses for a macvlan as we do for multicast and unicast.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index f55fe21..9fa4532 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -49,6 +49,7 @@ struct macvlan_port {
 	bool 			passthru;
 	int			count;
 	struct hlist_head	vlan_source_hash[MACVLAN_HASH_SIZE];
+	DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
 };
 
 struct macvlan_source_entry {
@@ -418,6 +419,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
 
 	port = macvlan_port_get_rcu(skb->dev);
 	if (is_multicast_ether_addr(eth->h_dest)) {
+		unsigned int hash;
+
 		skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN);
 		if (!skb)
 			return RX_HANDLER_CONSUMED;
@@ -435,7 +438,9 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
 			goto out;
 		}
 
-		macvlan_broadcast_enqueue(port, src, skb);
+		hash = mc_hash(NULL, eth->h_dest);
+		if (test_bit(hash, port->mc_filter))
+			macvlan_broadcast_enqueue(port, src, skb);
 
 		return RX_HANDLER_PASS;
 	}
@@ -725,6 +730,8 @@ static void macvlan_set_mac_lists(struct net_device *dev)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
+	dev_uc_sync(vlan->lowerdev, dev);
+	dev_mc_sync(vlan->lowerdev, dev);
 	if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
 		bitmap_fill(vlan->mc_filter, MACVLAN_MC_FILTER_SZ);
 	} else {
@@ -739,9 +746,31 @@ static void macvlan_set_mac_lists(struct net_device *dev)
 		__set_bit(mc_hash(vlan, dev->broadcast), filter);
 
 		bitmap_copy(vlan->mc_filter, filter, MACVLAN_MC_FILTER_SZ);
+
+		/* This is slightly inaccurate as we're including
+		 * the subscription list of vlan->lowerdev too.
+		 */
+		bitmap_zero(filter, MACVLAN_MC_FILTER_SZ);
+		netdev_for_each_mc_addr(ha, vlan->lowerdev) {
+			__set_bit(mc_hash(NULL, ha->addr), filter);
+		}
+
+		/* Bug alert: This only works if everyone has the
+		 * same broadcast address.  As soon as someone
+		 * changes theirs this will break.
+		 *
+		 * However, this is already broken as when you
+		 * change your broadcast address we don't get
+		 * called.
+		 *
+		 * The solution is to maintain a list of broadcast
+		 * addresses like we do for uc/mc, if you care.
+		 */
+		__set_bit(mc_hash(NULL, dev->broadcast), filter);
+
+		bitmap_copy(vlan->port->mc_filter, filter,
+			    MACVLAN_MC_FILTER_SZ);
 	}
-	dev_uc_sync(vlan->lowerdev, dev);
-	dev_mc_sync(vlan->lowerdev, dev);
 }
 
 static int macvlan_change_mtu(struct net_device *dev, int new_mtu)
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] macvlan: Avoid unnecessary multicast cloning
  2016-05-30  8:17 ` [PATCH 0/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
  2016-05-30  8:23   ` [PATCH 1/2] macvlan: Fix potential use-after free for broadcasts Herbert Xu
  2016-05-30  8:28   ` [PATCH 2/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
@ 2016-05-30 16:27   ` Lennert Buytenhek
  2016-05-31  0:49     ` Herbert Xu
  2 siblings, 1 reply; 17+ messages in thread
From: Lennert Buytenhek @ 2016-05-30 16:27 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Patrick McHardy, netdev

On Mon, May 30, 2016 at 04:17:52PM +0800, Herbert Xu wrote:

> > Commit 412ca1550cbecb2c ("macvlan: Move broadcasts into a work queue")
> > moved processing of all macvlan multicasts into a work queue.  This
> > causes a noticable performance regression when there is heavy multicast
> > traffic on the underlying interface for multicast groups that the
> > macvlan subinterfaces are not members of, in which case we end up
> > cloning all those packets and then freeing them again from a work queue
> > without really doing any useful work with them in between.
> 
> OK so your motivation is to get rid of the unnecessary memory
> allocation, right?

That and stack switches to kworker threads and serialisation on
the bc_queue queue lock.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] macvlan: Avoid unnecessary multicast cloning
  2016-05-30 16:27   ` [PATCH " Lennert Buytenhek
@ 2016-05-31  0:49     ` Herbert Xu
  2016-05-31 18:40       ` David Miller
  0 siblings, 1 reply; 17+ messages in thread
From: Herbert Xu @ 2016-05-31  0:49 UTC (permalink / raw)
  To: Lennert Buytenhek; +Cc: Patrick McHardy, netdev

On Mon, May 30, 2016 at 07:27:59PM +0300, Lennert Buytenhek wrote:
>
> That and stack switches to kworker threads and serialisation on
> the bc_queue queue lock.

My patch should resolve these problems too since the packet is
discarded if nobody is interested in it.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] macvlan: Avoid unnecessary multicast cloning
  2016-05-31  0:49     ` Herbert Xu
@ 2016-05-31 18:40       ` David Miller
  0 siblings, 0 replies; 17+ messages in thread
From: David Miller @ 2016-05-31 18:40 UTC (permalink / raw)
  To: herbert; +Cc: buytenh, kaber, netdev

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 31 May 2016 08:49:45 +0800

> On Mon, May 30, 2016 at 07:27:59PM +0300, Lennert Buytenhek wrote:
>>
>> That and stack switches to kworker threads and serialisation on
>> the bc_queue queue lock.
> 
> My patch should resolve these problems too since the packet is
> discarded if nobody is interested in it.

+1

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] macvlan: Avoid unnecessary multicast cloning
  2016-05-30  8:28   ` [PATCH 2/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
@ 2016-05-31 21:07     ` David Miller
  2016-06-01  3:42       ` [v2 PATCH 0/2] " Herbert Xu
  0 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2016-05-31 21:07 UTC (permalink / raw)
  To: herbert; +Cc: buytenh, kaber, netdev, jpirko

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 30 May 2016 16:28:28 +0800

> @@ -725,6 +730,8 @@ static void macvlan_set_mac_lists(struct net_device *dev)
>  {
>  	struct macvlan_dev *vlan = netdev_priv(dev);
>  
> +	dev_uc_sync(vlan->lowerdev, dev);
> +	dev_mc_sync(vlan->lowerdev, dev);
>  	if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
>  		bitmap_fill(vlan->mc_filter, MACVLAN_MC_FILTER_SZ);
>  	} else {

I think you need to set the vlan->port->mc_filter to all 1's in the
PROMISC/ALLMUTI branch here.

Otherwise packets won't properly pass your new hash test.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [v2 PATCH 0/2] macvlan: Avoid unnecessary multicast cloning
  2016-05-31 21:07     ` David Miller
@ 2016-06-01  3:42       ` Herbert Xu
  2016-06-01  3:43         ` [v2 PATCH 1/2] macvlan: Fix potential use-after free for broadcasts Herbert Xu
                           ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Herbert Xu @ 2016-06-01  3:42 UTC (permalink / raw)
  To: David Miller; +Cc: buytenh, kaber, netdev, jpirko

On Tue, May 31, 2016 at 02:07:13PM -0700, David Miller wrote:
> 
> I think you need to set the vlan->port->mc_filter to all 1's in the
> PROMISC/ALLMUTI branch here.
> 
> Otherwise packets won't properly pass your new hash test.

Good point.  Here's v2.

This patch tries to improve macvlan multicast performance by
maintaining a filter hash at the macvlan_port level so that we
can quickly determine whether a given packet is needed or not.

It is preceded by a patch that fixes a potential use-after-free
bug that I discovered while looking over this.

v2 fixed a bug where promiscuous/allmulti settings weren't handled
correctly.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [v2 PATCH 1/2] macvlan: Fix potential use-after free for broadcasts
  2016-06-01  3:42       ` [v2 PATCH 0/2] " Herbert Xu
@ 2016-06-01  3:43         ` Herbert Xu
  2016-06-01  4:19           ` Cong Wang
  2016-06-01  3:45         ` [v2 PATCH 2/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
  2016-06-02  0:49         ` [v2 PATCH 0/2] " David Miller
  2 siblings, 1 reply; 17+ messages in thread
From: Herbert Xu @ 2016-06-01  3:43 UTC (permalink / raw)
  To: David Miller; +Cc: buytenh, kaber, netdev, jpirko

When we postpone a broadcast packet we save the source port in
the skb if it is local.  However, the source port can disappear
before we get a chance to process the packet.

This patch fixes this by holding a ref count on the netdev.

It also delays the skb->cb modification until after we allocate
the new skb as you should not modify shared skbs.

Fixes: 412ca1550cbe ("macvlan: Move broadcasts into a work queue")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 2bcf1f3..78a00e3 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -305,11 +305,14 @@ static void macvlan_process_broadcast(struct work_struct *w)
 
 		rcu_read_unlock();
 
+		if (src)
+			dev_put(src->dev);
 		kfree_skb(skb);
 	}
 }
 
 static void macvlan_broadcast_enqueue(struct macvlan_port *port,
+				      const struct macvlan_dev *src,
 				      struct sk_buff *skb)
 {
 	struct sk_buff *nskb;
@@ -319,8 +322,12 @@ static void macvlan_broadcast_enqueue(struct macvlan_port *port,
 	if (!nskb)
 		goto err;
 
+	MACVLAN_SKB_CB(nskb)->src = src;
+
 	spin_lock(&port->bc_queue.lock);
 	if (skb_queue_len(&port->bc_queue) < MACVLAN_BC_QUEUE_LEN) {
+		if (src)
+			dev_hold(src->dev);
 		__skb_queue_tail(&port->bc_queue, nskb);
 		err = 0;
 	}
@@ -429,8 +436,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
 			goto out;
 		}
 
-		MACVLAN_SKB_CB(skb)->src = src;
-		macvlan_broadcast_enqueue(port, skb);
+		macvlan_broadcast_enqueue(port, src, skb);
 
 		return RX_HANDLER_PASS;
 	}
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [v2 PATCH 2/2] macvlan: Avoid unnecessary multicast cloning
  2016-06-01  3:42       ` [v2 PATCH 0/2] " Herbert Xu
  2016-06-01  3:43         ` [v2 PATCH 1/2] macvlan: Fix potential use-after free for broadcasts Herbert Xu
@ 2016-06-01  3:45         ` Herbert Xu
  2016-06-02  0:49         ` [v2 PATCH 0/2] " David Miller
  2 siblings, 0 replies; 17+ messages in thread
From: Herbert Xu @ 2016-06-01  3:45 UTC (permalink / raw)
  To: David Miller; +Cc: buytenh, kaber, netdev, jpirko

Currently we always queue a multicast packet for further processing,
even if none of the macvlan devices are subscribed to the address.

This patch optimises this by adding a global multicast filter for
a macvlan_port.

Note that this patch doesn't handle the broadcast addresses of the
individual macvlan devices correctly, if they are not all identical
to vlan->lowerdev.  However, this is already broken because there
is no mechanism in place to update the individual multicast filters
when you change the broadcast address.

If someone cares enough they should fix this by collecting all
broadcast addresses for a macvlan as we do for multicast and unicast.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index a71fa59..0c65bd9 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -49,6 +49,7 @@ struct macvlan_port {
 	bool 			passthru;
 	int			count;
 	struct hlist_head	vlan_source_hash[MACVLAN_HASH_SIZE];
+	DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
 };
 
 struct macvlan_source_entry {
@@ -419,6 +420,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
 
 	port = macvlan_port_get_rcu(skb->dev);
 	if (is_multicast_ether_addr(eth->h_dest)) {
+		unsigned int hash;
+
 		skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN);
 		if (!skb)
 			return RX_HANDLER_CONSUMED;
@@ -436,7 +439,9 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
 			goto out;
 		}
 
-		macvlan_broadcast_enqueue(port, src, skb);
+		hash = mc_hash(NULL, eth->h_dest);
+		if (test_bit(hash, port->mc_filter))
+			macvlan_broadcast_enqueue(port, src, skb);
 
 		return RX_HANDLER_PASS;
 	}
@@ -722,12 +727,12 @@ static void macvlan_change_rx_flags(struct net_device *dev, int change)
 	}
 }
 
-static void macvlan_set_mac_lists(struct net_device *dev)
+static void macvlan_compute_filter(unsigned long *mc_filter,
+				   struct net_device *dev,
+				   struct macvlan_dev *vlan)
 {
-	struct macvlan_dev *vlan = netdev_priv(dev);
-
 	if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
-		bitmap_fill(vlan->mc_filter, MACVLAN_MC_FILTER_SZ);
+		bitmap_fill(mc_filter, MACVLAN_MC_FILTER_SZ);
 	} else {
 		struct netdev_hw_addr *ha;
 		DECLARE_BITMAP(filter, MACVLAN_MC_FILTER_SZ);
@@ -739,10 +744,33 @@ static void macvlan_set_mac_lists(struct net_device *dev)
 
 		__set_bit(mc_hash(vlan, dev->broadcast), filter);
 
-		bitmap_copy(vlan->mc_filter, filter, MACVLAN_MC_FILTER_SZ);
+		bitmap_copy(mc_filter, filter, MACVLAN_MC_FILTER_SZ);
 	}
+}
+
+static void macvlan_set_mac_lists(struct net_device *dev)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+
+	macvlan_compute_filter(vlan->mc_filter, dev, vlan);
+
 	dev_uc_sync(vlan->lowerdev, dev);
 	dev_mc_sync(vlan->lowerdev, dev);
+
+	/* This is slightly inaccurate as we're including the subscription
+	 * list of vlan->lowerdev too.
+	 *
+	 * Bug alert: This only works if everyone has the same broadcast
+	 * address as lowerdev.  As soon as someone changes theirs this
+	 * will break.
+	 *
+	 * However, this is already broken as when you change your broadcast
+	 * address we don't get called.
+	 *
+	 * The solution is to maintain a list of broadcast addresses like
+	 * we do for uc/mc, if you care.
+	 */
+	macvlan_compute_filter(vlan->port->mc_filter, vlan->lowerdev, NULL);
 }
 
 static int macvlan_change_mtu(struct net_device *dev, int new_mtu)
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [v2 PATCH 1/2] macvlan: Fix potential use-after free for broadcasts
  2016-06-01  3:43         ` [v2 PATCH 1/2] macvlan: Fix potential use-after free for broadcasts Herbert Xu
@ 2016-06-01  4:19           ` Cong Wang
  2016-06-01  4:27             ` Herbert Xu
  0 siblings, 1 reply; 17+ messages in thread
From: Cong Wang @ 2016-06-01  4:19 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David Miller, Lennert Buytenhek, Patrick McHardy,
	Linux Kernel Network Developers, Jiri Pirko

On Tue, May 31, 2016 at 8:43 PM, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> When we postpone a broadcast packet we save the source port in
> the skb if it is local.  However, the source port can disappear
> before we get a chance to process the packet.
>

Hmm, why could this happen? The upper device should be linked
with the lower device, where a refcount is already held.
Also, the work is cancelled in ->uninit().

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [v2 PATCH 1/2] macvlan: Fix potential use-after free for broadcasts
  2016-06-01  4:19           ` Cong Wang
@ 2016-06-01  4:27             ` Herbert Xu
  2016-06-01 23:36               ` Cong Wang
  0 siblings, 1 reply; 17+ messages in thread
From: Herbert Xu @ 2016-06-01  4:27 UTC (permalink / raw)
  To: Cong Wang
  Cc: David Miller, Lennert Buytenhek, Patrick McHardy,
	Linux Kernel Network Developers, Jiri Pirko

On Tue, May 31, 2016 at 09:19:37PM -0700, Cong Wang wrote:
>
> Hmm, why could this happen? The upper device should be linked
> with the lower device, where a refcount is already held.
> Also, the work is cancelled in ->uninit().

Of course it can happen.  We are talking about the source macvlan
device that we just looked up using the Ethernet address.  That
device has nothing to do with the packet now so it may be deleted
at any time.

We do flush the work but only when the all macvlan devices on a
port have been deleted.  Perhaps you're confusing the source
device with vlan->lowerdev which is confusingly the actual
hardware device?

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [v2 PATCH 1/2] macvlan: Fix potential use-after free for broadcasts
  2016-06-01  4:27             ` Herbert Xu
@ 2016-06-01 23:36               ` Cong Wang
  0 siblings, 0 replies; 17+ messages in thread
From: Cong Wang @ 2016-06-01 23:36 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David Miller, Lennert Buytenhek, Patrick McHardy,
	Linux Kernel Network Developers, Jiri Pirko

On Tue, May 31, 2016 at 9:27 PM, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Tue, May 31, 2016 at 09:19:37PM -0700, Cong Wang wrote:
>>
>> Hmm, why could this happen? The upper device should be linked
>> with the lower device, where a refcount is already held.
>> Also, the work is cancelled in ->uninit().
>
> Of course it can happen.  We are talking about the source macvlan
> device that we just looked up using the Ethernet address.  That
> device has nothing to do with the packet now so it may be deleted
> at any time.
>
> We do flush the work but only when the all macvlan devices on a
> port have been deleted.  Perhaps you're confusing the source
> device with vlan->lowerdev which is confusingly the actual
> hardware device?

I thought all the on-flying packets are waited by synchronize_net()
during the removal of any of these devices. But since you moved
them to a workqueue, aka process context, so I think it won't
work any more.

Your patch makes sense to me now. :)

Thanks!

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [v2 PATCH 0/2] macvlan: Avoid unnecessary multicast cloning
  2016-06-01  3:42       ` [v2 PATCH 0/2] " Herbert Xu
  2016-06-01  3:43         ` [v2 PATCH 1/2] macvlan: Fix potential use-after free for broadcasts Herbert Xu
  2016-06-01  3:45         ` [v2 PATCH 2/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
@ 2016-06-02  0:49         ` David Miller
  2 siblings, 0 replies; 17+ messages in thread
From: David Miller @ 2016-06-02  0:49 UTC (permalink / raw)
  To: herbert; +Cc: buytenh, kaber, netdev, jpirko

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 1 Jun 2016 11:42:18 +0800

> This patch tries to improve macvlan multicast performance by
> maintaining a filter hash at the macvlan_port level so that we
> can quickly determine whether a given packet is needed or not.
> 
> It is preceded by a patch that fixes a potential use-after-free
> bug that I discovered while looking over this.
> 
> v2 fixed a bug where promiscuous/allmulti settings weren't handled
> correctly.

Series applied to net-next, thanks Herbert.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2016-06-02  0:49 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-05-26 23:44 [PATCH,RFC] macvlan: Handle broadcasts inline if we have only a few macvlans Lennert Buytenhek
2016-05-27 17:56 ` Cong Wang
2016-05-27 18:03   ` Lennert Buytenhek
2016-05-30  8:17 ` [PATCH 0/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
2016-05-30  8:23   ` [PATCH 1/2] macvlan: Fix potential use-after free for broadcasts Herbert Xu
2016-05-30  8:28   ` [PATCH 2/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
2016-05-31 21:07     ` David Miller
2016-06-01  3:42       ` [v2 PATCH 0/2] " Herbert Xu
2016-06-01  3:43         ` [v2 PATCH 1/2] macvlan: Fix potential use-after free for broadcasts Herbert Xu
2016-06-01  4:19           ` Cong Wang
2016-06-01  4:27             ` Herbert Xu
2016-06-01 23:36               ` Cong Wang
2016-06-01  3:45         ` [v2 PATCH 2/2] macvlan: Avoid unnecessary multicast cloning Herbert Xu
2016-06-02  0:49         ` [v2 PATCH 0/2] " David Miller
2016-05-30 16:27   ` [PATCH " Lennert Buytenhek
2016-05-31  0:49     ` Herbert Xu
2016-05-31 18:40       ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.