netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next 0/2] Open vSwitch zerocopy upcall
@ 2013-05-24 14:52 Thomas Graf
  2013-05-24 14:52 ` [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf
  2013-05-24 14:52 ` [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall Thomas Graf
  0 siblings, 2 replies; 20+ messages in thread
From: Thomas Graf @ 2013-05-24 14:52 UTC (permalink / raw)
  To: netdev, jesse; +Cc: dev, eric.dumazet

Posting this to netdev due to dependency on first patch.

Thomas Graf (2):
  net: Export skb_zerocopy() to zerocopy from one skb to another
  openvswitch: Use zerocopy if applicable when performing the upcall

 include/linux/skbuff.h               |  2 ++
 net/core/skbuff.c                    | 46 +++++++++++++++++++++++++++++++++++
 net/netfilter/nfnetlink_queue_core.c | 47 +-----------------------------------
 net/openvswitch/datapath.c           | 38 ++++++++++++++++++++++++-----
 4 files changed, 81 insertions(+), 52 deletions(-)

-- 
1.7.11.7

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another
  2013-05-24 14:52 [PATCH net-next 0/2] Open vSwitch zerocopy upcall Thomas Graf
@ 2013-05-24 14:52 ` Thomas Graf
  2013-05-24 15:31   ` Eric Dumazet
  2013-05-24 14:52 ` [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall Thomas Graf
  1 sibling, 1 reply; 20+ messages in thread
From: Thomas Graf @ 2013-05-24 14:52 UTC (permalink / raw)
  To: netdev, jesse; +Cc: dev, eric.dumazet

Make the skb zerocopy logic written for nfnetlink queue available for
use by other modules.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 include/linux/skbuff.h               |  2 ++
 net/core/skbuff.c                    | 46 +++++++++++++++++++++++++++++++++++
 net/netfilter/nfnetlink_queue_core.c | 47 +-----------------------------------
 3 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2e0ced1..ff2cbe8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2478,6 +2478,8 @@ extern int             skb_splice_bits(struct sk_buff *skb,
 						unsigned int len,
 						unsigned int flags);
 extern void	       skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
+extern void	       skb_zerocopy(struct sk_buff *to, const struct sk_buff *from,
+				    int len, int hlen);
 extern void	       skb_split(struct sk_buff *skb,
 				 struct sk_buff *skb1, const u32 len);
 extern int	       skb_shift(struct sk_buff *tgt, struct sk_buff *skb,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d629891..cb60bf6 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2118,6 +2118,52 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
 }
 EXPORT_SYMBOL(skb_copy_and_csum_dev);
 
+void
+skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
+{
+	int i, j = 0;
+	int plen = 0; /* length of skb->head fragment */
+	struct page *page;
+	unsigned int offset;
+
+	/* dont bother with small payloads */
+	if (len <= skb_tailroom(to)) {
+		skb_copy_bits(from, 0, skb_put(to, len), len);
+		return;
+	}
+
+	if (hlen) {
+		skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+		len -= hlen;
+	} else {
+		plen = min_t(int, skb_headlen(from), len);
+		if (plen) {
+			page = virt_to_head_page(from->head);
+			offset = from->data - (unsigned char *)page_address(page);
+			__skb_fill_page_desc(to, 0, page, offset, plen);
+			get_page(page);
+			j = 1;
+			len -= plen;
+		}
+	}
+
+	to->truesize += len + plen;
+	to->len += len + plen;
+	to->data_len += len + plen;
+
+	for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+		if (!len)
+			break;
+		skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
+		skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
+		len -= skb_shinfo(to)->frags[j].size;
+		skb_frag_ref(to, j);
+		j++;
+	}
+	skb_shinfo(to)->nr_frags = j;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy);
+
 /**
  *	skb_dequeue - remove from the head of the queue
  *	@list: list to dequeue from
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 2e0e835..7ac9dba 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -227,51 +227,6 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
 	spin_unlock_bh(&queue->lock);
 }
 
-static void
-nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
-{
-	int i, j = 0;
-	int plen = 0; /* length of skb->head fragment */
-	struct page *page;
-	unsigned int offset;
-
-	/* dont bother with small payloads */
-	if (len <= skb_tailroom(to)) {
-		skb_copy_bits(from, 0, skb_put(to, len), len);
-		return;
-	}
-
-	if (hlen) {
-		skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
-		len -= hlen;
-	} else {
-		plen = min_t(int, skb_headlen(from), len);
-		if (plen) {
-			page = virt_to_head_page(from->head);
-			offset = from->data - (unsigned char *)page_address(page);
-			__skb_fill_page_desc(to, 0, page, offset, plen);
-			get_page(page);
-			j = 1;
-			len -= plen;
-		}
-	}
-
-	to->truesize += len + plen;
-	to->len += len + plen;
-	to->data_len += len + plen;
-
-	for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
-		if (!len)
-			break;
-		skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
-		skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
-		len -= skb_shinfo(to)->frags[j].size;
-		skb_frag_ref(to, j);
-		j++;
-	}
-	skb_shinfo(to)->nr_frags = j;
-}
-
 static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet)
 {
 	__u32 flags = 0;
@@ -481,7 +436,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 		nla->nla_type = NFQA_PAYLOAD;
 		nla->nla_len = nla_attr_size(data_len);
 
-		nfqnl_zcopy(skb, entskb, data_len, hlen);
+		skb_zerocopy(skb, entskb, data_len, hlen);
 	}
 
 	nlh->nlmsg_len = skb->len;
-- 
1.7.11.7

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-24 14:52 [PATCH net-next 0/2] Open vSwitch zerocopy upcall Thomas Graf
  2013-05-24 14:52 ` [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf
@ 2013-05-24 14:52 ` Thomas Graf
  2013-05-24 17:24   ` Jesse Gross
  1 sibling, 1 reply; 20+ messages in thread
From: Thomas Graf @ 2013-05-24 14:52 UTC (permalink / raw)
  To: netdev, jesse; +Cc: dev, eric.dumazet, Thomas Graf

From: Thomas Graf <tgraf@rhlap.localdomain>

Avoids a memcpy() that is expensive for large packets:

from:
  4.51%  [kernel]                      [k] memcpy
to:
  1.20%  [kernel]                      [k] memcpy
  1.01%  [kernel]                      [k] skb_zerocopy

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 net/openvswitch/datapath.c | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index d12d6b8..b434adf 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -375,10 +375,11 @@ static size_t key_attr_size(void)
 }
 
 static size_t upcall_msg_size(const struct sk_buff *skb,
-			      const struct nlattr *userdata)
+			      const struct nlattr *userdata,
+			      unsigned int hdrlen)
 {
 	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
-		+ nla_total_size(skb->len) /* OVS_PACKET_ATTR_PACKET */
+		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
 		+ nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */
 
 	/* OVS_PACKET_ATTR_USERDATA */
@@ -396,6 +397,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
 	struct sk_buff *nskb = NULL;
 	struct sk_buff *user_skb; /* to be queued to userspace */
 	struct nlattr *nla;
+	unsigned int plen, hlen = 0;
 	int err;
 
 	if (vlan_tx_tag_present(skb)) {
@@ -416,7 +418,21 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
 		goto out;
 	}
 
-	user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata), GFP_ATOMIC);
+	if (skb->ip_summed == CHECKSUM_PARTIAL &&
+	    (err = skb_checksum_help(skb)))
+		goto out;
+
+	if (!skb->head_frag ||
+	    skb_headlen(skb) < L1_CACHE_BYTES ||
+	    skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)
+		hlen = skb_headlen(skb);
+
+	if (skb_has_frag_list(skb))
+		hlen = skb->len;
+
+	hlen = min_t(int, skb->len, hlen);
+
+	user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata, hlen), GFP_ATOMIC);
 	if (!user_skb) {
 		err = -ENOMEM;
 		goto out;
@@ -435,11 +451,21 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
 			  nla_len(upcall_info->userdata),
 			  nla_data(upcall_info->userdata));
 
-	nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
+	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0)))
+		goto out;
+	nla->nla_len = nla_attr_size(skb->len);
+
+	skb_zerocopy(user_skb, skb, skb->len, hlen);
 
-	skb_copy_and_csum_dev(skb, nla_data(nla));
+	plen = NLA_ALIGN(user_skb->len) - user_skb->len;
+	if (plen > 0)
+		memset(skb_put(user_skb, plen), 0, plen);
+
+	/* Fix alignment of .nlmsg_len, OVS user space enforces a strict
+	 * total message size alignment.
+	 */
+	((struct nlmsghdr *) user_skb->data)->nlmsg_len = NLA_ALIGN(user_skb->len);
 
-	genlmsg_end(user_skb, upcall);
 	err = genlmsg_unicast(net, user_skb, upcall_info->portid);
 
 out:
-- 
1.7.11.7

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another
  2013-05-24 14:52 ` [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf
@ 2013-05-24 15:31   ` Eric Dumazet
  2013-05-24 16:01     ` Thomas Graf
  0 siblings, 1 reply; 20+ messages in thread
From: Eric Dumazet @ 2013-05-24 15:31 UTC (permalink / raw)
  To: Thomas Graf; +Cc: netdev, jesse, dev

On Fri, 2013-05-24 at 16:52 +0200, Thomas Graf wrote:
> Make the skb zerocopy logic written for nfnetlink queue available for
> use by other modules.
> 
> Signed-off-by: Thomas Graf <tgraf@suug.ch>
> ---
>  include/linux/skbuff.h               |  2 ++
>  net/core/skbuff.c                    | 46 +++++++++++++++++++++++++++++++++++
>  net/netfilter/nfnetlink_queue_core.c | 47 +-----------------------------------
>  3 files changed, 49 insertions(+), 46 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 2e0ced1..ff2cbe8 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -2478,6 +2478,8 @@ extern int             skb_splice_bits(struct sk_buff *skb,
>  						unsigned int len,
>  						unsigned int flags);
>  extern void	       skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
> +extern void	       skb_zerocopy(struct sk_buff *to, const struct sk_buff *from,
> +				    int len, int hlen);
>  extern void	       skb_split(struct sk_buff *skb,
>  				 struct sk_buff *skb1, const u32 len);
>  extern int	       skb_shift(struct sk_buff *tgt, struct sk_buff *skb,
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index d629891..cb60bf6 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -2118,6 +2118,52 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
>  }
>  EXPORT_SYMBOL(skb_copy_and_csum_dev);
>  
> +void
> +skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
> +{
> +	int i, j = 0;
> +	int plen = 0; /* length of skb->head fragment */
> +	struct page *page;
> +	unsigned int offset;
> +
> +	/* dont bother with small payloads */
> +	if (len <= skb_tailroom(to)) {
> +		skb_copy_bits(from, 0, skb_put(to, len), len);
> +		return;
> +	}
> +
> +	if (hlen) {
> +		skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
> +		len -= hlen;
> +	} else {
> +		plen = min_t(int, skb_headlen(from), len);
> +		if (plen) {
> +			page = virt_to_head_page(from->head);

Hmm. Big warning here : In the nfnetlink case, we specifically made sure
that this path was taken only if skb->head_frag was set (all incoming
packets have this property, but not outgoing ones yet)

This should be documented if we move it to net/core/skbuff.c, or we
should add a BUG_ON()

I see your openvswitch code seems OK.

Maybe also put in net/core/skbuff.c the code dealing with  this stuff ?

+       if (!skb->head_frag ||
+           skb_headlen(skb) < L1_CACHE_BYTES ||
+           skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)
+               hlen = skb_headlen(skb);
+
+       if (skb_has_frag_list(skb))
+               hlen = skb->len;



> +			offset = from->data - (unsigned char *)page_address(page);
> +			__skb_fill_page_desc(to, 0, page, offset, plen);
> +			get_page(page);
> +			j = 1;
> +			len -= plen;
> +		}
> +	}

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another
  2013-05-24 15:31   ` Eric Dumazet
@ 2013-05-24 16:01     ` Thomas Graf
  0 siblings, 0 replies; 20+ messages in thread
From: Thomas Graf @ 2013-05-24 16:01 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, jesse, dev

On 05/24/13 at 08:31am, Eric Dumazet wrote:
> Hmm. Big warning here : In the nfnetlink case, we specifically made sure
> that this path was taken only if skb->head_frag was set (all incoming
> packets have this property, but not outgoing ones yet)
> 
> This should be documented if we move it to net/core/skbuff.c, or we
> should add a BUG_ON()

Good point.

I guess we could ensure in skb_zerocopy() that if !head_frag then we
enforce hlen = max(hlen, skb_headlen(skb)) to always fall back to
skb_copy_bits() for !head_frag.

> I see your openvswitch code seems OK.
> 
> Maybe also put in net/core/skbuff.c the code dealing with  this stuff ?
> 
> +       if (!skb->head_frag ||
> +           skb_headlen(skb) < L1_CACHE_BYTES ||
> +           skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)
> +               hlen = skb_headlen(skb);
> +
> +       if (skb_has_frag_list(skb))
> +               hlen = skb->len;

I'll put this into skb_zerocopy_hdrlen()

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-24 14:52 ` [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall Thomas Graf
@ 2013-05-24 17:24   ` Jesse Gross
  2013-05-24 18:58     ` Eric Dumazet
  0 siblings, 1 reply; 20+ messages in thread
From: Jesse Gross @ 2013-05-24 17:24 UTC (permalink / raw)
  To: Thomas Graf; +Cc: netdev, dev, Eric Dumazet

On Fri, May 24, 2013 at 7:52 AM, Thomas Graf <tgraf@suug.ch> wrote:
> From: Thomas Graf <tgraf@rhlap.localdomain>
>
> Avoids a memcpy() that is expensive for large packets:
>
> from:
>   4.51%  [kernel]                      [k] memcpy
> to:
>   1.20%  [kernel]                      [k] memcpy
>   1.01%  [kernel]                      [k] skb_zerocopy
>
> Signed-off-by: Thomas Graf <tgraf@suug.ch>

Does this have any impact on small packets? Those are usually the
common case (i.e. TCP SYN) and I think this is slightly less optimal
for those.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-24 17:24   ` Jesse Gross
@ 2013-05-24 18:58     ` Eric Dumazet
  2013-05-24 21:23       ` Jesse Gross
  0 siblings, 1 reply; 20+ messages in thread
From: Eric Dumazet @ 2013-05-24 18:58 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Thomas Graf, netdev, dev

On Fri, 2013-05-24 at 10:24 -0700, Jesse Gross wrote:

> Does this have any impact on small packets? Those are usually the
> common case (i.e. TCP SYN) and I think this is slightly less optimal
> for those.

No difference at all, small packets are copied anyway in skb->head

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-24 18:58     ` Eric Dumazet
@ 2013-05-24 21:23       ` Jesse Gross
  2013-05-24 21:57         ` Eric Dumazet
  0 siblings, 1 reply; 20+ messages in thread
From: Jesse Gross @ 2013-05-24 21:23 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Thomas Graf, netdev, dev

On Fri, May 24, 2013 at 11:58 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2013-05-24 at 10:24 -0700, Jesse Gross wrote:
>
>> Does this have any impact on small packets? Those are usually the
>> common case (i.e. TCP SYN) and I think this is slightly less optimal
>> for those.
>
> No difference at all, small packets are copied anyway in skb->head

Yes, but it makes the Open vSwitch code slightly worse - for example,
currently checksumming and copying are done in a single step but this
prevents that. Actually, I'm also curious about the test case that was
used for large packets and the full profile output since checksumming
and GSO aren't listed in the one that Thomas gave.

My guess is that there isn't a real difference for small packets since
everything will be in the cache but it seems worth checking given that
this is optimizing a rare case at the expense of the common one.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-24 21:23       ` Jesse Gross
@ 2013-05-24 21:57         ` Eric Dumazet
  2013-05-24 22:18           ` Jesse Gross
  0 siblings, 1 reply; 20+ messages in thread
From: Eric Dumazet @ 2013-05-24 21:57 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Thomas Graf, netdev, dev

On Fri, 2013-05-24 at 14:23 -0700, Jesse Gross wrote:
> On Fri, May 24, 2013 at 11:58 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > On Fri, 2013-05-24 at 10:24 -0700, Jesse Gross wrote:
> >
> >> Does this have any impact on small packets? Those are usually the
> >> common case (i.e. TCP SYN) and I think this is slightly less optimal
> >> for those.
> >
> > No difference at all, small packets are copied anyway in skb->head
> 
> Yes, but it makes the Open vSwitch code slightly worse - for example,
> currently checksumming and copying are done in a single step but this
> prevents that. Actually, I'm also curious about the test case that was
> used for large packets and the full profile output since checksumming
> and GSO aren't listed in the one that Thomas gave.
> 

GSO is fully supported in nfnetlink, I see no reason why Open vSwitch
would not allow that.

> My guess is that there isn't a real different for small packets since
> everything will be in the cache but it seems worth checking given that
> this is optimizing a rare case at the expense of the common one.

I really doubt checksumming a SYN/ACK packet is that a performance
issue. Do you have performance numbers ?

You could always provide a patch to restore this copy/checksum if it
really gives a benefit, and if people still use NIC not doing this
checksum.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-24 21:57         ` Eric Dumazet
@ 2013-05-24 22:18           ` Jesse Gross
  2013-05-24 22:34             ` Eric Dumazet
  2013-05-25  7:02             ` Thomas Graf
  0 siblings, 2 replies; 20+ messages in thread
From: Jesse Gross @ 2013-05-24 22:18 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Thomas Graf, netdev, dev

On Fri, May 24, 2013 at 2:57 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2013-05-24 at 14:23 -0700, Jesse Gross wrote:
>> On Fri, May 24, 2013 at 11:58 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > On Fri, 2013-05-24 at 10:24 -0700, Jesse Gross wrote:
>> >
>> >> Does this have any impact on small packets? Those are usually the
>> >> common case (i.e. TCP SYN) and I think this is slightly less optimal
>> >> for those.
>> >
>> > No difference at all, small packets are copied anyway in skb->head
>>
>> Yes, but it makes the Open vSwitch code slightly worse - for example,
>> currently checksumming and copying are done in a single step but this
>> prevents that. Actually, I'm also curious about the test case that was
>> used for large packets and the full profile output since checksumming
>> and GSO aren't listed in the one that Thomas gave.
>>
>
> GSO is fully supported in nfnetlink, I see no reason why Open vSwitch
> would not allow that.

Offloads are supported. What I want to know is how they affect
performance with this change.

>> My guess is that there isn't a real different for small packets since
>> everything will be in the cache but it seems worth checking given that
>> this is optimizing a rare case at the expense of the common one.
>
> I really doubt checksumming a SYN/ACK packet is that a performance
> issue. Do you have performance numbers ?
>
> You could always provide a patch to restore this copy/checksum if it
> really gives a benefit, and if people still use NIC not doing this
> checksum.

If it makes a difference then it needs to be addressed before this
patch goes in since it's the common case. I don't think it will but
that's why I'm asking for numbers.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-24 22:18           ` Jesse Gross
@ 2013-05-24 22:34             ` Eric Dumazet
  2013-05-25  0:08               ` Jesse Gross
  2013-05-25  7:02             ` Thomas Graf
  1 sibling, 1 reply; 20+ messages in thread
From: Eric Dumazet @ 2013-05-24 22:34 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Thomas Graf, netdev, dev

On Fri, 2013-05-24 at 15:18 -0700, Jesse Gross wrote:

> Offloads are supported. What I want to know is how they affect
> performance with this change.

Hmm, I do not understand why you are checksumming then.

skb_copy_and_csum_dev() is a killer.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-24 22:34             ` Eric Dumazet
@ 2013-05-25  0:08               ` Jesse Gross
  2013-05-25  0:32                 ` Eric Dumazet
  0 siblings, 1 reply; 20+ messages in thread
From: Jesse Gross @ 2013-05-25  0:08 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Thomas Graf, netdev, dev

On Fri, May 24, 2013 at 3:34 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2013-05-24 at 15:18 -0700, Jesse Gross wrote:
>
>> Offloads are supported. What I want to know is how they affect
>> performance with this change.
>
> Hmm, I do not understand why you are checksumming then.

Checksum offloading is an internal kernel optimization, so the goal is
to shield userspace from needing to know about it.

> skb_copy_and_csum_dev() is a killer.

What's the alternative?

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-25  0:08               ` Jesse Gross
@ 2013-05-25  0:32                 ` Eric Dumazet
  0 siblings, 0 replies; 20+ messages in thread
From: Eric Dumazet @ 2013-05-25  0:32 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Thomas Graf, netdev, dev

On Fri, 2013-05-24 at 17:08 -0700, Jesse Gross wrote:

> What's the alternative?

I guess Thomas was working on this ;)

We addressed all this on nfnetlink lately.

Presumably GSO stuff is a non issue for vSwitch upcalls.

But UDP messages can be big, and their checksum might be already
validated by the NIC.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-24 22:18           ` Jesse Gross
  2013-05-24 22:34             ` Eric Dumazet
@ 2013-05-25  7:02             ` Thomas Graf
  2013-05-25 14:39               ` Eric Dumazet
  1 sibling, 1 reply; 20+ messages in thread
From: Thomas Graf @ 2013-05-25  7:02 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Eric Dumazet, netdev, dev

On 05/24/13 at 03:18pm, Jesse Gross wrote:
> On Fri, May 24, 2013 at 2:57 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >> My guess is that there isn't a real different for small packets since
> >> everything will be in the cache but it seems worth checking given that
> >> this is optimizing a rare case at the expense of the common one.
> >
> > I really doubt checksumming a SYN/ACK packet is that a performance
> > issue. Do you have performance numbers ?
> >
> > You could always provide a patch to restore this copy/checksum if it
> > really gives a benefit, and if people still use NIC not doing this
> > checksum.
> 
> If it makes a difference then it needs to be addressed before this
> patch goes in since it's the common case. I don't think it will but
> that's why I'm asking for numbers.

I ran TCP_CRR to verify the SYN/ACK use case and I did not
observe a difference. If you have any specific test in mind
I will be glad to run that before posting the 2nd revision.

The CPU numbers as reported was seen when testing with pktgen
at 1400 bytes with 8K flows and a 10/8 random IP source address.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-25  7:02             ` Thomas Graf
@ 2013-05-25 14:39               ` Eric Dumazet
  2013-05-27  1:28                 ` Jesse Gross
  0 siblings, 1 reply; 20+ messages in thread
From: Eric Dumazet @ 2013-05-25 14:39 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Jesse Gross, netdev, dev

On Sat, 2013-05-25 at 08:02 +0100, Thomas Graf wrote:

> I ran TCP_CRR to verify the SYN/ACK use case and I did not
> observe a difference. If you have any specific test in mind
> I will be glad to run that before posting the 2nd revision.

I guess you should test with rx checksum disabled as well, Jesse seemed
to be concerned about that.

ethtool -K eth0 rx off

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-25 14:39               ` Eric Dumazet
@ 2013-05-27  1:28                 ` Jesse Gross
  2013-06-07 12:21                   ` Thomas Graf
  0 siblings, 1 reply; 20+ messages in thread
From: Jesse Gross @ 2013-05-27  1:28 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, dev-yBygre7rU0TnMu66kgdUjQ


[-- Attachment #1.1: Type: text/plain, Size: 696 bytes --]

On Saturday, May 25, 2013, Eric Dumazet wrote:

> On Sat, 2013-05-25 at 08:02 +0100, Thomas Graf wrote:
>
> > I ran TCP_CRR to verify the SYN/ACK use case and I did not
> > observe a difference. If you have any specific test in mind
> > I will be glad to run that before posting the 2nd revision.
>
> I guess you should test with rx checksum disabled as well, Jesse seemed
> to be concerned about that.


 I was actually thinking about the transmit side - rx checksum verification
doesn't matter much here since the result will get thrown away. However, if
the packet is CHECKSUM_PARTIAL then the checksum will have to get filled in
first and that's the code path that is a little different now.

[-- Attachment #1.2: Type: text/html, Size: 1005 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-05-27  1:28                 ` Jesse Gross
@ 2013-06-07 12:21                   ` Thomas Graf
  2013-06-10 20:34                     ` Jesse Gross
  0 siblings, 1 reply; 20+ messages in thread
From: Thomas Graf @ 2013-06-07 12:21 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Eric Dumazet, netdev, dev

On 05/27/13 at 10:28am, Jesse Gross wrote:
> On Saturday, May 25, 2013, Eric Dumazet wrote:
> 
> > On Sat, 2013-05-25 at 08:02 +0100, Thomas Graf wrote:
> >
> > > I ran TCP_CRR to verify the SYN/ACK use case and I did not
> > > observe a difference. If you have any specific test in mind
> > > I will be glad to run that before posting the 2nd revision.
> >
> > I guess you should test with rx checksum disabled as well, Jesse seemed
> > to be concerned about that.
> 
> 
>  I was actually thinking about the transmit side - rx checksum verification
> doesn't matter much here since the result will get thrown away. However, if
> the packet is CHECKSUM_PARTIAL then the checksum will have to get filled in
> first and that's the code path that is a little different now.

Do we actually need to complete the checksum before doing the
upcall and if so, why? Couldn't the slow path do that if needed?
The only reason I can think of where it would matter is if a
controller injects the packet into another network stack such
as RouteFlow.

On the receive side, hitting an ovs bridge connected to two
interface with ~18G of randomized flows:

3.10.0-rc2+:
+   9.44%   ksoftirqd/0  [k] csum_partial_copy_generic
+   3.54%  ovs-vswitchd  [k] copy_user_enhanced_fast_string
+   3.08%       swapper  [k] intel_idle
+   2.47%   ksoftirqd/0  [k] memset
+   2.21%   ksoftirqd/0  [k] memcpy
+   1.64%   ksoftirqd/0  [k] __nla_reserve
+   1.26%  ovs-vswitchd  [k] netlink_recvmsg
+   1.07%   ksoftirqd/0  [k] __netif_receive_skb_core
+   1.02%   ksoftirqd/0  [k] __pskb_pull_tail
+   0.98%       swapper  [k] csum_partial_copy_generic
+   0.95%  ovs-vswitchd  [k] _raw_spin_lock
+   0.94%  ovs-vswitchd  [k] _raw_spin_lock_irqsave
+   0.83%  ovs-vswitchd  [k] memcpy
+   0.80%   ksoftirqd/0  [k] __alloc_skb
+   0.78%   ksoftirqd/0  [k] skb_copy_bits
+   0.71%       swapper  [k] menu_select
+   0.71%   ksoftirqd/2  [k] csum_partial_copy_generic                              

3.10.0-rc2+ + zerocopy
+   4.84%    ovs-vswitchd  [k] copy_user_enhanced_fast_string
+   4.74%         swapper  [k] intel_idle
+   3.14%         swapper  [k] memset
+   3.04%         swapper  [k] memcpy
+   2.10%         swapper  [k] __nla_reserve
+   1.44%         swapper  [k] skb_copy_bits
+   1.40%         swapper  [k] __netif_receive_skb_core
+   1.34%         swapper  [k] __pskb_pull_tail
+   1.23%    ovs-vswitchd  [k] _raw_spin_lock
+   1.16%    ovs-vswitchd  [k] fib_table_lookup
+   1.16%         swapper  [k] irq_entries_start
+   1.09%    ovs-vswitchd  [k] memcpy
+   1.05%         swapper  [k] __alloc_skb
+   1.03%         swapper  [k] lapic_next_deadline
+   1.02%         swapper  [k] build_skb
+   0.99%         swapper  [k] skb_zerocopy                  

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-06-07 12:21                   ` Thomas Graf
@ 2013-06-10 20:34                     ` Jesse Gross
  2013-06-10 20:43                       ` David Miller
  0 siblings, 1 reply; 20+ messages in thread
From: Jesse Gross @ 2013-06-10 20:34 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Eric Dumazet, netdev, dev

On Fri, Jun 7, 2013 at 5:21 AM, Thomas Graf <tgraf@suug.ch> wrote:
> On 05/27/13 at 10:28am, Jesse Gross wrote:
>> On Saturday, May 25, 2013, Eric Dumazet wrote:
>>
>> > On Sat, 2013-05-25 at 08:02 +0100, Thomas Graf wrote:
>> >
>> > > I ran TCP_CRR to verify the SYN/ACK use case and I did not
>> > > observe a difference. If you have any specific test in mind
>> > > I will be glad to run that before posting the 2nd revision.
>> >
>> > I guess you should test with rx checksum disabled as well, Jesse seemed
>> > to be concerned about that.
>>
>>
>>  I was actually thinking about the transmit side - rx checksum verification
>> doesn't matter much here since the result will get thrown away. However, if
>> the packet is CHECKSUM_PARTIAL then the checksum will have to get filled in
>> first and that's the code path that is a little different now.
>
> Do we actually need to complete the checksum before doing the
> upcall and if so, why? Couldn't the slow path do that if needed?
> The only reason I can think of where it would matter is if a
> controller injects the packet into another network stack such
> as RouteFlow.

Well, this is the slow path. I don't want to force userspace to deal
with this because it's an internal kernel optimization that is
platform-specific and requires carrying additional metadata around.

It's also very common for a packet to traverse another network stack -
it's typically a VM's.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-06-10 20:34                     ` Jesse Gross
@ 2013-06-10 20:43                       ` David Miller
  2013-06-10 20:55                         ` Jesse Gross
  0 siblings, 1 reply; 20+ messages in thread
From: David Miller @ 2013-06-10 20:43 UTC (permalink / raw)
  To: jesse; +Cc: tgraf, eric.dumazet, netdev, dev

From: Jesse Gross <jesse@nicira.com>
Date: Mon, 10 Jun 2013 13:34:23 -0700

> Well, this is the slow path. I don't want to force userspace to deal
> with this because it's an internal kernel optimization that is
> platform-specific and requires carrying additional metadata around.

I think the grounds for this rejection are over the top and
unreasonable.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall
  2013-06-10 20:43                       ` David Miller
@ 2013-06-10 20:55                         ` Jesse Gross
  0 siblings, 0 replies; 20+ messages in thread
From: Jesse Gross @ 2013-06-10 20:55 UTC (permalink / raw)
  To: David Miller; +Cc: Thomas Graf, Eric Dumazet, netdev, dev

On Mon, Jun 10, 2013 at 1:43 PM, David Miller <davem@davemloft.net> wrote:
> From: Jesse Gross <jesse@nicira.com>
> Date: Mon, 10 Jun 2013 13:34:23 -0700
>
>> Well, this is the slow path. I don't want to force userspace to deal
>> with this because it's an internal kernel optimization that is
>> platform-specific and requires carrying additional metadata around.
>
> I think the grounds for this rejection is over the top and
> unreasonable.

I didn't reject it and in fact, I expect it to go in. I asked for
numbers as to how this affects the common case.

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2013-06-10 20:55 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-24 14:52 [PATCH net-next 0/2] Open vSwitch zerocopy upcall Thomas Graf
2013-05-24 14:52 ` [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf
2013-05-24 15:31   ` Eric Dumazet
2013-05-24 16:01     ` Thomas Graf
2013-05-24 14:52 ` [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall Thomas Graf
2013-05-24 17:24   ` Jesse Gross
2013-05-24 18:58     ` Eric Dumazet
2013-05-24 21:23       ` Jesse Gross
2013-05-24 21:57         ` Eric Dumazet
2013-05-24 22:18           ` Jesse Gross
2013-05-24 22:34             ` Eric Dumazet
2013-05-25  0:08               ` Jesse Gross
2013-05-25  0:32                 ` Eric Dumazet
2013-05-25  7:02             ` Thomas Graf
2013-05-25 14:39               ` Eric Dumazet
2013-05-27  1:28                 ` Jesse Gross
2013-06-07 12:21                   ` Thomas Graf
2013-06-10 20:34                     ` Jesse Gross
2013-06-10 20:43                       ` David Miller
2013-06-10 20:55                         ` Jesse Gross

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).