* [PATCH net-next 0/2] Open vSwitch zerocopy upcall @ 2013-05-24 14:52 Thomas Graf 2013-05-24 14:52 ` [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf 2013-05-24 14:52 ` [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall Thomas Graf 0 siblings, 2 replies; 21+ messages in thread From: Thomas Graf @ 2013-05-24 14:52 UTC (permalink / raw) To: netdev, jesse; +Cc: dev, eric.dumazet Posting this to netdev due to dependency on first patch. Thomas Graf (2): net: Export skb_zerocopy() to zerocopy from one skb to another openvswitch: Use zerocopy if applicable when performing the upcall include/linux/skbuff.h | 2 ++ net/core/skbuff.c | 46 +++++++++++++++++++++++++++++++++++ net/netfilter/nfnetlink_queue_core.c | 47 +----------------------------------- net/openvswitch/datapath.c | 38 ++++++++++++++++++++++++----- 4 files changed, 81 insertions(+), 52 deletions(-) -- 1.7.11.7 ^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another 2013-05-24 14:52 [PATCH net-next 0/2] Open vSwitch zerocopy upcall Thomas Graf @ 2013-05-24 14:52 ` Thomas Graf 2013-05-24 15:31 ` Eric Dumazet 2013-05-24 14:52 ` [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall Thomas Graf 1 sibling, 1 reply; 21+ messages in thread From: Thomas Graf @ 2013-05-24 14:52 UTC (permalink / raw) To: netdev, jesse; +Cc: dev, eric.dumazet Make the skb zerocopy logic written for nfnetlink queue available for use by other modules. Signed-off-by: Thomas Graf <tgraf@suug.ch> --- include/linux/skbuff.h | 2 ++ net/core/skbuff.c | 46 +++++++++++++++++++++++++++++++++++ net/netfilter/nfnetlink_queue_core.c | 47 +----------------------------------- 3 files changed, 49 insertions(+), 46 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2e0ced1..ff2cbe8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2478,6 +2478,8 @@ extern int skb_splice_bits(struct sk_buff *skb, unsigned int len, unsigned int flags); extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); +extern void skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, + int len, int hlen); extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d629891..cb60bf6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2118,6 +2118,52 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) } EXPORT_SYMBOL(skb_copy_and_csum_dev); +void +skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) +{ + int i, j = 0; + int plen = 0; /* length of skb->head fragment */ + struct page *page; + unsigned int offset; + + /* dont bother with small payloads */ + if (len <= skb_tailroom(to)) { + skb_copy_bits(from, 0, skb_put(to, len), len); + 
return; + } + + if (hlen) { + skb_copy_bits(from, 0, skb_put(to, hlen), hlen); + len -= hlen; + } else { + plen = min_t(int, skb_headlen(from), len); + if (plen) { + page = virt_to_head_page(from->head); + offset = from->data - (unsigned char *)page_address(page); + __skb_fill_page_desc(to, 0, page, offset, plen); + get_page(page); + j = 1; + len -= plen; + } + } + + to->truesize += len + plen; + to->len += len + plen; + to->data_len += len + plen; + + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { + if (!len) + break; + skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; + skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); + len -= skb_shinfo(to)->frags[j].size; + skb_frag_ref(to, j); + j++; + } + skb_shinfo(to)->nr_frags = j; +} +EXPORT_SYMBOL_GPL(skb_zerocopy); + /** * skb_dequeue - remove from the head of the queue * @list: list to dequeue from diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 2e0e835..7ac9dba 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -227,51 +227,6 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) spin_unlock_bh(&queue->lock); } -static void -nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) -{ - int i, j = 0; - int plen = 0; /* length of skb->head fragment */ - struct page *page; - unsigned int offset; - - /* dont bother with small payloads */ - if (len <= skb_tailroom(to)) { - skb_copy_bits(from, 0, skb_put(to, len), len); - return; - } - - if (hlen) { - skb_copy_bits(from, 0, skb_put(to, hlen), hlen); - len -= hlen; - } else { - plen = min_t(int, skb_headlen(from), len); - if (plen) { - page = virt_to_head_page(from->head); - offset = from->data - (unsigned char *)page_address(page); - __skb_fill_page_desc(to, 0, page, offset, plen); - get_page(page); - j = 1; - len -= plen; - } - } - - to->truesize += len + plen; - to->len += len + plen; - 
to->data_len += len + plen; - - for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { - if (!len) - break; - skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; - skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); - len -= skb_shinfo(to)->frags[j].size; - skb_frag_ref(to, j); - j++; - } - skb_shinfo(to)->nr_frags = j; -} - static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet) { __u32 flags = 0; @@ -481,7 +436,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, nla->nla_type = NFQA_PAYLOAD; nla->nla_len = nla_attr_size(data_len); - nfqnl_zcopy(skb, entskb, data_len, hlen); + skb_zerocopy(skb, entskb, data_len, hlen); } nlh->nlmsg_len = skb->len; -- 1.7.11.7 ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another 2013-05-24 14:52 ` [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf @ 2013-05-24 15:31 ` Eric Dumazet 2013-05-24 16:01 ` Thomas Graf 0 siblings, 1 reply; 21+ messages in thread From: Eric Dumazet @ 2013-05-24 15:31 UTC (permalink / raw) To: Thomas Graf; +Cc: netdev, jesse, dev On Fri, 2013-05-24 at 16:52 +0200, Thomas Graf wrote: > Make the skb zerocopy logic written for nfnetlink queue available for > use by other modules. > > Signed-off-by: Thomas Graf <tgraf@suug.ch> > --- > include/linux/skbuff.h | 2 ++ > net/core/skbuff.c | 46 +++++++++++++++++++++++++++++++++++ > net/netfilter/nfnetlink_queue_core.c | 47 +----------------------------------- > 3 files changed, 49 insertions(+), 46 deletions(-) > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 2e0ced1..ff2cbe8 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -2478,6 +2478,8 @@ extern int skb_splice_bits(struct sk_buff *skb, > unsigned int len, > unsigned int flags); > extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); > +extern void skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, > + int len, int hlen); > extern void skb_split(struct sk_buff *skb, > struct sk_buff *skb1, const u32 len); > extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index d629891..cb60bf6 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -2118,6 +2118,52 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) > } > EXPORT_SYMBOL(skb_copy_and_csum_dev); > > +void > +skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) > +{ > + int i, j = 0; > + int plen = 0; /* length of skb->head fragment */ > + struct page *page; > + unsigned int offset; > + > + /* dont bother with small payloads */ > + if (len <= 
skb_tailroom(to)) { > + skb_copy_bits(from, 0, skb_put(to, len), len); > + return; > + } > + > + if (hlen) { > + skb_copy_bits(from, 0, skb_put(to, hlen), hlen); > + len -= hlen; > + } else { > + plen = min_t(int, skb_headlen(from), len); > + if (plen) { > + page = virt_to_head_page(from->head); Hmm. Big warning here : In the nfnetlink case, we specifically made sure that this path was taken only if skb->head_frag was set (all incoming packets have this property, but not outgoing ones yet) This should be documented if we move it to net/core/skbuff.c, or we should add a BUG_ON() I see your openvswitch code seems OK. Maybe also put in net/core/skbuff.c the code dealing with this stuff ? + if (!skb->head_frag || + skb_headlen(skb) < L1_CACHE_BYTES || + skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS) + hlen = skb_headlen(skb); + + if (skb_has_frag_list(skb)) + hlen = skb->len; > + offset = from->data - (unsigned char *)page_address(page); > + __skb_fill_page_desc(to, 0, page, offset, plen); > + get_page(page); > + j = 1; > + len -= plen; > + } > + } ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another 2013-05-24 15:31 ` Eric Dumazet @ 2013-05-24 16:01 ` Thomas Graf 0 siblings, 0 replies; 21+ messages in thread From: Thomas Graf @ 2013-05-24 16:01 UTC (permalink / raw) To: Eric Dumazet; +Cc: netdev, jesse, dev On 05/24/13 at 08:31am, Eric Dumazet wrote: > Hmm. Big warning here : In the nfnetlink case, we specifically made sure > that this path was taken only if skb->head_frag was set (all incoming > packets have this property, but not outgoing ones yet) > > This should be documented if we move it to net/core/skbuff.c, or we > should add a BUG_ON() Good point. I guess we could ensure in skb_zerocopy() that if !head_frag then we enforce hlen = max(hlen, skb_headlen(skb)) to always fall back to skb_copy_bits() for !head_frag. > I see your openvswitch code seems OK. > > Maybe also put in net/core/skbuff.c the code dealing with this stuff ? > > + if (!skb->head_frag || > + skb_headlen(skb) < L1_CACHE_BYTES || > + skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS) > + hlen = skb_headlen(skb); > + > + if (skb_has_frag_list(skb)) > + hlen = skb->len; I'll put this into skb_zerocopy_hdrlen() ^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-24 14:52 [PATCH net-next 0/2] Open vSwitch zerocopy upcall Thomas Graf 2013-05-24 14:52 ` [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf @ 2013-05-24 14:52 ` Thomas Graf 2013-05-24 17:24 ` Jesse Gross 1 sibling, 1 reply; 21+ messages in thread From: Thomas Graf @ 2013-05-24 14:52 UTC (permalink / raw) To: netdev, jesse; +Cc: dev, eric.dumazet, Thomas Graf From: Thomas Graf <tgraf@rhlap.localdomain> Avoids a memcpy() that is expensive for large packets: from: 4.51% [kernel] [k] memcpy to: 1.20% [kernel] [k] memcpy 1.01% [kernel] [k] skb_zerocopy Signed-off-by: Thomas Graf <tgraf@suug.ch> --- net/openvswitch/datapath.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d12d6b8..b434adf 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -375,10 +375,11 @@ static size_t key_attr_size(void) } static size_t upcall_msg_size(const struct sk_buff *skb, - const struct nlattr *userdata) + const struct nlattr *userdata, + unsigned int hdrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) - + nla_total_size(skb->len) /* OVS_PACKET_ATTR_PACKET */ + + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */ /* OVS_PACKET_ATTR_USERDATA */ @@ -396,6 +397,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, struct sk_buff *nskb = NULL; struct sk_buff *user_skb; /* to be queued to userspace */ struct nlattr *nla; + unsigned int plen, hlen = 0; int err; if (vlan_tx_tag_present(skb)) { @@ -416,7 +418,21 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, goto out; } - user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata), GFP_ATOMIC); + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = 
skb_checksum_help(skb))) + goto out; + + if (!skb->head_frag || + skb_headlen(skb) < L1_CACHE_BYTES || + skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS) + hlen = skb_headlen(skb); + + if (skb_has_frag_list(skb)) + hlen = skb->len; + + hlen = min_t(int, skb->len, hlen); + + user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata, hlen), GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; @@ -435,11 +451,21 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, nla_len(upcall_info->userdata), nla_data(upcall_info->userdata)); - nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len); + if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) + goto out; + nla->nla_len = nla_attr_size(skb->len); + + skb_zerocopy(user_skb, skb, skb->len, hlen); - skb_copy_and_csum_dev(skb, nla_data(nla)); + plen = NLA_ALIGN(user_skb->len) - user_skb->len; + if (plen > 0) + memset(skb_put(user_skb, plen), 0, plen); + + /* Fix alignment of .nlmsg_len, OVS user space enforces a strict + * total message size alignment. + */ + ((struct nlmsghdr *) user_skb->data)->nlmsg_len = NLA_ALIGN(user_skb->len); - genlmsg_end(user_skb, upcall); err = genlmsg_unicast(net, user_skb, upcall_info->portid); out: -- 1.7.11.7 ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-24 14:52 ` [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall Thomas Graf @ 2013-05-24 17:24 ` Jesse Gross 2013-05-24 18:58 ` Eric Dumazet 0 siblings, 1 reply; 21+ messages in thread From: Jesse Gross @ 2013-05-24 17:24 UTC (permalink / raw) To: Thomas Graf; +Cc: netdev, dev, Eric Dumazet On Fri, May 24, 2013 at 7:52 AM, Thomas Graf <tgraf@suug.ch> wrote: > From: Thomas Graf <tgraf@rhlap.localdomain> > > Avoids a memcpy() that is expensive for large packets: > > from: > 4.51% [kernel] [k] memcpy > to: > 1.20% [kernel] [k] memcpy > 1.01% [kernel] [k] skb_zerocopy > > Signed-off-by: Thomas Graf <tgraf@suug.ch> Does this have any impact on small packets? Those are usually the common case (i.e. TCP SYN) and I think this is slightly less optimal for those. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-24 17:24 ` Jesse Gross @ 2013-05-24 18:58 ` Eric Dumazet 2013-05-24 21:23 ` Jesse Gross 0 siblings, 1 reply; 21+ messages in thread From: Eric Dumazet @ 2013-05-24 18:58 UTC (permalink / raw) To: Jesse Gross; +Cc: Thomas Graf, netdev, dev On Fri, 2013-05-24 at 10:24 -0700, Jesse Gross wrote: > Does this have any impact on small packets? Those are usually the > common case (i.e. TCP SYN) and I think this is slightly less optimal > for those. No difference at all, small packets are copied anyway in skb->head ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-24 18:58 ` Eric Dumazet @ 2013-05-24 21:23 ` Jesse Gross 2013-05-24 21:57 ` Eric Dumazet 0 siblings, 1 reply; 21+ messages in thread From: Jesse Gross @ 2013-05-24 21:23 UTC (permalink / raw) To: Eric Dumazet; +Cc: Thomas Graf, netdev, dev On Fri, May 24, 2013 at 11:58 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > On Fri, 2013-05-24 at 10:24 -0700, Jesse Gross wrote: > >> Does this have any impact on small packets? Those are usually the >> common case (i.e. TCP SYN) and I think this is slightly less optimal >> for those. > > No difference at all, small packets are copied anyway in skb->head Yes, but it makes the Open vSwitch code slightly worse - for example, currently checksumming and copying are done in a single step but this prevents that. Actually, I'm also curious about the test case that was used for large packets and the full profile output since checksumming and GSO aren't listed in the one that Thomas gave. My guess is that there isn't a real difference for small packets since everything will be in the cache but it seems worth checking given that this is optimizing a rare case at the expense of the common one. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-24 21:23 ` Jesse Gross @ 2013-05-24 21:57 ` Eric Dumazet 2013-05-24 22:18 ` Jesse Gross 0 siblings, 1 reply; 21+ messages in thread From: Eric Dumazet @ 2013-05-24 21:57 UTC (permalink / raw) To: Jesse Gross; +Cc: Thomas Graf, netdev, dev On Fri, 2013-05-24 at 14:23 -0700, Jesse Gross wrote: > On Fri, May 24, 2013 at 11:58 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > > On Fri, 2013-05-24 at 10:24 -0700, Jesse Gross wrote: > > > >> Does this have any impact on small packets? Those are usually the > >> common case (i.e. TCP SYN) and I think this is slightly less optimal > >> for those. > > > > No difference at all, small packets are copied anyway in skb->head > > Yes, but it makes the Open vSwitch code slightly worse - for example, > currently checksumming and copying are done in a single step but this > prevents that. Actually, I'm also curious about the test case that was > used for large packets and the full profile output since checksumming > and GSO aren't listed in the one that Thomas gave. > GSO is fully supported in nfnetlink, I see no reason why Open vSwitch would not allow that. > My guess is that there isn't a real different for small packets since > everything will be in the cache but it seems worth checking given that > this is optimizing a rare case at the expense of the common one. I really doubt checksumming a SYN/ACK packet is that a performance issue. Do you have performance numbers ? You could always provide a patch to restore this copy/checksum if it really gives a benefit, and if people still use NIC not doing this checksum. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-24 21:57 ` Eric Dumazet @ 2013-05-24 22:18 ` Jesse Gross 2013-05-24 22:34 ` Eric Dumazet 2013-05-25 7:02 ` Thomas Graf 0 siblings, 2 replies; 21+ messages in thread From: Jesse Gross @ 2013-05-24 22:18 UTC (permalink / raw) To: Eric Dumazet; +Cc: Thomas Graf, netdev, dev On Fri, May 24, 2013 at 2:57 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > On Fri, 2013-05-24 at 14:23 -0700, Jesse Gross wrote: >> On Fri, May 24, 2013 at 11:58 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote: >> > On Fri, 2013-05-24 at 10:24 -0700, Jesse Gross wrote: >> > >> >> Does this have any impact on small packets? Those are usually the >> >> common case (i.e. TCP SYN) and I think this is slightly less optimal >> >> for those. >> > >> > No difference at all, small packets are copied anyway in skb->head >> >> Yes, but it makes the Open vSwitch code slightly worse - for example, >> currently checksumming and copying are done in a single step but this >> prevents that. Actually, I'm also curious about the test case that was >> used for large packets and the full profile output since checksumming >> and GSO aren't listed in the one that Thomas gave. >> > > GSO is fully supported in nfnetlink, I see no reason why Open vSwitch > would not allow that. Offloads are supported. What I want to know is how they affect performance with this change. >> My guess is that there isn't a real different for small packets since >> everything will be in the cache but it seems worth checking given that >> this is optimizing a rare case at the expense of the common one. > > I really doubt checksumming a SYN/ACK packet is that a performance > issue. Do you have performance numbers ? > > You could always provide a patch to restore this copy/checksum if it > really gives a benefit, and if people still use NIC not doing this > checksum. 
If it makes a difference then it needs to be addressed before this patch goes in since it's the common case. I don't think it will but that's why I'm asking for numbers. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-24 22:18 ` Jesse Gross @ 2013-05-24 22:34 ` Eric Dumazet 2013-05-25 0:08 ` Jesse Gross 2013-05-25 7:02 ` Thomas Graf 1 sibling, 1 reply; 21+ messages in thread From: Eric Dumazet @ 2013-05-24 22:34 UTC (permalink / raw) To: Jesse Gross; +Cc: Thomas Graf, netdev, dev On Fri, 2013-05-24 at 15:18 -0700, Jesse Gross wrote: > Offloads are supported. What I want to know is how they affect > performance with this change. Hmm, I do not understand why you are checksumming then. skb_copy_and_csum_dev() is a killer. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-24 22:34 ` Eric Dumazet @ 2013-05-25 0:08 ` Jesse Gross 2013-05-25 0:32 ` Eric Dumazet 0 siblings, 1 reply; 21+ messages in thread From: Jesse Gross @ 2013-05-25 0:08 UTC (permalink / raw) To: Eric Dumazet; +Cc: Thomas Graf, netdev, dev On Fri, May 24, 2013 at 3:34 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > On Fri, 2013-05-24 at 15:18 -0700, Jesse Gross wrote: > >> Offloads are supported. What I want to know is how they affect >> performance with this change. > > Hmm, I do not understand why you are checksumming then. Checksum offloading is an internal kernel optimization, so the goal is to shield userspace from needing to know about it. > skb_copy_and_csum_dev() is a killer. What's the alternative? ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-25 0:08 ` Jesse Gross @ 2013-05-25 0:32 ` Eric Dumazet 0 siblings, 0 replies; 21+ messages in thread From: Eric Dumazet @ 2013-05-25 0:32 UTC (permalink / raw) To: Jesse Gross; +Cc: Thomas Graf, netdev, dev On Fri, 2013-05-24 at 17:08 -0700, Jesse Gross wrote: > What's the alternative? I guess Thomas was working on this ;) We addressed all this on nfnetlink lately. Presumably GSO stuff is a non issue for vSwitch upcalls. But UDP messages can be big, and their checksum might be already validated by the NIC. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-24 22:18 ` Jesse Gross 2013-05-24 22:34 ` Eric Dumazet @ 2013-05-25 7:02 ` Thomas Graf 2013-05-25 14:39 ` Eric Dumazet 1 sibling, 1 reply; 21+ messages in thread From: Thomas Graf @ 2013-05-25 7:02 UTC (permalink / raw) To: Jesse Gross; +Cc: Eric Dumazet, netdev, dev On 05/24/13 at 03:18pm, Jesse Gross wrote: > On Fri, May 24, 2013 at 2:57 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > >> My guess is that there isn't a real different for small packets since > >> everything will be in the cache but it seems worth checking given that > >> this is optimizing a rare case at the expense of the common one. > > > > I really doubt checksumming a SYN/ACK packet is that a performance > > issue. Do you have performance numbers ? > > > > You could always provide a patch to restore this copy/checksum if it > > really gives a benefit, and if people still use NIC not doing this > > checksum. > > If it makes a difference then it needs to be addressed before this > patch goes in since it's the common case. I don't think it will but > that's why I'm asking for numbers. I ran TCP_CRR to verify the SYN/ACK use case and I did not observe a difference. If you have any specific test in mind I will be glad to run that before posting the 2nd revision. The CPU numbers as reported was seen when testing with pktgen at 1400 bytes with 8K flows and a 10/8 random IP source address. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-25 7:02 ` Thomas Graf @ 2013-05-25 14:39 ` Eric Dumazet 2013-05-27 1:28 ` Jesse Gross 0 siblings, 1 reply; 21+ messages in thread From: Eric Dumazet @ 2013-05-25 14:39 UTC (permalink / raw) To: Thomas Graf; +Cc: Jesse Gross, netdev, dev On Sat, 2013-05-25 at 08:02 +0100, Thomas Graf wrote: > I ran TCP_CRR to verify the SYN/ACK use case and I did not > observe a difference. If you have any specific test in mind > I will be glad to run that before posting the 2nd revision. I guess you should test with rx checksum disabled as well, Jesse seemed to be concerned about that. ethtool -K eth0 rx off ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-25 14:39 ` Eric Dumazet @ 2013-05-27 1:28 ` Jesse Gross 2013-06-07 12:21 ` Thomas Graf 0 siblings, 1 reply; 21+ messages in thread From: Jesse Gross @ 2013-05-27 1:28 UTC (permalink / raw) To: Eric Dumazet; +Cc: netdev, dev-yBygre7rU0TnMu66kgdUjQ [-- Attachment #1.1: Type: text/plain, Size: 696 bytes --] On Saturday, May 25, 2013, Eric Dumazet wrote: > On Sat, 2013-05-25 at 08:02 +0100, Thomas Graf wrote: > > > I ran TCP_CRR to verify the SYN/ACK use case and I did not > > observe a difference. If you have any specific test in mind > > I will be glad to run that before posting the 2nd revision. > > I guess you should test with rx checksum disabled as well, Jesse seemed > to be concerned about that. I was actually thinking about the transmit side - rx checksum verification doesn't matter much here since the result will get thrown away. However, if the packet is CHECKSUM_PARTIAL then the checksum will have to get filled in first and that's the code path that is a little different now. [-- Attachment #1.2: Type: text/html, Size: 1005 bytes --] [-- Attachment #2: Type: text/plain, Size: 0 bytes --] ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-05-27 1:28 ` Jesse Gross @ 2013-06-07 12:21 ` Thomas Graf 2013-06-10 20:34 ` Jesse Gross 0 siblings, 1 reply; 21+ messages in thread From: Thomas Graf @ 2013-06-07 12:21 UTC (permalink / raw) To: Jesse Gross; +Cc: Eric Dumazet, netdev, dev On 05/27/13 at 10:28am, Jesse Gross wrote: > On Saturday, May 25, 2013, Eric Dumazet wrote: > > > On Sat, 2013-05-25 at 08:02 +0100, Thomas Graf wrote: > > > > > I ran TCP_CRR to verify the SYN/ACK use case and I did not > > > observe a difference. If you have any specific test in mind > > > I will be glad to run that before posting the 2nd revision. > > > > I guess you should test with rx checksum disabled as well, Jesse seemed > > to be concerned about that. > > > I was actually thinking about the transmit side - rx checksum verification > doesn't matter much here since the result will get thrown away. However, if > the packet is CHECKSUM_PARTIAL then the checksum will have to get filled in > first and that's the code path that is a little different now. Do we actually need to complete the checksum before doing the upcall and if so, why? Couldn't the slow path do that if needed? The only reason I can think of where it would matter is if a controller injects the packet into another network stack such as RouteFlow. 
On the receive side, hitting an ovs bridge connected to two interface with ~18G of randomized flows: 3.10.0-rc2+: + 9.44% ksoftirqd/0 [k] csum_partial_copy_generic + 3.54% ovs-vswitchd [k] copy_user_enhanced_fast_string + 3.08% swapper [k] intel_idle + 2.47% ksoftirqd/0 [k] memset + 2.21% ksoftirqd/0 [k] memcpy + 1.64% ksoftirqd/0 [k] __nla_reserve + 1.26% ovs-vswitchd [k] netlink_recvmsg + 1.07% ksoftirqd/0 [k] __netif_receive_skb_core + 1.02% ksoftirqd/0 [k] __pskb_pull_tail + 0.98% swapper [k] csum_partial_copy_generic + 0.95% ovs-vswitchd [k] _raw_spin_lock + 0.94% ovs-vswitchd [k] _raw_spin_lock_irqsave + 0.83% ovs-vswitchd [k] memcpy + 0.80% ksoftirqd/0 [k] __alloc_skb + 0.78% ksoftirqd/0 [k] skb_copy_bits + 0.71% swapper [k] menu_select + 0.71% ksoftirqd/2 [k] csum_partial_copy_generic 3.10.0-rc2+ + zerocopy + 4.84% ovs-vswitchd [k] copy_user_enhanced_fast_string + 4.74% swapper [k] intel_idle + 3.14% swapper [k] memset + 3.04% swapper [k] memcpy + 2.10% swapper [k] __nla_reserve + 1.44% swapper [k] skb_copy_bits + 1.40% swapper [k] __netif_receive_skb_core + 1.34% swapper [k] __pskb_pull_tail + 1.23% ovs-vswitchd [k] _raw_spin_lock + 1.16% ovs-vswitchd [k] fib_table_lookup + 1.16% swapper [k] irq_entries_start + 1.09% ovs-vswitchd [k] memcpy + 1.05% swapper [k] __alloc_skb + 1.03% swapper [k] lapic_next_deadline + 1.02% swapper [k] build_skb + 0.99% swapper [k] skb_zerocopy ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-06-07 12:21 ` Thomas Graf @ 2013-06-10 20:34 ` Jesse Gross 2013-06-10 20:43 ` David Miller 0 siblings, 1 reply; 21+ messages in thread From: Jesse Gross @ 2013-06-10 20:34 UTC (permalink / raw) To: Thomas Graf; +Cc: Eric Dumazet, netdev, dev On Fri, Jun 7, 2013 at 5:21 AM, Thomas Graf <tgraf@suug.ch> wrote: > On 05/27/13 at 10:28am, Jesse Gross wrote: >> On Saturday, May 25, 2013, Eric Dumazet wrote: >> >> > On Sat, 2013-05-25 at 08:02 +0100, Thomas Graf wrote: >> > >> > > I ran TCP_CRR to verify the SYN/ACK use case and I did not >> > > observe a difference. If you have any specific test in mind >> > > I will be glad to run that before posting the 2nd revision. >> > >> > I guess you should test with rx checksum disabled as well, Jesse seemed >> > to be concerned about that. >> >> >> I was actually thinking about the transmit side - rx checksum verification >> doesn't matter much here since the result will get thrown away. However, if >> the packet is CHECKSUM_PARTIAL then the checksum will have to get filled in >> first and that's the code path that is a little different now. > > Do we actually need to complete the checksum before doing the > upcall and if so, why? Couldn't the slow path do that if needed? > The only reason I can think of where it would matter is if a > controller injects the packet into another network stack such > as RouteFlow. Well, this is the slow path. I don't want to force userspace to deal with this because it's an internal kernel optimization that is platform-specific and requires carrying additional metadata around. It's also very common for a packet to traverse another network stack - it's typically a VM's. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-06-10 20:34 ` Jesse Gross @ 2013-06-10 20:43 ` David Miller 2013-06-10 20:55 ` Jesse Gross 0 siblings, 1 reply; 21+ messages in thread From: David Miller @ 2013-06-10 20:43 UTC (permalink / raw) To: jesse; +Cc: tgraf, eric.dumazet, netdev, dev From: Jesse Gross <jesse@nicira.com> Date: Mon, 10 Jun 2013 13:34:23 -0700 > Well, this is the slow path. I don't want to force userspace to deal > with this because it's an internal kernel optimization that is > platform-specific and requires carrying additional metadata around. I think the grounds for this rejection is over the top and unreasonable. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall 2013-06-10 20:43 ` David Miller @ 2013-06-10 20:55 ` Jesse Gross 0 siblings, 0 replies; 21+ messages in thread From: Jesse Gross @ 2013-06-10 20:55 UTC (permalink / raw) To: David Miller; +Cc: Thomas Graf, Eric Dumazet, netdev, dev On Mon, Jun 10, 2013 at 1:43 PM, David Miller <davem@davemloft.net> wrote: > From: Jesse Gross <jesse@nicira.com> > Date: Mon, 10 Jun 2013 13:34:23 -0700 > >> Well, this is the slow path. I don't want to force userspace to deal >> with this because it's an internal kernel optimization that is >> platform-specific and requires carrying additional metadata around. > > I think the grounds for this rejection is over the top and > unreasonable. I didn't reject it and in fact, I expect it to go in. I asked for numbers as to how this affects the common case. ^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH net-next 0/2 v2] Open vSwitch zerocopy upcall @ 2013-07-25 12:43 Thomas Graf 2013-07-25 12:43 ` [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf 0 siblings, 1 reply; 21+ messages in thread From: Thomas Graf @ 2013-07-25 12:43 UTC (permalink / raw) To: davem, jesse; +Cc: netdev, dev, eric.dumazet Respin of the zerocopy patches for the openvswitch upcall. V2: - Added skb_zerocopy_headlen() to calculate headroom of destination buffer. This also takes care of the from->head_frag issue. - Attribute alignment for frag_list case - API documentation - performance data for CHECKSUM_PARTIAL tx case Thomas Graf (2): net: Export skb_zerocopy() to zerocopy from one skb to another openvswitch: Use skb_zerocopy() to prepare skb for upcall include/linux/skbuff.h | 3 ++ net/core/skbuff.c | 85 ++++++++++++++++++++++++++++++++++++ net/netfilter/nfnetlink_queue_core.c | 59 ++----------------------- net/openvswitch/datapath.c | 47 +++++++++++++++++--- 4 files changed, 133 insertions(+), 61 deletions(-) -- 1.8.3.1 ^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another 2013-07-25 12:43 [PATCH net-next 0/2 v2] Open vSwitch zerocopy upcall Thomas Graf @ 2013-07-25 12:43 ` Thomas Graf 0 siblings, 0 replies; 21+ messages in thread From: Thomas Graf @ 2013-07-25 12:43 UTC (permalink / raw) To: davem, jesse; +Cc: netdev, dev, eric.dumazet Make the skb zerocopy logic written for nfnetlink queue available for use by other modules. Signed-off-by: Thomas Graf <tgraf@suug.ch> --- include/linux/skbuff.h | 3 ++ net/core/skbuff.c | 85 ++++++++++++++++++++++++++++++++++++ net/netfilter/nfnetlink_queue_core.c | 59 ++----------------------- 3 files changed, 92 insertions(+), 55 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5afefa0..2f3c64f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2381,6 +2381,9 @@ extern int skb_splice_bits(struct sk_buff *skb, unsigned int len, unsigned int flags); extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); +extern unsigned int skb_zerocopy_headlen(const struct sk_buff *from); +extern void skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, + int len, int hlen); extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 20e02d2..23736ad 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2111,6 +2111,91 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) EXPORT_SYMBOL(skb_copy_and_csum_dev); /** + * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() + * @from: source buffer + * + * Calculates the amount of linear headroom needed in the 'to' skb passed + * into skb_zerocopy(). 
+ */ +unsigned int +skb_zerocopy_headlen(const struct sk_buff *from) +{ + unsigned int hlen = 0; + + if (!from->head_frag || + skb_headlen(from) < L1_CACHE_BYTES || + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + hlen = skb_headlen(from); + + if (skb_has_frag_list(from)) + hlen = from->len; + + return hlen; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); + +/** + * skb_zerocopy - Zero copy skb to skb + * @to: destination buffer + * @from: source buffer + * @len: number of bytes to copy from source buffer + * @hlen: size of linear headroom in destination buffer + * + * Copies up to `len` bytes from `from` to `to` by creating references + * to the frags in the source buffer. + * + * The `hlen` as calculated by skb_zerocopy_headlen() specifies the + * headroom in the `to` buffer. + */ +void +skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) +{ + int i, j = 0; + int plen = 0; /* length of skb->head fragment */ + struct page *page; + unsigned int offset; + + BUG_ON(!from->head_frag && !hlen); + + /* dont bother with small payloads */ + if (len <= skb_tailroom(to)) { + skb_copy_bits(from, 0, skb_put(to, len), len); + return; + } + + if (hlen) { + skb_copy_bits(from, 0, skb_put(to, hlen), hlen); + len -= hlen; + } else { + plen = min_t(int, skb_headlen(from), len); + if (plen) { + page = virt_to_head_page(from->head); + offset = from->data - (unsigned char *)page_address(page); + __skb_fill_page_desc(to, 0, page, offset, plen); + get_page(page); + j = 1; + len -= plen; + } + } + + to->truesize += len + plen; + to->len += len + plen; + to->data_len += len + plen; + + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { + if (!len) + break; + skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; + skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); + len -= skb_shinfo(to)->frags[j].size; + skb_frag_ref(to, j); + j++; + } + skb_shinfo(to)->nr_frags = j; +} +EXPORT_SYMBOL_GPL(skb_zerocopy); + +/** + * skb_dequeue - remove 
from the head of the queue * @list: list to dequeue from * diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 971ea14..ba539a8 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -235,51 +235,6 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) spin_unlock_bh(&queue->lock); } -static void -nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) -{ - int i, j = 0; - int plen = 0; /* length of skb->head fragment */ - struct page *page; - unsigned int offset; - - /* dont bother with small payloads */ - if (len <= skb_tailroom(to)) { - skb_copy_bits(from, 0, skb_put(to, len), len); - return; - } - - if (hlen) { - skb_copy_bits(from, 0, skb_put(to, hlen), hlen); - len -= hlen; - } else { - plen = min_t(int, skb_headlen(from), len); - if (plen) { - page = virt_to_head_page(from->head); - offset = from->data - (unsigned char *)page_address(page); - __skb_fill_page_desc(to, 0, page, offset, plen); - get_page(page); - j = 1; - len -= plen; - } - } - - to->truesize += len + plen; - to->len += len + plen; - to->data_len += len + plen; - - for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { - if (!len) - break; - skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; - skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); - len -= skb_shinfo(to)->frags[j].size; - skb_frag_ref(to, j); - j++; - } - skb_shinfo(to)->nr_frags = j; -} - static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, bool csum_verify) @@ -304,7 +259,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, { size_t size; size_t data_len = 0, cap_len = 0; - int hlen = 0; + unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; @@ -356,14 +311,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (data_len > entskb->len) data_len = entskb->len; - if 
(!entskb->head_frag || - skb_headlen(entskb) < L1_CACHE_BYTES || - skb_shinfo(entskb)->nr_frags >= MAX_SKB_FRAGS) - hlen = skb_headlen(entskb); - - if (skb_has_frag_list(entskb)) - hlen = entskb->len; - hlen = min_t(int, data_len, hlen); + hlen = skb_zerocopy_headlen(entskb); + hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; break; @@ -501,7 +450,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, nla->nla_type = NFQA_PAYLOAD; nla->nla_len = nla_attr_size(data_len); - nfqnl_zcopy(skb, entskb, data_len, hlen); + skb_zerocopy(skb, entskb, data_len, hlen); } nlh->nlmsg_len = skb->len; -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 21+ messages in thread
end of thread, other threads:[~2013-07-25 12:43 UTC | newest] Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2013-05-24 14:52 [PATCH net-next 0/2] Open vSwitch zerocopy upcall Thomas Graf 2013-05-24 14:52 ` [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf 2013-05-24 15:31 ` Eric Dumazet 2013-05-24 16:01 ` Thomas Graf 2013-05-24 14:52 ` [PATCH net-next 2/2] openvswitch: Use zerocopy if applicable when performing the upcall Thomas Graf 2013-05-24 17:24 ` Jesse Gross 2013-05-24 18:58 ` Eric Dumazet 2013-05-24 21:23 ` Jesse Gross 2013-05-24 21:57 ` Eric Dumazet 2013-05-24 22:18 ` Jesse Gross 2013-05-24 22:34 ` Eric Dumazet 2013-05-25 0:08 ` Jesse Gross 2013-05-25 0:32 ` Eric Dumazet 2013-05-25 7:02 ` Thomas Graf 2013-05-25 14:39 ` Eric Dumazet 2013-05-27 1:28 ` Jesse Gross 2013-06-07 12:21 ` Thomas Graf 2013-06-10 20:34 ` Jesse Gross 2013-06-10 20:43 ` David Miller 2013-06-10 20:55 ` Jesse Gross 2013-07-25 12:43 [PATCH net-next 0/2 v2] Open vSwitch zerocopy upcall Thomas Graf 2013-07-25 12:43 ` [PATCH net-next 1/2] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).