All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
@ 2016-08-23  5:20 Steffen Klassert
  2016-08-23 14:47 ` Alexander Duyck
  0 siblings, 1 reply; 16+ messages in thread
From: Steffen Klassert @ 2016-08-23  5:20 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Alexander Duyck

Since commit 8a29111c7 ("net: gro: allow to build full sized skb")
gro may build buffers with a frag_list. This can hurt forwarding
because most NICs can't offload such packets, they need to be
segmented in software. This patch splits buffers with a frag_list
at the frag_list pointer into buffers that can be TSO offloaded.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/core/skbuff.c      | 89 +++++++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv4/af_inet.c     |  7 ++--
 net/ipv4/gre_offload.c |  7 +++-
 net/ipv4/tcp_offload.c |  3 ++
 net/ipv4/udp_offload.c |  9 +++--
 net/ipv6/ip6_offload.c |  6 +++-
 6 files changed, 114 insertions(+), 7 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3864b4b6..a614e9d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3078,6 +3078,92 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 	sg = !!(features & NETIF_F_SG);
 	csum = !!can_checksum_protocol(features, proto);
 
+	headroom = skb_headroom(head_skb);
+
+	if (list_skb && net_gso_ok(features, skb_shinfo(head_skb)->gso_type) &&
+	    csum && sg && (mss != GSO_BY_FRAGS) &&
+	    !(features & NETIF_F_GSO_PARTIAL)) {
+		unsigned int lskb_segs;
+		unsigned int delta_segs, delta_len, delta_truesize;
+		struct sk_buff *nskb;
+		delta_segs = delta_len = delta_truesize = 0;
+
+		segs = __alloc_skb(skb_headlen(head_skb) + headroom,
+				   GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
+				   NUMA_NO_NODE);
+		if (unlikely(!segs))
+			return ERR_PTR(-ENOMEM);
+
+		skb_reserve(segs, headroom);
+		skb_put(segs, skb_headlen(head_skb));
+		skb_copy_from_linear_data(head_skb, segs->data, segs->len);
+		copy_skb_header(segs, head_skb);
+
+		if (skb_shinfo(head_skb)->nr_frags) {
+			int i;
+
+			if (skb_orphan_frags(head_skb, GFP_ATOMIC))
+				goto err;
+
+			for (i = 0; i < skb_shinfo(head_skb)->nr_frags; i++) {
+				skb_shinfo(segs)->frags[i] = skb_shinfo(head_skb)->frags[i];
+				skb_frag_ref(head_skb, i);
+			}
+			skb_shinfo(segs)->nr_frags = i;
+		}
+
+		do {
+			nskb = skb_clone(list_skb, GFP_ATOMIC);
+			if (unlikely(!nskb))
+				goto err;
+
+			list_skb = list_skb->next;
+
+			if (!tail)
+				segs->next = nskb;
+			else
+				tail->next = nskb;
+
+			tail = nskb;
+
+			if (skb_cow_head(nskb, doffset + headroom))
+				goto err;
+
+			lskb_segs = nskb->len / mss;
+
+			skb_shinfo(nskb)->gso_size = mss;
+			skb_shinfo(nskb)->gso_type = skb_shinfo(head_skb)->gso_type;
+			skb_shinfo(nskb)->gso_segs = lskb_segs;
+
+
+			delta_segs += lskb_segs;
+			delta_len += nskb->len;
+			delta_truesize += nskb->truesize;
+
+			__skb_push(nskb, doffset);
+
+			skb_release_head_state(nskb);
+			__copy_skb_header(nskb, head_skb);
+
+			skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
+			skb_reset_mac_len(nskb);
+
+			skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
+							 nskb->data - tnl_hlen,
+							 doffset + tnl_hlen);
+
+		} while (list_skb);
+
+		skb_shinfo(segs)->gso_segs -= delta_segs;
+		segs->len = head_skb->len - delta_len;
+		segs->data_len = head_skb->data_len - delta_len;
+		segs->truesize += head_skb->data_len - delta_truesize;
+
+		segs->prev = tail;
+
+		goto out;
+	}
+
 	/* GSO partial only requires that we trim off any excess that
 	 * doesn't fit into an MSS sized block, so take care of that
 	 * now.
@@ -3090,7 +3176,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 			partial_segs = 0;
 	}
 
-	headroom = skb_headroom(head_skb);
 	pos = skb_headlen(head_skb);
 
 	do {
@@ -3307,6 +3392,8 @@ perform_csum_check:
 		swap(tail->destructor, head_skb->destructor);
 		swap(tail->sk, head_skb->sk);
 	}
+
+out:
 	return segs;
 
 err:
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 55513e6..c814afa 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1195,7 +1195,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 				 netdev_features_t features)
 {
-	bool udpfrag = false, fixedid = false, encap;
+	bool udpfrag = false, fixedid = false, gso_partial = false, encap;
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	const struct net_offload *ops;
 	unsigned int offset = 0;
@@ -1248,6 +1248,9 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	if (IS_ERR_OR_NULL(segs))
 		goto out;
 
+	if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL)
+		gso_partial = true;
+
 	skb = segs;
 	do {
 		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
@@ -1257,7 +1260,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 				iph->frag_off |= htons(IP_MF);
 			offset += skb->len - nhoff - ihl;
 			tot_len = skb->len - nhoff;
-		} else if (skb_is_gso(skb)) {
+		} else if (skb_is_gso(skb) && gso_partial) {
 			if (!fixedid) {
 				iph->id = htons(id);
 				id += skb_shinfo(skb)->gso_segs;
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index ecd1e09..cf82e28 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	__be16 protocol = skb->protocol;
 	u16 mac_len = skb->mac_len;
 	int gre_offset, outer_hlen;
-	bool need_csum, ufo;
+	bool need_csum, ufo, gso_partial;
 
 	if (!skb->encapsulation)
 		goto out;
@@ -69,6 +69,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 		goto out;
 	}
 
+	if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL)
+		gso_partial = true;
+	else
+		gso_partial = false;
+
 	outer_hlen = skb_tnl_header_len(skb);
 	gre_offset = outer_hlen - tnl_hlen;
 	skb = segs;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 5c59649..dddd227 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -107,6 +107,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 
 	/* Only first segment might have ooo_okay set */
 	segs->ooo_okay = ooo_okay;
+	if (skb_is_gso(segs) && !(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL))
+		mss = (skb_tail_pointer(segs) - skb_transport_header(segs)) +
+		       segs->data_len - thlen;
 
 	delta = htonl(oldlen + (thlen + mss));
 
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 81f253b..dfb6a2c 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	__be16 new_protocol, bool is_ipv6)
 {
 	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
-	bool remcsum, need_csum, offload_csum, ufo;
+	bool remcsum, need_csum, offload_csum, ufo, gso_partial;
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	struct udphdr *uh = udp_hdr(skb);
 	u16 mac_offset = skb->mac_header;
@@ -88,6 +88,11 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		goto out;
 	}
 
+	if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL)
+		gso_partial = true;
+	else
+		gso_partial = false;
+
 	outer_hlen = skb_tnl_header_len(skb);
 	udp_offset = outer_hlen - tnl_hlen;
 	skb = segs;
@@ -117,7 +122,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		 * will be using a length value equal to only one MSS sized
 		 * segment instead of the entire frame.
 		 */
-		if (skb_is_gso(skb)) {
+		if (skb_is_gso(skb) && gso_partial) {
 			uh->len = htons(skb_shinfo(skb)->gso_size +
 					SKB_GSO_CB(skb)->data_offset +
 					skb->head - (unsigned char *)uh);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 22e90e5..0ec16ba 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	int offset = 0;
 	bool encap, udpfrag;
 	int nhoff;
+	bool gso_partial = false;
 
 	skb_reset_network_header(skb);
 	nhoff = skb_network_header(skb) - skb_mac_header(skb);
@@ -101,9 +102,12 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	if (IS_ERR(segs))
 		goto out;
 
+	if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL)
+		gso_partial = true;
+
 	for (skb = segs; skb; skb = skb->next) {
 		ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
-		if (skb_is_gso(skb))
+		if (skb_is_gso(skb) && gso_partial)
 			payload_len = skb_shinfo(skb)->gso_size +
 				      SKB_GSO_CB(skb)->data_offset +
 				      skb->head - (unsigned char *)(ipv6h + 1);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-23  5:20 [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer Steffen Klassert
@ 2016-08-23 14:47 ` Alexander Duyck
  2016-08-24  9:32   ` Steffen Klassert
  0 siblings, 1 reply; 16+ messages in thread
From: Alexander Duyck @ 2016-08-23 14:47 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Netdev, Eric Dumazet, Alexander Duyck

On Mon, Aug 22, 2016 at 10:20 PM, Steffen Klassert
<steffen.klassert@secunet.com> wrote:
> Since commit 8a29111c7 ("net: gro: allow to build full sized skb")
> gro may build buffers with a frag_list. This can hurt forwarding
> because most NICs can't offload such packets, they need to be
> segmented in software. This patch splits buffers with a frag_list
> at the frag_list pointer into buffers that can be TSO offloaded.
>
> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
> ---
>  net/core/skbuff.c      | 89 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  net/ipv4/af_inet.c     |  7 ++--
>  net/ipv4/gre_offload.c |  7 +++-
>  net/ipv4/tcp_offload.c |  3 ++
>  net/ipv4/udp_offload.c |  9 +++--
>  net/ipv6/ip6_offload.c |  6 +++-
>  6 files changed, 114 insertions(+), 7 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 3864b4b6..a614e9d 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -3078,6 +3078,92 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>         sg = !!(features & NETIF_F_SG);
>         csum = !!can_checksum_protocol(features, proto);
>
> +       headroom = skb_headroom(head_skb);
> +
> +       if (list_skb && net_gso_ok(features, skb_shinfo(head_skb)->gso_type) &&
> +           csum && sg && (mss != GSO_BY_FRAGS) &&
> +           !(features & NETIF_F_GSO_PARTIAL)) {

Does this really need to be mutually exclusive with
NETIF_F_GSO_PARTIAL and GSO_BY_FRAGS?  This is occurring early enough
that maybe instead of doubling the size of skb_segment you should look
at instead adding a new static function that could handle splitting
the frag_list and just call that instead of adding this massive amount
of code.

Some of these checks are more expensive than others.  I would
recommend doing the sg && csum && !(features & NETIF_F_GSO_PARTIAL)
checks first.  If possible you could even combine some of the checks
since they are also in the block that sets up partial_segs.  That way
we can cut down on the total number of conditional branches needed.

> +               unsigned int lskb_segs;
> +               unsigned int delta_segs, delta_len, delta_truesize;
> +               struct sk_buff *nskb;
> +               delta_segs = delta_len = delta_truesize = 0;
> +
> +               segs = __alloc_skb(skb_headlen(head_skb) + headroom,
> +                                  GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
> +                                  NUMA_NO_NODE);
> +               if (unlikely(!segs))
> +                       return ERR_PTR(-ENOMEM);
> +
> +               skb_reserve(segs, headroom);
> +               skb_put(segs, skb_headlen(head_skb));
> +               skb_copy_from_linear_data(head_skb, segs->data, segs->len);
> +               copy_skb_header(segs, head_skb);
> +
> +               if (skb_shinfo(head_skb)->nr_frags) {
> +                       int i;
> +
> +                       if (skb_orphan_frags(head_skb, GFP_ATOMIC))
> +                               goto err;
> +
> +                       for (i = 0; i < skb_shinfo(head_skb)->nr_frags; i++) {
> +                               skb_shinfo(segs)->frags[i] = skb_shinfo(head_skb)->frags[i];
> +                               skb_frag_ref(head_skb, i);
> +                       }
> +                       skb_shinfo(segs)->nr_frags = i;
> +               }
> +
> +               do {
> +                       nskb = skb_clone(list_skb, GFP_ATOMIC);
> +                       if (unlikely(!nskb))
> +                               goto err;
> +
> +                       list_skb = list_skb->next;
> +
> +                       if (!tail)
> +                               segs->next = nskb;
> +                       else
> +                               tail->next = nskb;
> +
> +                       tail = nskb;
> +
> +                       if (skb_cow_head(nskb, doffset + headroom))
> +                               goto err;
> +
> +                       lskb_segs = nskb->len / mss;
> +
> +                       skb_shinfo(nskb)->gso_size = mss;
> +                       skb_shinfo(nskb)->gso_type = skb_shinfo(head_skb)->gso_type;
> +                       skb_shinfo(nskb)->gso_segs = lskb_segs;
> +
> +
> +                       delta_segs += lskb_segs;
> +                       delta_len += nskb->len;
> +                       delta_truesize += nskb->truesize;
> +
> +                       __skb_push(nskb, doffset);
> +
> +                       skb_release_head_state(nskb);
> +                       __copy_skb_header(nskb, head_skb);
> +
> +                       skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
> +                       skb_reset_mac_len(nskb);
> +
> +                       skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
> +                                                        nskb->data - tnl_hlen,
> +                                                        doffset + tnl_hlen);
> +
> +               } while (list_skb);
> +
> +               skb_shinfo(segs)->gso_segs -= delta_segs;
> +               segs->len = head_skb->len - delta_len;
> +               segs->data_len = head_skb->data_len - delta_len;
> +               segs->truesize += head_skb->data_len - delta_truesize;
> +
> +               segs->prev = tail;
> +
> +               goto out;
> +       }
> +
>         /* GSO partial only requires that we trim off any excess that
>          * doesn't fit into an MSS sized block, so take care of that
>          * now.
> @@ -3090,7 +3176,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>                         partial_segs = 0;
>         }
>
> -       headroom = skb_headroom(head_skb);
>         pos = skb_headlen(head_skb);
>
>         do {
> @@ -3307,6 +3392,8 @@ perform_csum_check:
>                 swap(tail->destructor, head_skb->destructor);
>                 swap(tail->sk, head_skb->sk);
>         }
> +
> +out:
>         return segs;
>
>  err:
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index 55513e6..c814afa 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1195,7 +1195,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
>  struct sk_buff *inet_gso_segment(struct sk_buff *skb,
>                                  netdev_features_t features)
>  {
> -       bool udpfrag = false, fixedid = false, encap;
> +       bool udpfrag = false, fixedid = false, gso_partial = false, encap;
>         struct sk_buff *segs = ERR_PTR(-EINVAL);
>         const struct net_offload *ops;
>         unsigned int offset = 0;
> @@ -1248,6 +1248,9 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
>         if (IS_ERR_OR_NULL(segs))
>                 goto out;
>
> +       if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL)
> +               gso_partial = true;
> +

For these kind of blocks it is usually best to just do:
    gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);

The compiler usually does a better job of just doing a bit of
arithmetic instead of generating a set of test/jump type instructions
and generally that runs faster since there is less branching.  The
same applies to all the other cases where you setup gso_partial this
way.

>         skb = segs;
>         do {
>                 iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
> @@ -1257,7 +1260,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
>                                 iph->frag_off |= htons(IP_MF);
>                         offset += skb->len - nhoff - ihl;
>                         tot_len = skb->len - nhoff;
> -               } else if (skb_is_gso(skb)) {
> +               } else if (skb_is_gso(skb) && gso_partial) {
>                         if (!fixedid) {
>                                 iph->id = htons(id);
>                                 id += skb_shinfo(skb)->gso_segs;
> diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
> index ecd1e09..cf82e28 100644
> --- a/net/ipv4/gre_offload.c
> +++ b/net/ipv4/gre_offload.c
> @@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
>         __be16 protocol = skb->protocol;
>         u16 mac_len = skb->mac_len;
>         int gre_offset, outer_hlen;
> -       bool need_csum, ufo;
> +       bool need_csum, ufo, gso_partial;
>
>         if (!skb->encapsulation)
>                 goto out;
> @@ -69,6 +69,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
>                 goto out;
>         }
>
> +       if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL)
> +               gso_partial = true;
> +       else
> +               gso_partial = false;
> +
>         outer_hlen = skb_tnl_header_len(skb);
>         gre_offset = outer_hlen - tnl_hlen;
>         skb = segs;
> diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
> index 5c59649..dddd227 100644
> --- a/net/ipv4/tcp_offload.c
> +++ b/net/ipv4/tcp_offload.c
> @@ -107,6 +107,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
>
>         /* Only first segment might have ooo_okay set */
>         segs->ooo_okay = ooo_okay;
> +       if (skb_is_gso(segs) && !(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL))
> +               mss = (skb_tail_pointer(segs) - skb_transport_header(segs)) +
> +                      segs->data_len - thlen;
>
>         delta = htonl(oldlen + (thlen + mss));
>
> diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
> index 81f253b..dfb6a2c 100644
> --- a/net/ipv4/udp_offload.c
> +++ b/net/ipv4/udp_offload.c
> @@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
>         __be16 new_protocol, bool is_ipv6)
>  {
>         int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
> -       bool remcsum, need_csum, offload_csum, ufo;
> +       bool remcsum, need_csum, offload_csum, ufo, gso_partial;
>         struct sk_buff *segs = ERR_PTR(-EINVAL);
>         struct udphdr *uh = udp_hdr(skb);
>         u16 mac_offset = skb->mac_header;
> @@ -88,6 +88,11 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
>                 goto out;
>         }
>
> +       if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL)
> +               gso_partial = true;
> +       else
> +               gso_partial = false;
> +
>         outer_hlen = skb_tnl_header_len(skb);
>         udp_offset = outer_hlen - tnl_hlen;
>         skb = segs;
> @@ -117,7 +122,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
>                  * will be using a length value equal to only one MSS sized
>                  * segment instead of the entire frame.
>                  */
> -               if (skb_is_gso(skb)) {
> +               if (skb_is_gso(skb) && gso_partial) {
>                         uh->len = htons(skb_shinfo(skb)->gso_size +
>                                         SKB_GSO_CB(skb)->data_offset +
>                                         skb->head - (unsigned char *)uh);
> diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
> index 22e90e5..0ec16ba 100644
> --- a/net/ipv6/ip6_offload.c
> +++ b/net/ipv6/ip6_offload.c
> @@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
>         int offset = 0;
>         bool encap, udpfrag;
>         int nhoff;
> +       bool gso_partial = false;
>
>         skb_reset_network_header(skb);
>         nhoff = skb_network_header(skb) - skb_mac_header(skb);
> @@ -101,9 +102,12 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
>         if (IS_ERR(segs))
>                 goto out;
>
> +       if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL)
> +               gso_partial = true;
> +
>         for (skb = segs; skb; skb = skb->next) {
>                 ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
> -               if (skb_is_gso(skb))
> +               if (skb_is_gso(skb) && gso_partial)
>                         payload_len = skb_shinfo(skb)->gso_size +
>                                       SKB_GSO_CB(skb)->data_offset +
>                                       skb->head - (unsigned char *)(ipv6h + 1);
> --
> 1.9.1
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-23 14:47 ` Alexander Duyck
@ 2016-08-24  9:32   ` Steffen Klassert
  2016-08-24 16:27     ` Alexander Duyck
  0 siblings, 1 reply; 16+ messages in thread
From: Steffen Klassert @ 2016-08-24  9:32 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Netdev, Eric Dumazet, Alexander Duyck

On Tue, Aug 23, 2016 at 07:47:32AM -0700, Alexander Duyck wrote:
> On Mon, Aug 22, 2016 at 10:20 PM, Steffen Klassert
> <steffen.klassert@secunet.com> wrote:
> > Since commit 8a29111c7 ("net: gro: allow to build full sized skb")
> > gro may build buffers with a frag_list. This can hurt forwarding
> > because most NICs can't offload such packets, they need to be
> > segmented in software. This patch splits buffers with a frag_list
> > at the frag_list pointer into buffers that can be TSO offloaded.
> >
> > Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
> > ---
> >  net/core/skbuff.c      | 89 +++++++++++++++++++++++++++++++++++++++++++++++++-
> >  net/ipv4/af_inet.c     |  7 ++--
> >  net/ipv4/gre_offload.c |  7 +++-
> >  net/ipv4/tcp_offload.c |  3 ++
> >  net/ipv4/udp_offload.c |  9 +++--
> >  net/ipv6/ip6_offload.c |  6 +++-
> >  6 files changed, 114 insertions(+), 7 deletions(-)
> >
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 3864b4b6..a614e9d 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -3078,6 +3078,92 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
> >         sg = !!(features & NETIF_F_SG);
> >         csum = !!can_checksum_protocol(features, proto);
> >
> > +       headroom = skb_headroom(head_skb);
> > +
> > +       if (list_skb && net_gso_ok(features, skb_shinfo(head_skb)->gso_type) &&
> > +           csum && sg && (mss != GSO_BY_FRAGS) &&
> > +           !(features & NETIF_F_GSO_PARTIAL)) {
> 
> Does this really need to be mutually exclusive with
> NETIF_F_GSO_PARTIAL and GSO_BY_FRAGS? 

It should be possible to extend this to NETIF_F_GSO_PARTIAL but
I have no test for this. Regarding GSO_BY_FRAGS, this is rather
new and just used for sctp. I don't know what sctp does with
GSO_BY_FRAGS.

> This is occurring early enough
> that maybe instead of doubling the size of skb_segment you should look
> at instead adding a new static function that could handle splitting
> the frag_list and just call that instead of adding this massive amount
> of code.

Ok, will do that.

> 
> Some of these checks are more expensive than others.  I would
> recommend doing the sg && csum && !(features & NETIF_F_GSO_PARTIAL)
> checks first.  If possible you could even combine some of the checks
> since they are also in the block that sets up partial_segs.  That way
> we can cut down on the total number of conditional branches needed.

We can combine the sg && csum check in the block that sets up partial_segs.
In case this is not NETIF_F_GSO_PARTIAL, I'll do the list_skb and
net_gso_ok() check and call the new static function then.

> >
> > +       if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL)
> > +               gso_partial = true;
> > +
> 
> For these kind of blocks it is usually best to just do:
>     gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
> 
> The compiler usually does a better job of just doing a bit of
> arithmetic instead of generating a set of test/jump type instructions
> and generally that runs faster since there is less branching.  The
> same applies to all the other cases where you setup gso_partial this
> way.

Good point, I'll change this.

Thanks for the review!

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-24  9:32   ` Steffen Klassert
@ 2016-08-24 16:27     ` Alexander Duyck
  2016-08-24 17:25       ` Marcelo Ricardo Leitner
  2016-08-25 11:00       ` Steffen Klassert
  0 siblings, 2 replies; 16+ messages in thread
From: Alexander Duyck @ 2016-08-24 16:27 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Netdev, Eric Dumazet, Alexander Duyck, Marcelo Ricardo Leitner

On Wed, Aug 24, 2016 at 2:32 AM, Steffen Klassert
<steffen.klassert@secunet.com> wrote:
> On Tue, Aug 23, 2016 at 07:47:32AM -0700, Alexander Duyck wrote:
>> On Mon, Aug 22, 2016 at 10:20 PM, Steffen Klassert
>> <steffen.klassert@secunet.com> wrote:
>> > Since commit 8a29111c7 ("net: gro: allow to build full sized skb")
>> > gro may build buffers with a frag_list. This can hurt forwarding
>> > because most NICs can't offload such packets, they need to be
>> > segmented in software. This patch splits buffers with a frag_list
>> > at the frag_list pointer into buffers that can be TSO offloaded.
>> >
>> > Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
>> > ---
>> >  net/core/skbuff.c      | 89 +++++++++++++++++++++++++++++++++++++++++++++++++-
>> >  net/ipv4/af_inet.c     |  7 ++--
>> >  net/ipv4/gre_offload.c |  7 +++-
>> >  net/ipv4/tcp_offload.c |  3 ++
>> >  net/ipv4/udp_offload.c |  9 +++--
>> >  net/ipv6/ip6_offload.c |  6 +++-
>> >  6 files changed, 114 insertions(+), 7 deletions(-)
>> >
>> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> > index 3864b4b6..a614e9d 100644
>> > --- a/net/core/skbuff.c
>> > +++ b/net/core/skbuff.c
>> > @@ -3078,6 +3078,92 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>> >         sg = !!(features & NETIF_F_SG);
>> >         csum = !!can_checksum_protocol(features, proto);
>> >
>> > +       headroom = skb_headroom(head_skb);
>> > +
>> > +       if (list_skb && net_gso_ok(features, skb_shinfo(head_skb)->gso_type) &&
>> > +           csum && sg && (mss != GSO_BY_FRAGS) &&
>> > +           !(features & NETIF_F_GSO_PARTIAL)) {
>>
>> Does this really need to be mutually exclusive with
>> NETIF_F_GSO_PARTIAL and GSO_BY_FRAGS?
>
> It should be possible to extend this to NETIF_F_GSO_PARTIAL but
> I have no test for this. Regarding GSO_BY_FRAGS, this is rather
> new and just used for sctp. I don't know what sctp does with
> GSO_BY_FRAGS.

I'm adding Marcelo as he could probably explain the GSO_BY_FRAGS
functionality better than I could since he is the original author.

If I recall GSO_BY_FRAGS does something similar to what you are doing,
although I believe it doesn't carry any data in the first buffer other
than just a header.  I believe the idea behind GSO_BY_FRAGS was to
allow for segmenting a frame at the frag_list level instead of having
it done just based on MSS.  That was the only reason why I brought it
up.

In you case though we maybe be able to make this easier.  If I am not
mistaken I believe we should have the main skb, and any in the chain
excluding the last containing the same amount of data.  That being the
case we should be able to determine the size that you would need to
segment at by taking skb->len, and removing the length of all the
skbuffs hanging off of frag_list.  At that point you just use that as
your MSS for segmentation and it should break things up so that you
have a series of equal sized segments split as the frag_list buffer
boundaries.

After that all that is left is to update the gso info for the buffers.
For GSO_PARTIAL I was handling that on the first segment only.  For
this change you would need to update that code to address the fact
that you would have to determine the number of segments on the first
frame and the last since the last could be less than the first, but
all of the others in-between should have the same number of segments.

- Alex

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-24 16:27     ` Alexander Duyck
@ 2016-08-24 17:25       ` Marcelo Ricardo Leitner
  2016-08-25  7:31         ` Steffen Klassert
  2016-08-25 11:00       ` Steffen Klassert
  1 sibling, 1 reply; 16+ messages in thread
From: Marcelo Ricardo Leitner @ 2016-08-24 17:25 UTC (permalink / raw)
  To: Alexander Duyck, Steffen Klassert; +Cc: Netdev, Eric Dumazet, Alexander Duyck

Em 24-08-2016 13:27, Alexander Duyck escreveu:
> On Wed, Aug 24, 2016 at 2:32 AM, Steffen Klassert
> <steffen.klassert@secunet.com> wrote:
>> On Tue, Aug 23, 2016 at 07:47:32AM -0700, Alexander Duyck wrote:
>>> On Mon, Aug 22, 2016 at 10:20 PM, Steffen Klassert
>>> <steffen.klassert@secunet.com> wrote:
>>>> Since commit 8a29111c7 ("net: gro: allow to build full sized skb")
>>>> gro may build buffers with a frag_list. This can hurt forwarding
>>>> because most NICs can't offload such packets, they need to be
>>>> segmented in software. This patch splits buffers with a frag_list
>>>> at the frag_list pointer into buffers that can be TSO offloaded.
>>>>
>>>> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
>>>> ---
>>>>  net/core/skbuff.c      | 89 +++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>  net/ipv4/af_inet.c     |  7 ++--
>>>>  net/ipv4/gre_offload.c |  7 +++-
>>>>  net/ipv4/tcp_offload.c |  3 ++
>>>>  net/ipv4/udp_offload.c |  9 +++--
>>>>  net/ipv6/ip6_offload.c |  6 +++-
>>>>  6 files changed, 114 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>> index 3864b4b6..a614e9d 100644
>>>> --- a/net/core/skbuff.c
>>>> +++ b/net/core/skbuff.c
>>>> @@ -3078,6 +3078,92 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>>>>         sg = !!(features & NETIF_F_SG);
>>>>         csum = !!can_checksum_protocol(features, proto);
>>>>
>>>> +       headroom = skb_headroom(head_skb);
>>>> +
>>>> +       if (list_skb && net_gso_ok(features, skb_shinfo(head_skb)->gso_type) &&
>>>> +           csum && sg && (mss != GSO_BY_FRAGS) &&
>>>> +           !(features & NETIF_F_GSO_PARTIAL)) {
>>>
>>> Does this really need to be mutually exclusive with
>>> NETIF_F_GSO_PARTIAL and GSO_BY_FRAGS?
>>
>> It should be possible to extend this to NETIF_F_GSO_PARTIAL but
>> I have no test for this. Regarding GSO_BY_FRAGS, this is rather
>> new and just used for sctp. I don't know what sctp does with
>> GSO_BY_FRAGS.
>
> I'm adding Marcelo as he could probably explain the GSO_BY_FRAGS
> functionality better than I could since he is the original author.
>
> If I recall GSO_BY_FRAGS does something similar to what you are doing,
> although I believe it doesn't carry any data in the first buffer other
> than just a header.  I believe the idea behind GSO_BY_FRAGS was to
> allow for segmenting a frame at the frag_list level instead of having
> it done just based on MSS.  That was the only reason why I brought it
> up.
>

That's exactly it.

On this no data in the first buffer limitation, we probably can allow it 
have some data in there. It was done this way just because sctp is using 
skb_gro_receive() to build such skb and this was the way I found to get 
such frag_list skb generated by it, thus preserving frame boundaries.

For using GSO_BY_FRAGS in gso_size, it's how skb_is_gso() returns true, 
but it's similar to the SKB_GSO_PARTIAL rationale in here. We can make 
sctp also flag it as SKB_GSO_PARTIAL if needed I guess, in case you need 
to maintain gso_size value.

   Marcelo

> In you case though we maybe be able to make this easier.  If I am not
> mistaken I believe we should have the main skb, and any in the chain
> excluding the last containing the same amount of data.  That being the
> case we should be able to determine the size that you would need to
> segment at by taking skb->len, and removing the length of all the
> skbuffs hanging off of frag_list.  At that point you just use that as
> your MSS for segmentation and it should break things up so that you
> have a series of equal sized segments split as the frag_list buffer
> boundaries.
>
> After that all that is left is to update the gso info for the buffers.
> For GSO_PARTIAL I was handling that on the first segment only.  For
> this change you would need to update that code to address the fact
> that you would have to determine the number of segments on the first
> frame and the last since the last could be less than the first, but
> all of the others in-between should have the same number of segments.
>
> - Alex
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-24 17:25       ` Marcelo Ricardo Leitner
@ 2016-08-25  7:31         ` Steffen Klassert
  2016-08-25 12:17           ` Marcelo Ricardo Leitner
  0 siblings, 1 reply; 16+ messages in thread
From: Steffen Klassert @ 2016-08-25  7:31 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: Alexander Duyck, Netdev, Eric Dumazet, Alexander Duyck

On Wed, Aug 24, 2016 at 02:25:29PM -0300, Marcelo Ricardo Leitner wrote:
> Em 24-08-2016 13:27, Alexander Duyck escreveu:
> >
> >I'm adding Marcelo as he could probably explain the GSO_BY_FRAGS
> >functionality better than I could since he is the original author.
> >
> >If I recall GSO_BY_FRAGS does something similar to what you are doing,
> >although I believe it doesn't carry any data in the first buffer other
> >than just a header.  I believe the idea behind GSO_BY_FRAGS was to
> >allow for segmenting a frame at the frag_list level instead of having
> >it done just based on MSS.  That was the only reason why I brought it
> >up.
> >
> 
> That's exactly it.
> 
> On this no data in the first buffer limitation, we probably can
> allow it have some data in there. It was done this way just because
> sctp is using skb_gro_receive() to build such skb and this was the
> way I found to get such frag_list skb generated by it, thus
> preserving frame boundaries.

Just to understand what you are doing. You generate MTU sized linear
buffers in sctp and then, skb_gro_receive() chains up these buffers
at the frag_list pointer. skb_gro_receive() does this because
skb_gro_offset is null and skb->head_frag is not set in your case.

At segmentation, you just need to split at the frag_list pointer
because you know that the chained buffers fit the MTU, right?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-24 16:27     ` Alexander Duyck
  2016-08-24 17:25       ` Marcelo Ricardo Leitner
@ 2016-08-25 11:00       ` Steffen Klassert
  2016-08-25 12:38         ` Marcelo Ricardo Leitner
                           ` (3 more replies)
  1 sibling, 4 replies; 16+ messages in thread
From: Steffen Klassert @ 2016-08-25 11:00 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Netdev, Eric Dumazet, Alexander Duyck, Marcelo Ricardo Leitner

On Wed, Aug 24, 2016 at 09:27:54AM -0700, Alexander Duyck wrote:
> 
> In you case though we maybe be able to make this easier.  If I am not
> mistaken I believe we should have the main skb, and any in the chain
> excluding the last containing the same amount of data. 

Yes, it seems to be like that. With this observation we can spmplify
things. 

> That being the
> case we should be able to determine the size that you would need to
> segment at by taking skb->len, and removing the length of all the
> skbuffs hanging off of frag_list.  At that point you just use that as
> your MSS for segmentation and it should break things up so that you
> have a series of equal sized segments split as the frag_list buffer
> boundaries.
> 
> After that all that is left is to update the gso info for the buffers.
> For GSO_PARTIAL I was handling that on the first segment only.  For
> this change you would need to update that code to address the fact
> that you would have to determine the number of segments on the first
> frame and the last since the last could be less than the first, but
> all of the others in-between should have the same number of segments.

I tried to do this and ended up with the patch below.
Seems to work, but sill needs some tests. So it is
not an official patch submission.

Subject: [PATCH net-next RFC] gso: Support partial splitting at the frag_list pointer

Since commit 8a29111c7 ("net: gro: allow to build full sized skb")
gro may build buffers with a frag_list. This can hurt forwarding
because most NICs can't offload such packets, they need to be
segmented in software. This patch splits buffers with a frag_list
at the frag_list pointer into buffers that can be TSO offloaded.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/core/skbuff.c      | 46 ++++++++++++++++++++++++++++++++++++----------
 net/ipv4/af_inet.c     |  6 ++++--
 net/ipv4/gre_offload.c |  4 +++-
 net/ipv4/tcp_offload.c |  3 +++
 net/ipv4/udp_offload.c |  6 ++++--
 net/ipv6/ip6_offload.c |  5 ++++-
 6 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3864b4b6..cb326e5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3060,6 +3060,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 	unsigned int offset = doffset;
 	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
 	unsigned int partial_segs = 0;
+	unsigned int fraglist_segs = 0;
 	unsigned int headroom;
 	unsigned int len = head_skb->len;
 	__be16 proto;
@@ -3078,16 +3079,27 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 	sg = !!(features & NETIF_F_SG);
 	csum = !!can_checksum_protocol(features, proto);
 
-	/* GSO partial only requires that we trim off any excess that
-	 * doesn't fit into an MSS sized block, so take care of that
-	 * now.
-	 */
-	if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) {
-		partial_segs = len / mss;
-		if (partial_segs > 1)
-			mss *= partial_segs;
-		else
-			partial_segs = 0;
+	if (sg && csum) {
+		/* GSO partial only requires that we trim off any excess that
+		 * doesn't fit into an MSS sized block, so take care of that
+		 * now.
+		 */
+		if ((features & NETIF_F_GSO_PARTIAL)) {
+			partial_segs = len / mss;
+			if (partial_segs > 1)
+				mss *= partial_segs;
+			else
+				partial_segs = 0;
+		} else if (list_skb && (mss != GSO_BY_FRAGS) &&
+			   net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) {
+
+			skb_walk_frags(head_skb, segs) {
+				len -= segs->len;
+			}
+			fraglist_segs = len / mss;
+			mss = len;
+			segs = NULL;
+		}
 	}
 
 	headroom = skb_headroom(head_skb);
@@ -3298,6 +3310,20 @@ perform_csum_check:
 		SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
 	}
 
+	if (fraglist_segs) {
+		struct sk_buff *iter;
+
+		for (iter = segs; iter; iter = iter->next) {
+			if (iter->next) {
+				skb_shinfo(iter)->gso_size = skb_shinfo(head_skb)->gso_size;
+				skb_shinfo(iter)->gso_segs = fraglist_segs;
+			} else {
+				skb_shinfo(iter)->gso_size = skb_shinfo(head_skb)->gso_size;
+				skb_shinfo(iter)->gso_segs = iter->len / skb_shinfo(head_skb)->gso_size;
+			}
+		}
+	}
+
 	/* Following permits correct backpressure, for protocols
 	 * using skb_set_owner_w().
 	 * Idea is to tranfert ownership from head_skb to last segment.
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 989a362..ac46233 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1190,7 +1190,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 				 netdev_features_t features)
 {
-	bool udpfrag = false, fixedid = false, encap;
+	bool udpfrag = false, fixedid = false, gso_partial, encap;
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	const struct net_offload *ops;
 	unsigned int offset = 0;
@@ -1243,6 +1243,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	if (IS_ERR_OR_NULL(segs))
 		goto out;
 
+	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
 	skb = segs;
 	do {
 		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
@@ -1252,7 +1254,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 				iph->frag_off |= htons(IP_MF);
 			offset += skb->len - nhoff - ihl;
 			tot_len = skb->len - nhoff;
-		} else if (skb_is_gso(skb)) {
+		} else if (skb_is_gso(skb) && gso_partial) {
 			if (!fixedid) {
 				iph->id = htons(id);
 				id += skb_shinfo(skb)->gso_segs;
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index ecd1e09..7c56785 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	__be16 protocol = skb->protocol;
 	u16 mac_len = skb->mac_len;
 	int gre_offset, outer_hlen;
-	bool need_csum, ufo;
+	bool need_csum, ufo, gso_partial;
 
 	if (!skb->encapsulation)
 		goto out;
@@ -69,6 +69,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 		goto out;
 	}
 
+	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
 	outer_hlen = skb_tnl_header_len(skb);
 	gre_offset = outer_hlen - tnl_hlen;
 	skb = segs;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 5c59649..dddd227 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -107,6 +107,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 
 	/* Only first segment might have ooo_okay set */
 	segs->ooo_okay = ooo_okay;
+	if (skb_is_gso(segs) && !(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL))
+		mss = (skb_tail_pointer(segs) - skb_transport_header(segs)) +
+		       segs->data_len - thlen;
 
 	delta = htonl(oldlen + (thlen + mss));
 
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 81f253b..0b909f9 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	__be16 new_protocol, bool is_ipv6)
 {
 	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
-	bool remcsum, need_csum, offload_csum, ufo;
+	bool remcsum, need_csum, offload_csum, ufo, gso_partial;
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	struct udphdr *uh = udp_hdr(skb);
 	u16 mac_offset = skb->mac_header;
@@ -88,6 +88,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		goto out;
 	}
 
+	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
 	outer_hlen = skb_tnl_header_len(skb);
 	udp_offset = outer_hlen - tnl_hlen;
 	skb = segs;
@@ -117,7 +119,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		 * will be using a length value equal to only one MSS sized
 		 * segment instead of the entire frame.
 		 */
-		if (skb_is_gso(skb)) {
+		if (skb_is_gso(skb) && gso_partial) {
 			uh->len = htons(skb_shinfo(skb)->gso_size +
 					SKB_GSO_CB(skb)->data_offset +
 					skb->head - (unsigned char *)uh);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 22e90e5..73dd0d1 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	int offset = 0;
 	bool encap, udpfrag;
 	int nhoff;
+	bool gso_partial;
 
 	skb_reset_network_header(skb);
 	nhoff = skb_network_header(skb) - skb_mac_header(skb);
@@ -101,9 +102,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	if (IS_ERR(segs))
 		goto out;
 
+	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
 	for (skb = segs; skb; skb = skb->next) {
 		ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
-		if (skb_is_gso(skb))
+		if (skb_is_gso(skb) && gso_partial)
 			payload_len = skb_shinfo(skb)->gso_size +
 				      SKB_GSO_CB(skb)->data_offset +
 				      skb->head - (unsigned char *)(ipv6h + 1);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-25  7:31         ` Steffen Klassert
@ 2016-08-25 12:17           ` Marcelo Ricardo Leitner
  2016-08-29 12:02             ` Steffen Klassert
  0 siblings, 1 reply; 16+ messages in thread
From: Marcelo Ricardo Leitner @ 2016-08-25 12:17 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Alexander Duyck, Netdev, Eric Dumazet, Alexander Duyck

On Thu, Aug 25, 2016 at 09:31:26AM +0200, Steffen Klassert wrote:
> On Wed, Aug 24, 2016 at 02:25:29PM -0300, Marcelo Ricardo Leitner wrote:
> > Em 24-08-2016 13:27, Alexander Duyck escreveu:
> > >
> > >I'm adding Marcelo as he could probably explain the GSO_BY_FRAGS
> > >functionality better than I could since he is the original author.
> > >
> > >If I recall GSO_BY_FRAGS does something similar to what you are doing,
> > >although I believe it doesn't carry any data in the first buffer other
> > >than just a header.  I believe the idea behind GSO_BY_FRAGS was to
> > >allow for segmenting a frame at the frag_list level instead of having
> > >it done just based on MSS.  That was the only reason why I brought it
> > >up.
> > >
> > 
> > That's exactly it.
> > 
> > On this no data in the first buffer limitation, we probably can
> > allow it have some data in there. It was done this way just because
> > sctp is using skb_gro_receive() to build such skb and this was the
> > way I found to get such frag_list skb generated by it, thus
> > preserving frame boundaries.
> 
> Just to understand what you are doing. You generate MTU sized linear
> buffers in sctp and then, skb_gro_receive() chains up these buffers
> at the frag_list pointer. skb_gro_receive() does this because
> skb_gro_offset is null and skb->head_frag is not set in your case.
> 
> At segmentation, you just need to split at the frag_list pointer
> because you know that the chained buffers fit the MTU, right?
> 

Correct. Just note that these buffers fit the MTU, but not necessary
uses all of it. That is main point in here, variable segmentation size.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-25 11:00       ` Steffen Klassert
@ 2016-08-25 12:38         ` Marcelo Ricardo Leitner
  2016-08-25 16:02         ` Alexander Duyck
                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 16+ messages in thread
From: Marcelo Ricardo Leitner @ 2016-08-25 12:38 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Alexander Duyck, Netdev, Eric Dumazet, Alexander Duyck

On Thu, Aug 25, 2016 at 01:00:19PM +0200, Steffen Klassert wrote:
> On Wed, Aug 24, 2016 at 09:27:54AM -0700, Alexander Duyck wrote:
> > 
> > In you case though we maybe be able to make this easier.  If I am not
> > mistaken I believe we should have the main skb, and any in the chain
> > excluding the last containing the same amount of data. 
> 
> Yes, it seems to be like that. With this observation we can spmplify
> things. 
> 
> > That being the
> > case we should be able to determine the size that you would need to
> > segment at by taking skb->len, and removing the length of all the
> > skbuffs hanging off of frag_list.  At that point you just use that as
> > your MSS for segmentation and it should break things up so that you
> > have a series of equal sized segments split as the frag_list buffer
> > boundaries.
> > 
> > After that all that is left is to update the gso info for the buffers.
> > For GSO_PARTIAL I was handling that on the first segment only.  For
> > this change you would need to update that code to address the fact
> > that you would have to determine the number of segments on the first
> > frame and the last since the last could be less than the first, but
> > all of the others in-between should have the same number of segments.
> 
> I tried to do this and ended up with the patch below.
> Seems to work, but sill needs some tests. So it is
> not an official patch submission.
> 
> Subject: [PATCH net-next RFC] gso: Support partial splitting at the frag_list pointer
> 
> Since commit 8a29111c7 ("net: gro: allow to build full sized skb")
> gro may build buffers with a frag_list. This can hurt forwarding
> because most NICs can't offload such packets, they need to be
> segmented in software. This patch splits buffers with a frag_list
> at the frag_list pointer into buffers that can be TSO offloaded.
> 
> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
> ---
>  net/core/skbuff.c      | 46 ++++++++++++++++++++++++++++++++++++----------
>  net/ipv4/af_inet.c     |  6 ++++--
>  net/ipv4/gre_offload.c |  4 +++-
>  net/ipv4/tcp_offload.c |  3 +++
>  net/ipv4/udp_offload.c |  6 ++++--
>  net/ipv6/ip6_offload.c |  5 ++++-
>  6 files changed, 54 insertions(+), 16 deletions(-)
> 
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 3864b4b6..cb326e5 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -3060,6 +3060,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>  	unsigned int offset = doffset;
>  	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
>  	unsigned int partial_segs = 0;
> +	unsigned int fraglist_segs = 0;
>  	unsigned int headroom;
>  	unsigned int len = head_skb->len;
>  	__be16 proto;
> @@ -3078,16 +3079,27 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>  	sg = !!(features & NETIF_F_SG);
>  	csum = !!can_checksum_protocol(features, proto);
>  
> -	/* GSO partial only requires that we trim off any excess that
> -	 * doesn't fit into an MSS sized block, so take care of that
> -	 * now.
> -	 */
> -	if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) {
> -		partial_segs = len / mss;
> -		if (partial_segs > 1)
> -			mss *= partial_segs;
> -		else
> -			partial_segs = 0;
> +	if (sg && csum) {
> +		/* GSO partial only requires that we trim off any excess that
> +		 * doesn't fit into an MSS sized block, so take care of that
> +		 * now.
> +		 */
> +		if ((features & NETIF_F_GSO_PARTIAL)) {
> +			partial_segs = len / mss;
> +			if (partial_segs > 1)
> +				mss *= partial_segs;
> +			else
> +				partial_segs = 0;
> +		} else if (list_skb && (mss != GSO_BY_FRAGS) &&
> +			   net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) {
> +
> +			skb_walk_frags(head_skb, segs) {
> +				len -= segs->len;
> +			}
> +			fraglist_segs = len / mss;
> +			mss = len;
> +			segs = NULL;
> +		}
>  	}
>  
>  	headroom = skb_headroom(head_skb);
> @@ -3298,6 +3310,20 @@ perform_csum_check:
>  		SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
>  	}
>  
> +	if (fraglist_segs) {
> +		struct sk_buff *iter;
> +
> +		for (iter = segs; iter; iter = iter->next) {
> +			if (iter->next) {
> +				skb_shinfo(iter)->gso_size = skb_shinfo(head_skb)->gso_size;
> +				skb_shinfo(iter)->gso_segs = fraglist_segs;
> +			} else {
> +				skb_shinfo(iter)->gso_size = skb_shinfo(head_skb)->gso_size;
> +				skb_shinfo(iter)->gso_segs = iter->len / skb_shinfo(head_skb)->gso_size;
> +			}
> +		}
> +	}
> +
>  	/* Following permits correct backpressure, for protocols
>  	 * using skb_set_owner_w().
>  	 * Idea is to tranfert ownership from head_skb to last segment.
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index 989a362..ac46233 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1190,7 +1190,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
>  struct sk_buff *inet_gso_segment(struct sk_buff *skb,
>  				 netdev_features_t features)
>  {
> -	bool udpfrag = false, fixedid = false, encap;
> +	bool udpfrag = false, fixedid = false, gso_partial, encap;
>  	struct sk_buff *segs = ERR_PTR(-EINVAL);
>  	const struct net_offload *ops;
>  	unsigned int offset = 0;
> @@ -1243,6 +1243,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
>  	if (IS_ERR_OR_NULL(segs))
>  		goto out;
>  
> +	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
> +
>  	skb = segs;
>  	do {
>  		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
> @@ -1252,7 +1254,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
>  				iph->frag_off |= htons(IP_MF);
>  			offset += skb->len - nhoff - ihl;
>  			tot_len = skb->len - nhoff;
> -		} else if (skb_is_gso(skb)) {
> +		} else if (skb_is_gso(skb) && gso_partial) {
>  			if (!fixedid) {
>  				iph->id = htons(id);
>  				id += skb_shinfo(skb)->gso_segs;
> diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
> index ecd1e09..7c56785 100644
> --- a/net/ipv4/gre_offload.c
> +++ b/net/ipv4/gre_offload.c
> @@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
>  	__be16 protocol = skb->protocol;
>  	u16 mac_len = skb->mac_len;
>  	int gre_offset, outer_hlen;
> -	bool need_csum, ufo;
> +	bool need_csum, ufo, gso_partial;
>  
>  	if (!skb->encapsulation)
>  		goto out;
> @@ -69,6 +69,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
>  		goto out;
>  	}
>  
> +	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
> +

This is set but not used here in GRE code, probably just missed a
chunk in this version..

You probably want to wrap this into a helper, like:

static inline bool skb_is_gso_partial(struct sk_buff *skb)
{
	return !!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL);
}

It would save you from defining a local var for this.
Maybe even make it also consider the skb_is_gso() check too, as you
seem to always have (skb_is_gso() && skb_is_gso_partial()) and if it's
gso_partial, it should also be gso anyway.

(I didn't check skb_segment part of the patch)

  Marcelo

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-25 11:00       ` Steffen Klassert
  2016-08-25 12:38         ` Marcelo Ricardo Leitner
@ 2016-08-25 16:02         ` Alexander Duyck
  2016-08-29 12:00           ` Steffen Klassert
  2016-08-26 20:36         ` Shmulik Ladkani
  2016-08-26 20:59         ` Alexander Duyck
  3 siblings, 1 reply; 16+ messages in thread
From: Alexander Duyck @ 2016-08-25 16:02 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Netdev, Eric Dumazet, Alexander Duyck, Marcelo Ricardo Leitner

On Thu, Aug 25, 2016 at 4:00 AM, Steffen Klassert
<steffen.klassert@secunet.com> wrote:
> On Wed, Aug 24, 2016 at 09:27:54AM -0700, Alexander Duyck wrote:
>>
>> In you case though we maybe be able to make this easier.  If I am not
>> mistaken I believe we should have the main skb, and any in the chain
>> excluding the last containing the same amount of data.
>
> Yes, it seems to be like that. With this observation we can spmplify
> things.
>
>> That being the
>> case we should be able to determine the size that you would need to
>> segment at by taking skb->len, and removing the length of all the
>> skbuffs hanging off of frag_list.  At that point you just use that as
>> your MSS for segmentation and it should break things up so that you
>> have a series of equal sized segments split as the frag_list buffer
>> boundaries.
>>
>> After that all that is left is to update the gso info for the buffers.
>> For GSO_PARTIAL I was handling that on the first segment only.  For
>> this change you would need to update that code to address the fact
>> that you would have to determine the number of segments on the first
>> frame and the last since the last could be less than the first, but
>> all of the others in-between should have the same number of segments.
>
> I tried to do this and ended up with the patch below.
> Seems to work, but sill needs some tests. So it is
> not an official patch submission.
>
> Subject: [PATCH net-next RFC] gso: Support partial splitting at the frag_list pointer
>
> Since commit 8a29111c7 ("net: gro: allow to build full sized skb")
> gro may build buffers with a frag_list. This can hurt forwarding
> because most NICs can't offload such packets, they need to be
> segmented in software. This patch splits buffers with a frag_list
> at the frag_list pointer into buffers that can be TSO offloaded.
>
> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
> ---
>  net/core/skbuff.c      | 46 ++++++++++++++++++++++++++++++++++++----------
>  net/ipv4/af_inet.c     |  6 ++++--
>  net/ipv4/gre_offload.c |  4 +++-
>  net/ipv4/tcp_offload.c |  3 +++
>  net/ipv4/udp_offload.c |  6 ++++--
>  net/ipv6/ip6_offload.c |  5 ++++-
>  6 files changed, 54 insertions(+), 16 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 3864b4b6..cb326e5 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -3060,6 +3060,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>         unsigned int offset = doffset;
>         unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
>         unsigned int partial_segs = 0;
> +       unsigned int fraglist_segs = 0;
>         unsigned int headroom;
>         unsigned int len = head_skb->len;
>         __be16 proto;
> @@ -3078,16 +3079,27 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>         sg = !!(features & NETIF_F_SG);
>         csum = !!can_checksum_protocol(features, proto);
>
> -       /* GSO partial only requires that we trim off any excess that
> -        * doesn't fit into an MSS sized block, so take care of that
> -        * now.
> -        */
> -       if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) {
> -               partial_segs = len / mss;
> -               if (partial_segs > 1)
> -                       mss *= partial_segs;
> -               else
> -                       partial_segs = 0;
> +       if (sg && csum) {
> +               /* GSO partial only requires that we trim off any excess that
> +                * doesn't fit into an MSS sized block, so take care of that
> +                * now.
> +                */
> +               if ((features & NETIF_F_GSO_PARTIAL)) {
> +                       partial_segs = len / mss;
> +                       if (partial_segs > 1)
> +                               mss *= partial_segs;
> +                       else
> +                               partial_segs = 0;
> +               } else if (list_skb && (mss != GSO_BY_FRAGS) &&
> +                          net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) {

It might be worthwhile to declare your own skb pointer in here instead
of reusing segs.  That way you can avoid having it initialized on the
stack and then being overwritten here.

> +
> +                       skb_walk_frags(head_skb, segs) {
> +                               len -= segs->len;
> +                       }

One thought I had is that you may want to add a check here to make
certain that the skb frags have a headlen of 0.  If they don't you
would probably need to bail.

Also a comment on the assumption that no skb will have more page frags
than the first might be useful to explain this for anyone that has to
look into this in the future.

> +                       fraglist_segs = len / mss;
> +                       mss = len;

There are a few things here that need to be verified.  First you
should probably verify that fralist_segs is >= 2.  If not we might as
well just bail since there is no advantage to segmenting frames in
this approach if the segments per resultant frame is just 1.  Also you
may wan to use "mss *= fraglist_segs".  In most cases it should be the
same as len, but in the unlikely event that there is some extra data
that was added to skb->head it should round us down to even MSS sized
chunks.

> +                       segs = NULL;
> +               }
>         }
>
>         headroom = skb_headroom(head_skb);
> @@ -3298,6 +3310,20 @@ perform_csum_check:
>                 SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
>         }
>
> +       if (fraglist_segs) {
> +               struct sk_buff *iter;
> +
> +               for (iter = segs; iter; iter = iter->next) {
> +                       if (iter->next) {
> +                               skb_shinfo(iter)->gso_size = skb_shinfo(head_skb)->gso_size;
> +                               skb_shinfo(iter)->gso_segs = fraglist_segs;
> +                       } else {
> +                               skb_shinfo(iter)->gso_size = skb_shinfo(head_skb)->gso_size;
> +                               skb_shinfo(iter)->gso_segs = iter->len / skb_shinfo(head_skb)->gso_size;
> +                       }
> +               }
> +       }
> +
>         /* Following permits correct backpressure, for protocols
>          * using skb_set_owner_w().
>          * Idea is to tranfert ownership from head_skb to last segment.
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index 989a362..ac46233 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1190,7 +1190,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
>  struct sk_buff *inet_gso_segment(struct sk_buff *skb,
>                                  netdev_features_t features)
>  {
> -       bool udpfrag = false, fixedid = false, encap;
> +       bool udpfrag = false, fixedid = false, gso_partial, encap;
>         struct sk_buff *segs = ERR_PTR(-EINVAL);
>         const struct net_offload *ops;
>         unsigned int offset = 0;
> @@ -1243,6 +1243,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
>         if (IS_ERR_OR_NULL(segs))
>                 goto out;
>
> +       gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
> +
>         skb = segs;
>         do {
>                 iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
> @@ -1252,7 +1254,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
>                                 iph->frag_off |= htons(IP_MF);
>                         offset += skb->len - nhoff - ihl;
>                         tot_len = skb->len - nhoff;
> -               } else if (skb_is_gso(skb)) {
> +               } else if (skb_is_gso(skb) && gso_partial) {
>                         if (!fixedid) {
>                                 iph->id = htons(id);
>                                 id += skb_shinfo(skb)->gso_segs;
> diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
> index ecd1e09..7c56785 100644
> --- a/net/ipv4/gre_offload.c
> +++ b/net/ipv4/gre_offload.c
> @@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
>         __be16 protocol = skb->protocol;
>         u16 mac_len = skb->mac_len;
>         int gre_offset, outer_hlen;
> -       bool need_csum, ufo;
> +       bool need_csum, ufo, gso_partial;
>
>         if (!skb->encapsulation)
>                 goto out;
> @@ -69,6 +69,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
>                 goto out;
>         }
>
> +       gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
> +
>         outer_hlen = skb_tnl_header_len(skb);
>         gre_offset = outer_hlen - tnl_hlen;
>         skb = segs;

Yeah, like Marcelo said this value is unused so you dropped some code somewhere.

> diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
> index 5c59649..dddd227 100644
> --- a/net/ipv4/tcp_offload.c
> +++ b/net/ipv4/tcp_offload.c
> @@ -107,6 +107,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
>
>         /* Only first segment might have ooo_okay set */
>         segs->ooo_okay = ooo_okay;
> +       if (skb_is_gso(segs) && !(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL))
> +               mss = (skb_tail_pointer(segs) - skb_transport_header(segs)) +
> +                      segs->data_len - thlen;
>
>         delta = htonl(oldlen + (thlen + mss));
>

Maybe we should just merge the code for this with GSO_PARTIAL which
has something similar a few lines up.

What you could do is just update mss like so:
    if (skb_is_gso(segs))
        mss *= skb_shinfo(segs)->gso_segs;

Then we have both the GSO_PARTIAL and your case covered.

> diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
> index 81f253b..0b909f9 100644
> --- a/net/ipv4/udp_offload.c
> +++ b/net/ipv4/udp_offload.c
> @@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
>         __be16 new_protocol, bool is_ipv6)
>  {
>         int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
> -       bool remcsum, need_csum, offload_csum, ufo;
> +       bool remcsum, need_csum, offload_csum, ufo, gso_partial;
>         struct sk_buff *segs = ERR_PTR(-EINVAL);
>         struct udphdr *uh = udp_hdr(skb);
>         u16 mac_offset = skb->mac_header;
> @@ -88,6 +88,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
>                 goto out;
>         }
>
> +       gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
> +
>         outer_hlen = skb_tnl_header_len(skb);
>         udp_offset = outer_hlen - tnl_hlen;
>         skb = segs;
> @@ -117,7 +119,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
>                  * will be using a length value equal to only one MSS sized
>                  * segment instead of the entire frame.
>                  */
> -               if (skb_is_gso(skb)) {
> +               if (skb_is_gso(skb) && gso_partial) {
>                         uh->len = htons(skb_shinfo(skb)->gso_size +
>                                         SKB_GSO_CB(skb)->data_offset +
>                                         skb->head - (unsigned char *)uh);

It might be worth while to reverse these checks.  You can probably
save a few cycles if you check for gso_partial first and then
skb_is_gso instead of the current ordering.

> diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
> index 22e90e5..73dd0d1 100644
> --- a/net/ipv6/ip6_offload.c
> +++ b/net/ipv6/ip6_offload.c
> @@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
>         int offset = 0;
>         bool encap, udpfrag;
>         int nhoff;
> +       bool gso_partial;
>
>         skb_reset_network_header(skb);
>         nhoff = skb_network_header(skb) - skb_mac_header(skb);
> @@ -101,9 +102,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
>         if (IS_ERR(segs))
>                 goto out;
>
> +       gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
> +
>         for (skb = segs; skb; skb = skb->next) {
>                 ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
> -               if (skb_is_gso(skb))
> +               if (skb_is_gso(skb) && gso_partial)
>                         payload_len = skb_shinfo(skb)->gso_size +
>                                       SKB_GSO_CB(skb)->data_offset +
>                                       skb->head - (unsigned char *)(ipv6h + 1);
> --
> 1.9.1
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-25 11:00       ` Steffen Klassert
  2016-08-25 12:38         ` Marcelo Ricardo Leitner
  2016-08-25 16:02         ` Alexander Duyck
@ 2016-08-26 20:36         ` Shmulik Ladkani
  2016-08-26 20:45           ` Alexander Duyck
  2016-08-26 20:59         ` Alexander Duyck
  3 siblings, 1 reply; 16+ messages in thread
From: Shmulik Ladkani @ 2016-08-26 20:36 UTC (permalink / raw)
  To: Steffen Klassert, Alexander Duyck
  Cc: Netdev, Eric Dumazet, Alexander Duyck, Marcelo Ricardo Leitner

Hi,

On Thu, 25 Aug 2016 13:00:19 +0200 Steffen Klassert <steffen.klassert@secunet.com> wrote:
> On Wed, Aug 24, 2016 at 09:27:54AM -0700, Alexander Duyck wrote:
> > That being the
> > case we should be able to determine the size that you would need to
> > segment at by taking skb->len, and removing the length of all the
> > skbuffs hanging off of frag_list.  At that point you just use that as
> > your MSS for segmentation and it should break things up so that you
> > have a series of equal sized segments split as the frag_list buffer
> > boundaries.
> > 
> > After that all that is left is to update the gso info for the buffers.
> > For GSO_PARTIAL I was handling that on the first segment only.  For
> > this change you would need to update that code to address the fact
> > that you would have to determine the number of segments on the first
> > frame and the last since the last could be less than the first, but
> > all of the others in-between should have the same number of segments.
> 
> I tried to do this and ended up with the patch below.

Nice idea.

Still I'm missing something here, appreciate if you can explain.

We initially split the original head_skb to "big" skbs per the frag_list
boundaries (in a way similar to the SKB_GSO_PARTIAL trick).

Then we reassign gso_size (and gso_segs) for each seg skb, to be the
original gso_size specified in the head_skb.

So if they go for TSO (for devices doing NETIF_F_SG), each skb will be
split to mss segments by the device.

However, if TSO is off, but GSO is on, who takes care of further
splitting these skbs according to their gso_size?

And another question:
Can this be utilized in any way to solve the problem described in [1] ?

Thanks
Shmulik

[1] https://patchwork.ozlabs.org/patch/661419/

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-26 20:36         ` Shmulik Ladkani
@ 2016-08-26 20:45           ` Alexander Duyck
  2016-08-27  7:47             ` Shmulik Ladkani
  0 siblings, 1 reply; 16+ messages in thread
From: Alexander Duyck @ 2016-08-26 20:45 UTC (permalink / raw)
  To: Shmulik Ladkani
  Cc: Steffen Klassert, Netdev, Eric Dumazet, Alexander Duyck,
	Marcelo Ricardo Leitner

On Fri, Aug 26, 2016 at 1:36 PM, Shmulik Ladkani
<shmulik.ladkani@gmail.com> wrote:
> Hi,
>
> On Thu, 25 Aug 2016 13:00:19 +0200 Steffen Klassert <steffen.klassert@secunet.com> wrote:
>> On Wed, Aug 24, 2016 at 09:27:54AM -0700, Alexander Duyck wrote:
>> > That being the
>> > case we should be able to determine the size that you would need to
>> > segment at by taking skb->len, and removing the length of all the
>> > skbuffs hanging off of frag_list.  At that point you just use that as
>> > your MSS for segmentation and it should break things up so that you
>> > have a series of equal sized segments split as the frag_list buffer
>> > boundaries.
>> >
>> > After that all that is left is to update the gso info for the buffers.
>> > For GSO_PARTIAL I was handling that on the first segment only.  For
>> > this change you would need to update that code to address the fact
>> > that you would have to determine the number of segments on the first
>> > frame and the last since the last could be less than the first, but
>> > all of the others in-between should have the same number of segments.
>>
>> I tried to do this and ended up with the patch below.
>
> Nice idea.
>
> Still I'm missing something here, appreciate if you can explain.
>
> We initially split the original head_skb to "big" skbs per the frag_list
> boundaries (in a way similar to the SKB_GSO_PARTIAL trick).
>
> Then we reassign gso_size (and gso_segs) for each seg skb, to be the
> original gso_size specified in the head_skb.
>
> So if they go for TSO (for devices doing NETIF_F_SG), each skb will be
> split to mss segments by the device.
>
> However, if TSO is off, but GSO is on, who takes care of further
> splitting these skbs according to their gso_size?

I believe the patch resolves it via the net_gso_ok check.  This is
used to verify if the lower device could segment it if we split out
the buffers from skb->frag_list.

> And another question:
> Can this be utilized in any way to solve the problem described in [1] ?
>
> Thanks
> Shmulik
>
> [1] https://patchwork.ozlabs.org/patch/661419/

I don't think so.  This solution is to only do part of the software
offload and still make use of an existing hardware offload.

The problem in item 1 is that you are wanting to fragment a frame that
has to also be segmented.  Really in order to make something like that
work you would have to do two passes.  One for the segmentation and
another for the fragmentation.  The fact that GSO is already used to
handle UDP fragmentation offload makes this a bit trickier since it
means you would have to pass the frame through the same software
offload stack twice.

- Alex

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-25 11:00       ` Steffen Klassert
                           ` (2 preceding siblings ...)
  2016-08-26 20:36         ` Shmulik Ladkani
@ 2016-08-26 20:59         ` Alexander Duyck
  3 siblings, 0 replies; 16+ messages in thread
From: Alexander Duyck @ 2016-08-26 20:59 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Netdev, Eric Dumazet, Alexander Duyck, Marcelo Ricardo Leitner

On Thu, Aug 25, 2016 at 4:00 AM, Steffen Klassert
<steffen.klassert@secunet.com> wrote:
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index 989a362..ac46233 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1190,7 +1190,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
>  struct sk_buff *inet_gso_segment(struct sk_buff *skb,
>                                  netdev_features_t features)
>  {
> -       bool udpfrag = false, fixedid = false, encap;
> +       bool udpfrag = false, fixedid = false, gso_partial, encap;
>         struct sk_buff *segs = ERR_PTR(-EINVAL);
>         const struct net_offload *ops;
>         unsigned int offset = 0;
> @@ -1243,6 +1243,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
>         if (IS_ERR_OR_NULL(segs))
>                 goto out;
>
> +       gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
> +
>         skb = segs;
>         do {
>                 iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
> @@ -1252,7 +1254,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
>                                 iph->frag_off |= htons(IP_MF);
>                         offset += skb->len - nhoff - ihl;
>                         tot_len = skb->len - nhoff;
> -               } else if (skb_is_gso(skb)) {
> +               } else if (skb_is_gso(skb) && gso_partial) {
>                         if (!fixedid) {
>                                 iph->id = htons(id);
>                                 id += skb_shinfo(skb)->gso_segs;

So I was thinking about it and it occurred to me that this part of the
patch has a slight bug.  What you may want to do is take this section
and merge it with the next so that you get the IPv4 IDs right.  Right
now you are skipping this section so id is only increments by 1 in the
next section.  This will result in overlapping IDs in the final
result.  It should be updating like the code here shows for the
gso_partial portion.

- Alex

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-26 20:45           ` Alexander Duyck
@ 2016-08-27  7:47             ` Shmulik Ladkani
  0 siblings, 0 replies; 16+ messages in thread
From: Shmulik Ladkani @ 2016-08-27  7:47 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Steffen Klassert, Netdev, Eric Dumazet, Alexander Duyck,
	Marcelo Ricardo Leitner

Hi,

On Fri, 26 Aug 2016 13:45:56 -0700 Alexander Duyck <alexander.duyck@gmail.com> wrote:
> > However, if TSO is off, but GSO is on, who takes care of further
> > splitting these skbs according to their gso_size?  
> 
> I believe the patch resolves it via the net_gso_ok check.  This is
> used to verify if the lower device could segment it if we split out
> the buffers from skb->frag_list.

Thanks, got it.

> > And another question:
> > Can this be utilized in any way to solve the problem described in [1] ?
> >
> > [1] https://patchwork.ozlabs.org/patch/661419/  
> 
> I don't think so.  This solution is to only do part of the software
> offload and still make use of an existing hardware offload.

Sorry, I wasn't too clear.

When attempting to reduce gso_size in order to avoid
segmentation+fragmentation, problem I'm hitting is:
  http://lists.openwall.net/netdev/2016/08/25/35

Actually, the idea of yours hinted me to a new direction, will pursue
that.

Thanks,
Shmulik

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-25 16:02         ` Alexander Duyck
@ 2016-08-29 12:00           ` Steffen Klassert
  0 siblings, 0 replies; 16+ messages in thread
From: Steffen Klassert @ 2016-08-29 12:00 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Netdev, Eric Dumazet, Alexander Duyck, Marcelo Ricardo Leitner

On Thu, Aug 25, 2016 at 09:02:33AM -0700, Alexander Duyck wrote:
> On Thu, Aug 25, 2016 at 4:00 AM, Steffen Klassert
> > +
> > +                       skb_walk_frags(head_skb, segs) {
> > +                               len -= segs->len;
> > +                       }
> 
> One thought I had is that you may want to add a check here to make
> certain that the skb frags have a headlen of 0.  If they don't you
> would probably need to bail.

Why do you think we need this check? Is it because the assumption that
all excluding the last skb have the same size might be wrong in this
case?

> >
> > +       gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
> > +
> >         outer_hlen = skb_tnl_header_len(skb);
> >         udp_offset = outer_hlen - tnl_hlen;
> >         skb = segs;
> > @@ -117,7 +119,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
> >                  * will be using a length value equal to only one MSS sized
> >                  * segment instead of the entire frame.
> >                  */
> > -               if (skb_is_gso(skb)) {
> > +               if (skb_is_gso(skb) && gso_partial) {
> >                         uh->len = htons(skb_shinfo(skb)->gso_size +
> >                                         SKB_GSO_CB(skb)->data_offset +
> >                                         skb->head - (unsigned char *)uh);
> 
> It might be worth while to reverse these checks.  You can probably
> save a few cycles if you check for gso_partial first and then
> skb_is_gso instead of the current ordering.

Maybe we can even remove the skb_is_gso check, the gso_partial
check should be sufficient.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer
  2016-08-25 12:17           ` Marcelo Ricardo Leitner
@ 2016-08-29 12:02             ` Steffen Klassert
  0 siblings, 0 replies; 16+ messages in thread
From: Steffen Klassert @ 2016-08-29 12:02 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: Alexander Duyck, Netdev, Eric Dumazet, Alexander Duyck

On Thu, Aug 25, 2016 at 09:17:18AM -0300, Marcelo Ricardo Leitner wrote:
> On Thu, Aug 25, 2016 at 09:31:26AM +0200, Steffen Klassert wrote:
> > 
> > Just to understand what you are doing. You generate MTU sized linear
> > buffers in sctp and then, skb_gro_receive() chains up these buffers
> > at the frag_list pointer. skb_gro_receive() does this because
> > skb_gro_offset is null and skb->head_frag is not set in your case.
> > 
> > At segmentation, you just need to split at the frag_list pointer
> > because you know that the chained buffers fit the MTU, right?
> > 
> 
> Correct. Just note that these buffers fit the MTU, but not necessary
> uses all of it. That is main point in here, variable segmentation size.

Thanks for the info. This is a very interesting concept, maybe I can use
it for IPsec too.

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2016-08-29 12:02 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-23  5:20 [PATCH net-next v1] gso: Support partial splitting at the frag_list pointer Steffen Klassert
2016-08-23 14:47 ` Alexander Duyck
2016-08-24  9:32   ` Steffen Klassert
2016-08-24 16:27     ` Alexander Duyck
2016-08-24 17:25       ` Marcelo Ricardo Leitner
2016-08-25  7:31         ` Steffen Klassert
2016-08-25 12:17           ` Marcelo Ricardo Leitner
2016-08-29 12:02             ` Steffen Klassert
2016-08-25 11:00       ` Steffen Klassert
2016-08-25 12:38         ` Marcelo Ricardo Leitner
2016-08-25 16:02         ` Alexander Duyck
2016-08-29 12:00           ` Steffen Klassert
2016-08-26 20:36         ` Shmulik Ladkani
2016-08-26 20:45           ` Alexander Duyck
2016-08-27  7:47             ` Shmulik Ladkani
2016-08-26 20:59         ` Alexander Duyck

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.