* [PATCH net-next v2 1/2] net: ip: refactor SG checks
@ 2021-06-22 22:50 Jakub Kicinski
  2021-06-22 22:50 ` [PATCH net-next v2 2/2] net: ip: avoid OOM kills with large UDP sends over loopback Jakub Kicinski
From: Jakub Kicinski @ 2021-06-22 22:50 UTC (permalink / raw)
  To: davem; +Cc: netdev, willemb, eric.dumazet, dsahern, yoshfuji, Jakub Kicinski

There are a number of rt->dst.dev->features & NETIF_F_SG checks
scattered throughout the code. Shorten the lines by caching
the result of this check in a local variable.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/ip_output.c  | 13 ++++++-------
 net/ipv6/ip6_output.c | 13 ++++++-------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c3efc7d658f6..90031f5446bd 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -981,12 +981,14 @@ static int __ip_append_data(struct sock *sk,
 	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
 	int csummode = CHECKSUM_NONE;
 	struct rtable *rt = (struct rtable *)cork->dst;
+	bool has_sg, paged, extra_uref = false;
 	unsigned int wmem_alloc_delta = 0;
-	bool paged, extra_uref = false;
 	u32 tskey = 0;
 
 	skb = skb_peek_tail(queue);
 
+	has_sg = rt->dst.dev->features & NETIF_F_SG;
+
 	exthdrlen = !skb ? rt->dst.header_len : 0;
 	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
 	paged = !!cork->gso_size;
@@ -1023,8 +1025,7 @@ static int __ip_append_data(struct sock *sk,
 		if (!uarg)
 			return -ENOBUFS;
 		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
-		if (rt->dst.dev->features & NETIF_F_SG &&
-		    csummode == CHECKSUM_PARTIAL) {
+		if (has_sg && csummode == CHECKSUM_PARTIAL) {
 			paged = true;
 		} else {
 			uarg->zerocopy = 0;
@@ -1074,8 +1075,7 @@ static int __ip_append_data(struct sock *sk,
 			fraglen = datalen + fragheaderlen;
 			pagedlen = 0;
 
-			if ((flags & MSG_MORE) &&
-			    !(rt->dst.dev->features&NETIF_F_SG))
+			if ((flags & MSG_MORE) && !has_sg)
 				alloclen = mtu;
 			else if (!paged)
 				alloclen = fraglen;
@@ -1174,8 +1174,7 @@ static int __ip_append_data(struct sock *sk,
 		if (copy > length)
 			copy = length;
 
-		if (!(rt->dst.dev->features&NETIF_F_SG) &&
-		    skb_tailroom(skb) >= copy) {
+		if (!has_sg && skb_tailroom(skb) >= copy) {
 			unsigned int off;
 
 			off = skb->len;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ff4f9ebcf7f6..c667b7e2856f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1444,8 +1444,8 @@ static int __ip6_append_data(struct sock *sk,
 	struct ipv6_txoptions *opt = v6_cork->opt;
 	int csummode = CHECKSUM_NONE;
 	unsigned int maxnonfragsize, headersize;
+	bool has_sg, paged, extra_uref = false;
 	unsigned int wmem_alloc_delta = 0;
-	bool paged, extra_uref = false;
 
 	skb = skb_peek_tail(queue);
 	if (!skb) {
@@ -1453,6 +1453,8 @@ static int __ip6_append_data(struct sock *sk,
 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
 	}
 
+	has_sg = rt->dst.dev->features & NETIF_F_SG;
+
 	paged = !!cork->gso_size;
 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
 	orig_mtu = mtu;
@@ -1515,8 +1517,7 @@ static int __ip6_append_data(struct sock *sk,
 		if (!uarg)
 			return -ENOBUFS;
 		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
-		if (rt->dst.dev->features & NETIF_F_SG &&
-		    csummode == CHECKSUM_PARTIAL) {
+		if (has_sg && csummode == CHECKSUM_PARTIAL) {
 			paged = true;
 		} else {
 			uarg->zerocopy = 0;
@@ -1582,8 +1583,7 @@ static int __ip6_append_data(struct sock *sk,
 			fraglen = datalen + fragheaderlen;
 			pagedlen = 0;
 
-			if ((flags & MSG_MORE) &&
-			    !(rt->dst.dev->features&NETIF_F_SG))
+			if ((flags & MSG_MORE) && !has_sg)
 				alloclen = mtu;
 			else if (!paged)
 				alloclen = fraglen;
@@ -1698,8 +1698,7 @@ static int __ip6_append_data(struct sock *sk,
 		if (copy > length)
 			copy = length;
 
-		if (!(rt->dst.dev->features&NETIF_F_SG) &&
-		    skb_tailroom(skb) >= copy) {
+		if (!has_sg && skb_tailroom(skb) >= copy) {
 			unsigned int off;
 
 			off = skb->len;
-- 
2.31.1



* [PATCH net-next v2 2/2] net: ip: avoid OOM kills with large UDP sends over loopback
  2021-06-22 22:50 [PATCH net-next v2 1/2] net: ip: refactor SG checks Jakub Kicinski
@ 2021-06-22 22:50 ` Jakub Kicinski
  2021-06-23 14:25   ` Eric Dumazet
From: Jakub Kicinski @ 2021-06-22 22:50 UTC (permalink / raw)
  To: davem
  Cc: netdev, willemb, eric.dumazet, dsahern, yoshfuji, Jakub Kicinski,
	Dave Jones

Dave observed a number of machines hitting OOM on the UDP send
path. The workload seems to be sending large UDP packets over
loopback. Since loopback has an MTU of 64k, the kernel will try to
allocate an skb with up to 64k of head space. This has a good
chance of failing under memory pressure. What's worse, if
the message length is <32k the allocation may trigger the
OOM killer.

This is entirely avoidable; we can use an skb with frags.

af_unix solves a similar problem by limiting the head
length to SKB_MAX_ALLOC. This seems like a good and simple
approach. It means that UDP messages > 16kB will now
use fragments if the underlying device supports SG. If extra
allocator pressure causes regressions in real workloads
we can switch to trying the large allocation first and
falling back.
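
For reference, the 16kB cut-off falls out of the SKB_MAX_ALLOC
arithmetic. Below is a minimal userspace sketch of that calculation;
the macro shapes follow include/linux/skbuff.h, while PAGE_SIZE,
SMP_CACHE_BYTES and the 320-byte skb_shared_info size are assumptions
for a 64-bit build with 4k pages:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define SMP_CACHE_BYTES	64UL
#define SHINFO_SIZE	320UL	/* sizeof(struct skb_shared_info), 64-bit */

/* Round up to a cacheline boundary, like SKB_DATA_ALIGN() */
#define SKB_DATA_ALIGN(x) \
	(((x) + (SMP_CACHE_BYTES - 1)) & ~(SMP_CACHE_BYTES - 1))

/* Head space left once the shared info block is accounted for */
#define SKB_WITH_OVERHEAD(x)	((x) - SKB_DATA_ALIGN(SHINFO_SIZE))
/* Largest head that, together with skb_shared_info, still fits in
 * a (PAGE_SIZE << order) allocation */
#define SKB_MAX_ORDER(x, order)	SKB_WITH_OVERHEAD((PAGE_SIZE << (order)) - (x))
/* SKB_MAX_ALLOC == SKB_MAX_ORDER(0, 2) == 16384 - 320 == 16064 */
#define SKB_MAX_ALLOC		SKB_MAX_ORDER(0, 2)

int main(void)
{
	printf("SKB_MAX_ALLOC = %lu\n", SKB_MAX_ALLOC);	/* 16064 */
	return 0;
}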

Reported-by: Dave Jones <dsj@fb.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/ip_output.c  | 2 +-
 net/ipv6/ip6_output.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 90031f5446bd..1ab140c173d0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1077,7 +1077,7 @@ static int __ip_append_data(struct sock *sk,
 
 			if ((flags & MSG_MORE) && !has_sg)
 				alloclen = mtu;
-			else if (!paged)
+			else if (!paged && (fraglen < SKB_MAX_ALLOC || !has_sg))
 				alloclen = fraglen;
 			else {
 				alloclen = min_t(int, fraglen, MAX_HEADER);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c667b7e2856f..46d805097a79 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1585,7 +1585,7 @@ static int __ip6_append_data(struct sock *sk,
 
 			if ((flags & MSG_MORE) && !has_sg)
 				alloclen = mtu;
-			else if (!paged)
+			else if (!paged && (fraglen < SKB_MAX_ALLOC || !has_sg))
 				alloclen = fraglen;
 			else {
 				alloclen = min_t(int, fraglen, MAX_HEADER);
-- 
2.31.1



* Re: [PATCH net-next v2 2/2] net: ip: avoid OOM kills with large UDP sends over loopback
  2021-06-22 22:50 ` [PATCH net-next v2 2/2] net: ip: avoid OOM kills with large UDP sends over loopback Jakub Kicinski
@ 2021-06-23 14:25   ` Eric Dumazet
  2021-06-23 15:58     ` Jakub Kicinski
From: Eric Dumazet @ 2021-06-23 14:25 UTC (permalink / raw)
  To: Jakub Kicinski, davem
  Cc: netdev, willemb, eric.dumazet, dsahern, yoshfuji, Dave Jones



On 6/23/21 12:50 AM, Jakub Kicinski wrote:
> Dave observed a number of machines hitting OOM on the UDP send
> path. The workload seems to be sending large UDP packets over
> loopback. Since loopback has an MTU of 64k, the kernel will try to
> allocate an skb with up to 64k of head space. This has a good
> chance of failing under memory pressure. What's worse, if
> the message length is <32k the allocation may trigger the
> OOM killer.
> 
> This is entirely avoidable; we can use an skb with frags.

Are you referring to IP fragments, or page frags?

> 
> af_unix solves a similar problem by limiting the head
> length to SKB_MAX_ALLOC. This seems like a good and simple
> approach. It means that UDP messages > 16kB will now
> use fragments if the underlying device supports SG. If extra
> allocator pressure causes regressions in real workloads
> we can switch to trying the large allocation first and
> falling back.
> 
> Reported-by: Dave Jones <dsj@fb.com>
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> ---
>  net/ipv4/ip_output.c  | 2 +-
>  net/ipv6/ip6_output.c | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 90031f5446bd..1ab140c173d0 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -1077,7 +1077,7 @@ static int __ip_append_data(struct sock *sk,
>  
>  			if ((flags & MSG_MORE) && !has_sg)
>  				alloclen = mtu;
> -			else if (!paged)
> +			else if (!paged && (fraglen < SKB_MAX_ALLOC || !has_sg))

This indeed looks better, but there are some boundary conditions,
caused by the fact that we add hh_len + 15 later when allocating the skb.

(I expect hh_len + 15 to be 31.)


You probably need 
	else if (!paged && (fraglen + hh_len + 15 < SKB_MAX_ALLOC || !has_sg))

Otherwise we might still attempt order-3 allocations?

SKB_MAX_ALLOC is currently 16064 (the skb_shared_info size being 320 on 64-bit arches).

A UDP message with 16034 bytes of payload would translate to
alloclen == 16062 (16034 plus 28 bytes for the UDP+IP headers). Adding 31 bytes
for hh_len + 15 and 320 bytes for skb_shared_info brings this to 16413, thus
asking for 32768 bytes (an order-3 page).

(Without the hard header padding, 16062 + 320 = 16382, which is smaller
than 16384.)
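
To make the rounding visible, a quick userspace sketch of the same
numbers; kmalloc_bucket() here is only a stand-in for kmalloc's
power-of-two sizing on large requests, and the 28-byte header,
31-byte hh_len + 15 and 320-byte skb_shared_info values are the
assumptions from above:

#include <stdio.h>

/* Stand-in for kmalloc's behaviour on large requests: round the
 * size up to the next power-of-two bucket. */
static unsigned long kmalloc_bucket(unsigned long size)
{
	unsigned long bucket = 32;

	while (bucket < size)
		bucket <<= 1;
	return bucket;
}

int main(void)
{
	unsigned long payload = 16034;		/* UDP payload bytes */
	unsigned long hdrs = 28;		/* UDP + IPv4 headers */
	unsigned long alloclen = payload + hdrs;	/* 16062 */
	unsigned long hh_pad = 31;		/* hh_len + 15 */
	unsigned long shinfo = 320;		/* skb_shared_info, 64-bit */

	/* 16062 + 320 = 16382 -> still fits the 16384-byte bucket */
	printf("without hh pad: %lu -> %lu\n",
	       alloclen + shinfo, kmalloc_bucket(alloclen + shinfo));

	/* 16062 + 31 + 320 = 16413 -> rounds up to 32768 (order-3) */
	printf("with hh pad:    %lu -> %lu\n",
	       alloclen + hh_pad + shinfo,
	       kmalloc_bucket(alloclen + hh_pad + shinfo));
	return 0;
}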


>  				alloclen = fraglen;
>  			else {
>  				alloclen = min_t(int, fraglen, MAX_HEADER);
> diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> index c667b7e2856f..46d805097a79 100644
> --- a/net/ipv6/ip6_output.c
> +++ b/net/ipv6/ip6_output.c
> @@ -1585,7 +1585,7 @@ static int __ip6_append_data(struct sock *sk,
>  
>  			if ((flags & MSG_MORE) && !has_sg)
>  				alloclen = mtu;
> -			else if (!paged)
> +			else if (!paged && (fraglen < SKB_MAX_ALLOC || !has_sg))
>  				alloclen = fraglen;
>  			else {
>  				alloclen = min_t(int, fraglen, MAX_HEADER);
> 


* Re: [PATCH net-next v2 2/2] net: ip: avoid OOM kills with large UDP sends over loopback
  2021-06-23 14:25   ` Eric Dumazet
@ 2021-06-23 15:58     ` Jakub Kicinski
From: Jakub Kicinski @ 2021-06-23 15:58 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, netdev, willemb, dsahern, yoshfuji, Dave Jones

On Wed, 23 Jun 2021 16:25:18 +0200 Eric Dumazet wrote:
> On 6/23/21 12:50 AM, Jakub Kicinski wrote:
> > Dave observed a number of machines hitting OOM on the UDP send
> > path. The workload seems to be sending large UDP packets over
> > loopback. Since loopback has an MTU of 64k, the kernel will try to
> > allocate an skb with up to 64k of head space. This has a good
> > chance of failing under memory pressure. What's worse, if
> > the message length is <32k the allocation may trigger the
> > OOM killer.
> > 
> > This is entirely avoidable; we can use an skb with frags.  
> 
> Are you referring to IP fragments, or page frags?

Page frags; it's an annoyingly overloaded term. I'll say "paged", which is
not the common term but at least won't be confusing.

> > af_unix solves a similar problem by limiting the head
> > length to SKB_MAX_ALLOC. This seems like a good and simple
> > approach. It means that UDP messages > 16kB will now
> > use fragments if the underlying device supports SG. If extra
> > allocator pressure causes regressions in real workloads
> > we can switch to trying the large allocation first and
> > falling back.
> > 
> > Reported-by: Dave Jones <dsj@fb.com>
> > Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> > ---
> >  net/ipv4/ip_output.c  | 2 +-
> >  net/ipv6/ip6_output.c | 2 +-
> >  2 files changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> > index 90031f5446bd..1ab140c173d0 100644
> > --- a/net/ipv4/ip_output.c
> > +++ b/net/ipv4/ip_output.c
> > @@ -1077,7 +1077,7 @@ static int __ip_append_data(struct sock *sk,
> >  
> >  			if ((flags & MSG_MORE) && !has_sg)
> >  				alloclen = mtu;
> > -			else if (!paged)
> > +			else if (!paged && (fraglen < SKB_MAX_ALLOC || !has_sg))  
> 
> This indeed looks better, but there are some boundary conditions,
> caused by the fact that we add hh_len + 15 later when allocating the skb.
> 
> (I expect hh_len + 15 to be 31.)
> 
> 
> You probably need 
> 	else if (!paged && (fraglen + hh_len + 15 < SKB_MAX_ALLOC || !has_sg))
> 
> Otherwise we might still attempt order-3 allocations?
> 
> SKB_MAX_ALLOC is currently 16064 (the skb_shared_info size being 320 on 64-bit arches).
> 
> A UDP message with 16034 bytes of payload would translate to
> alloclen == 16062 (16034 plus 28 bytes for the UDP+IP headers). Adding 31 bytes
> for hh_len + 15 and 320 bytes for skb_shared_info brings this to 16413, thus
> asking for 32768 bytes (an order-3 page).
> 
> (Without the hard header padding, 16062 + 320 = 16382, which is smaller
> than 16384.)

Will do, thanks!

> >  				alloclen = fraglen;
> >  			else {
> >  				alloclen = min_t(int, fraglen, MAX_HEADER);
> > diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> > index c667b7e2856f..46d805097a79 100644
> > --- a/net/ipv6/ip6_output.c
> > +++ b/net/ipv6/ip6_output.c
> > @@ -1585,7 +1585,7 @@ static int __ip6_append_data(struct sock *sk,
> >  
> >  			if ((flags & MSG_MORE) && !has_sg)
> >  				alloclen = mtu;
> > -			else if (!paged)
> > +			else if (!paged && (fraglen < SKB_MAX_ALLOC || !has_sg))
> >  				alloclen = fraglen;
> >  			else {
> >  				alloclen = min_t(int, fraglen, MAX_HEADER);
> >   



