All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-06 13:16 ` Hans Westgaard Ry
  0 siblings, 0 replies; 54+ messages in thread
From: Hans Westgaard Ry @ 2016-01-06 13:16 UTC (permalink / raw)
  To: David S. Miller
  Cc: Hans Westgaard Ry, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Alexei Starovoitov,
	Jiri Pirko, Eric Dumazet, Daniel Borkmann, Nicolas Dichtel,
	"Eric W. Biederman",
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman,
	"hannes@stressinduktion.org",
	Edward Jee, Julia Lawall, netdev, linux-kernel, Haakon Bugge,
	Knut Omang, Wei Lin Guay, Santosh Shilimkar, Yuval Shaia

Devices may have limits on the number of fragments in an skb they
support. Current codebase uses a constant as maximum for number of
fragments (MAX_SKB_FRAGS) one skb can hold and use.

When enabling scatter/gather and running traffic with many small
messages the codebase uses the maximum number of fragments and thereby
violates the max for certain devices.

An example of such a violation is when running IPoIB on a HCA
supporting 16 SGE on an architecture with 4K pagesize. The
MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
segment we end up with send_requests with 18 SGE resulting in
kernel-panic.

The patch allows the device to limit the maximum number of fragments
used in one skb.

The functionality corresponds to gso_max_size/gso_max_segs for gso.

Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>
Reviewed-by: Knut Omang <knut.omang@oracle.com>
Reviewed-by: Wei Lin Guay <wei.lin.guay@oracle.com>
Reviewed-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>

---
 include/linux/netdevice.h | 8 ++++++++
 include/net/sock.h        | 2 ++
 net/core/dev.c            | 1 +
 net/core/sock.c           | 1 +
 net/ipv4/tcp.c            | 4 ++--
 5 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3b5d134..c661865 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1513,6 +1513,8 @@ enum netdev_priv_flags {
  *			NIC for GSO
  *	@gso_min_segs:	Minimum number of segments that can be passed to the
  *			NIC for GSO
+ *     @sg_max_frags:  Maximum number of fragments that can be passed to the
+ *                     NIC for SG
  *
  *	@dcbnl_ops:	Data Center Bridging netlink ops
  *	@num_tc:	Number of traffic classes in the net device
@@ -1799,6 +1801,7 @@ struct net_device {
 	struct phy_device *phydev;
 	struct lock_class_key *qdisc_tx_busylock;
 	bool proto_down;
+	u16 sg_max_frags;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -3794,6 +3797,11 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 {
 	dev->gso_max_size = size;
 }
+static inline void netif_set_sg_max_frags(struct net_device *dev,
+					u16 max)
+{
+	dev->sg_max_frags = min_t(u16, MAX_SKB_FRAGS, max);
+}
 
 static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
 					int pulled_hlen, u16 mac_offset,
diff --git a/include/net/sock.h b/include/net/sock.h
index 52d27ee..c884104 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -274,6 +274,7 @@ struct cg_proto;
   *	@sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
   *	@sk_gso_max_size: Maximum GSO segment size to build
   *	@sk_gso_max_segs: Maximum number of GSO segments
+  *    @sk_sg_max_frags: Maximum number of SG fragments
   *	@sk_lingertime: %SO_LINGER l_linger setting
   *	@sk_backlog: always used with the per-socket spinlock held
   *	@sk_callback_lock: used with the callbacks in the end of this struct
@@ -456,6 +457,7 @@ struct sock {
 	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);
 	void                    (*sk_destruct)(struct sock *sk);
+	u16                     sk_sg_max_frags;
 };
 
 #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
diff --git a/net/core/dev.c b/net/core/dev.c
index ae00b89..abfbd3a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7106,6 +7106,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	dev->gso_max_size = GSO_MAX_SIZE;
 	dev->gso_max_segs = GSO_MAX_SEGS;
 	dev->gso_min_segs = 0;
+	dev->sg_max_frags = MAX_SKB_FRAGS;
 
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);
diff --git a/net/core/sock.c b/net/core/sock.c
index e31dfce..53d0cf0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1621,6 +1621,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 		}
 	}
 	sk->sk_gso_max_segs = max_segs;
+	sk->sk_sg_max_frags = dst->dev->sg_max_frags;
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c82cca1..ca5f7a0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -938,7 +938,7 @@ new_segment:
 
 		i = skb_shinfo(skb)->nr_frags;
 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
-		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+		if (!can_coalesce && i >= sk->sk_sg_max_frags) {
 			tcp_mark_push(tp, skb);
 			goto new_segment;
 		}
@@ -1211,7 +1211,7 @@ new_segment:
 
 			if (!skb_can_coalesce(skb, i, pfrag->page,
 					      pfrag->offset)) {
-				if (i == MAX_SKB_FRAGS || !sg) {
+				if (i >= sk->sk_sg_max_frags || !sg) {
 					tcp_mark_push(tp, skb);
 					goto new_segment;
 				}
-- 
2.4.3


^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-06 13:16 ` Hans Westgaard Ry
  0 siblings, 0 replies; 54+ messages in thread
From: Hans Westgaard Ry @ 2016-01-06 13:16 UTC (permalink / raw)
  To: David S. Miller
  Cc: Hans Westgaard Ry, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Alexei Starovoitov,
	Jiri Pirko, Eric Dumazet, Daniel Borkmann, Nicolas Dichtel,
	"Eric W. Biederman",
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman,
	"hannes@stressinduktion.org",
	Edward Jee, Julia Lawall

Devices may have limits on the number of fragments in an skb they
support. Current codebase uses a constant as maximum for number of
fragments (MAX_SKB_FRAGS) one skb can hold and use.

When enabling scatter/gather and running traffic with many small
messages the codebase uses the maximum number of fragments and thereby
violates the max for certain devices.

An example of such a violation is when running IPoIB on a HCA
supporting 16 SGE on an architecture with 4K pagesize. The
MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
segment we end up with send_requests with 18 SGE resulting in
kernel-panic.

The patch allows the device to limit the maximum number of fragments
used in one skb.

The functionality corresponds to gso_max_size/gso_max_segs for gso.

Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>
Reviewed-by: Knut Omang <knut.omang@oracle.com>
Reviewed-by: Wei Lin Guay <wei.lin.guay@oracle.com>
Reviewed-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>

---
 include/linux/netdevice.h | 8 ++++++++
 include/net/sock.h        | 2 ++
 net/core/dev.c            | 1 +
 net/core/sock.c           | 1 +
 net/ipv4/tcp.c            | 4 ++--
 5 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3b5d134..c661865 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1513,6 +1513,8 @@ enum netdev_priv_flags {
  *			NIC for GSO
  *	@gso_min_segs:	Minimum number of segments that can be passed to the
  *			NIC for GSO
+ *     @sg_max_frags:  Maximum number of fragments that can be passed to the
+ *                     NIC for SG
  *
  *	@dcbnl_ops:	Data Center Bridging netlink ops
  *	@num_tc:	Number of traffic classes in the net device
@@ -1799,6 +1801,7 @@ struct net_device {
 	struct phy_device *phydev;
 	struct lock_class_key *qdisc_tx_busylock;
 	bool proto_down;
+	u16 sg_max_frags;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -3794,6 +3797,11 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 {
 	dev->gso_max_size = size;
 }
+static inline void netif_set_sg_max_frags(struct net_device *dev,
+					u16 max)
+{
+	dev->sg_max_frags = min_t(u16, MAX_SKB_FRAGS, max);
+}
 
 static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
 					int pulled_hlen, u16 mac_offset,
diff --git a/include/net/sock.h b/include/net/sock.h
index 52d27ee..c884104 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -274,6 +274,7 @@ struct cg_proto;
   *	@sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
   *	@sk_gso_max_size: Maximum GSO segment size to build
   *	@sk_gso_max_segs: Maximum number of GSO segments
+  *    @sk_sg_max_frags: Maximum number of SG fragments
   *	@sk_lingertime: %SO_LINGER l_linger setting
   *	@sk_backlog: always used with the per-socket spinlock held
   *	@sk_callback_lock: used with the callbacks in the end of this struct
@@ -456,6 +457,7 @@ struct sock {
 	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);
 	void                    (*sk_destruct)(struct sock *sk);
+	u16                     sk_sg_max_frags;
 };
 
 #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
diff --git a/net/core/dev.c b/net/core/dev.c
index ae00b89..abfbd3a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7106,6 +7106,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	dev->gso_max_size = GSO_MAX_SIZE;
 	dev->gso_max_segs = GSO_MAX_SEGS;
 	dev->gso_min_segs = 0;
+	dev->sg_max_frags = MAX_SKB_FRAGS;
 
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);
diff --git a/net/core/sock.c b/net/core/sock.c
index e31dfce..53d0cf0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1621,6 +1621,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 		}
 	}
 	sk->sk_gso_max_segs = max_segs;
+	sk->sk_sg_max_frags = dst->dev->sg_max_frags;
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c82cca1..ca5f7a0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -938,7 +938,7 @@ new_segment:
 
 		i = skb_shinfo(skb)->nr_frags;
 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
-		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+		if (!can_coalesce && i >= sk->sk_sg_max_frags) {
 			tcp_mark_push(tp, skb);
 			goto new_segment;
 		}
@@ -1211,7 +1211,7 @@ new_segment:
 
 			if (!skb_can_coalesce(skb, i, pfrag->page,
 					      pfrag->offset)) {
-				if (i == MAX_SKB_FRAGS || !sg) {
+				if (i >= sk->sk_sg_max_frags || !sg) {
 					tcp_mark_push(tp, skb);
 					goto new_segment;
 				}
-- 
2.4.3

^ permalink raw reply	[flat|nested] 54+ messages in thread

* RE: [PATCH] net: add per device sg_max_frags for skb
  2016-01-06 13:16 ` Hans Westgaard Ry
@ 2016-01-06 13:59   ` David Laight
  -1 siblings, 0 replies; 54+ messages in thread
From: David Laight @ 2016-01-06 13:59 UTC (permalink / raw)
  To: 'Hans Westgaard Ry', David S. Miller
  Cc: Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Eric Dumazet,
	Daniel Borkmann, Nicolas Dichtel,
	"Eric W. Biederman",
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman,
	"hannes@stressinduktion.org",
	Edward Jee, Julia Lawall, netdev, linux-kernel, Haakon Bugge,
	Knut Omang, Wei Lin Guay, Santosh Shilimkar, Yuval Shaia

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1274 bytes --]

From: Hans Westgaard Ry
> Sent: 06 January 2016 13:16
> Devices may have limits on the number of fragments in an skb they
> support. Current codebase uses a constant as maximum for number of
> fragments (MAX_SKB_FRAGS) one skb can hold and use.
> 
> When enabling scatter/gather and running traffic with many small
> messages the codebase uses the maximum number of fragments and thereby
> violates the max for certain devices.
> 
> An example of such a violation is when running IPoIB on a HCA
> supporting 16 SGE on an architecture with 4K pagesize. The
> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
> segment we end up with send_requests with 18 SGE resulting in
> kernel-panic.
> 
> The patch allows the device to limit the maximum number fragments used
> in one skb.

This doesn't seem to me to be the correct way to fix this.
Anything that adds an extra fragment (in this case IPoIB) should allow
for the skb already having the maximum number of fragments.
Fully linearising the skb is overkill, but I think the first fragment
can be added to the linear part of the skb.

	David


ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 54+ messages in thread

* RE: [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-06 13:59   ` David Laight
  0 siblings, 0 replies; 54+ messages in thread
From: David Laight @ 2016-01-06 13:59 UTC (permalink / raw)
  To: 'Hans Westgaard Ry', David S. Miller
  Cc: Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Eric Dumazet,
	Daniel Borkmann, Nicolas Dichtel,
	""Eric W. Biederman"",
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman,
	""hannes@stressinduktion.org"",
	Edward Jee, Julia Lawall, netdev,

From: Hans Westgaard Ry
> Sent: 06 January 2016 13:16
> Devices may have limits on the number of fragments in an skb they
> support. Current codebase uses a constant as maximum for number of
> fragments (MAX_SKB_FRAGS) one skb can hold and use.
> 
> When enabling scatter/gather and running traffic with many small
> messages the codebase uses the maximum number of fragments and thereby
> violates the max for certain devices.
> 
> An example of such a violation is when running IPoIB on a HCA
> supporting 16 SGE on an architecture with 4K pagesize. The
> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
> segment we end up with send_requests with 18 SGE resulting in
> kernel-panic.
> 
> The patch allows the device to limit the maximum number fragments used
> in one skb.

This doesn't seem to me to be the correct way to fix this.
Anything that adds an extra fragment (in this case IPoIB) should allow
for the skb already having the maximum number of fragments.
Fully linearising the skb is overkill, but I think the first fragment
can be added to the linear part of the skb.

	David



^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
  2016-01-06 13:16 ` Hans Westgaard Ry
@ 2016-01-06 14:05   ` Eric Dumazet
  -1 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-01-06 14:05 UTC (permalink / raw)
  To: Hans Westgaard Ry
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Alexei Starovoitov,
	Jiri Pirko, Eric Dumazet, Daniel Borkmann, Nicolas Dichtel,
	""Eric W. Biederman"",
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman,
	""hannes@stressinduktion.org"",
	Edward Jee, Julia Lawall, netdev, linux-kernel, Haakon Bugge,
	Knut Omang, Wei Lin Guay, Santosh Shilimkar, Yuval Shaia

On Wed, 2016-01-06 at 14:16 +0100, Hans Westgaard Ry wrote:
> Devices may have limits on the number of fragments in an skb they
> support. Current codebase uses a constant as maximum for number of
> fragments (MAX_SKB_FRAGS) one skb can hold and use.
> 
> When enabling scatter/gather and running traffic with many small
> messages the codebase uses the maximum number of fragments and thereby
> violates the max for certain devices.
> 
> An example of such a violation is when running IPoIB on a HCA
> supporting 16 SGE on an architecture with 4K pagesize. The
> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
> segment we end up with send_requests with 18 SGE resulting in
> kernel-panic.
> 
> The patch allows the device to limit the maximum number fragments used
> in one skb.
> 
> The functionality corresponds to gso_max_size/gso_max_segs for gso.

Unfortunately this is not the right place to fix this issue.

Think about forwarding workloads, where the SKB is cooked by GRO engine.

Anyway, local TCP stack uses 32KB page fragments, so typical skb has no
more than 3 frags.

Look at ndo_features_check(), where the problematic device driver can
add its logic.




^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-06 14:05   ` Eric Dumazet
  0 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-01-06 14:05 UTC (permalink / raw)
  To: Hans Westgaard Ry
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Alexei Starovoitov,
	Jiri Pirko, Eric Dumazet, Daniel Borkmann, Nicolas Dichtel,
	""Eric W. Biederman"",
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman,
	""hannes@stressinduktion.org"",
	Edward Jee, Julia Lawall, netdev, linux-kerne

On Wed, 2016-01-06 at 14:16 +0100, Hans Westgaard Ry wrote:
> Devices may have limits on the number of fragments in an skb they
> support. Current codebase uses a constant as maximum for number of
> fragments (MAX_SKB_FRAGS) one skb can hold and use.
> 
> When enabling scatter/gather and running traffic with many small
> messages the codebase uses the maximum number of fragments and thereby
> violates the max for certain devices.
> 
> An example of such a violation is when running IPoIB on a HCA
> supporting 16 SGE on an architecture with 4K pagesize. The
> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
> segment we end up with send_requests with 18 SGE resulting in
> kernel-panic.
> 
> The patch allows the device to limit the maximum number fragments used
> in one skb.
> 
> The functionality corresponds to gso_max_size/gso_max_segs for gso.

Unfortunately this is not the right place to fix this issue.

Think about forwarding workloads, where the SKB is cooked by GRO engine.

Anyway, local TCP stack uses 32KB page fragments, so typical skb has no
more than 3 frags.

Look at ndo_features_check(), where the problematic device driver can
add its logic.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
  2016-01-06 13:59   ` David Laight
@ 2016-01-08  9:55     ` Hans Westgaard Ry
  -1 siblings, 0 replies; 54+ messages in thread
From: Hans Westgaard Ry @ 2016-01-08  9:55 UTC (permalink / raw)
  To: David Laight, David S. Miller
  Cc: Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Eric Dumazet,
	Daniel Borkmann, Nicolas Dichtel, Eric W. Biederman ,
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman,
	"hannes@stressinduktion.org",
	Edward Jee, Julia Lawall, netdev, linux-kernel, Haakon Bugge,
	Knut Omang, Wei Lin Guay, Santosh Shilimkar, Yuval Shaia



On 01/06/2016 02:59 PM, David Laight wrote:
> From: Hans Westgaard Ry
>> Sent: 06 January 2016 13:16
>> Devices may have limits on the number of fragments in an skb they
>> support. Current codebase uses a constant as maximum for number of
>> fragments (MAX_SKB_FRAGS) one skb can hold and use.
>>
>> When enabling scatter/gather and running traffic with many small
>> messages the codebase uses the maximum number of fragments and thereby
>> violates the max for certain devices.
>>
>> An example of such a violation is when running IPoIB on a HCA
>> supporting 16 SGE on an architecture with 4K pagesize. The
>> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
>> segment we end up with send_requests with 18 SGE resulting in
>> kernel-panic.
>>
>> The patch allows the device to limit the maximum number fragments used
>> in one skb.
> This doesn't seem to me to be the correct way to fix this.
> Anything that adds an extra fragment (in this case IPoIB) should allow
> for the skb already having the maximum number of fragments.
> Fully linearising the skb is overkill, but I think the first fragment
> can be added to the linear part of the skb.
>
> 	David
>
>
When IPoIB handles an skb request it converts fragments to SGEs to
be handled by an HCA.
The problem arises when the HCA has a limited number of SGEs, fewer than
MAX_SKB_FRAGS.
(It gets a little worse since IPoIB needs yet another segment.)
I have not found any easy way of fixing this with the current codebase.

Hans

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-08  9:55     ` Hans Westgaard Ry
  0 siblings, 0 replies; 54+ messages in thread
From: Hans Westgaard Ry @ 2016-01-08  9:55 UTC (permalink / raw)
  To: David Laight, David S. Miller
  Cc: Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Eric Dumazet,
	Daniel Borkmann, Nicolas Dichtel, Eric W. Biederman ,
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman,
	" hannes"@stressinduktion.org ,
	Edward Jee, Julia Lawall, netdev, linux-kernel@vger.kernel.org



On 01/06/2016 02:59 PM, David Laight wrote:
> From: Hans Westgaard Ry
>> Sent: 06 January 2016 13:16
>> Devices may have limits on the number of fragments in an skb they
>> support. Current codebase uses a constant as maximum for number of
>> fragments (MAX_SKB_FRAGS) one skb can hold and use.
>>
>> When enabling scatter/gather and running traffic with many small
>> messages the codebase uses the maximum number of fragments and thereby
>> violates the max for certain devices.
>>
>> An example of such a violation is when running IPoIB on a HCA
>> supporting 16 SGE on an architecture with 4K pagesize. The
>> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
>> segment we end up with send_requests with 18 SGE resulting in
>> kernel-panic.
>>
>> The patch allows the device to limit the maximum number fragments used
>> in one skb.
> This doesn't seem to me to be the correct way to fix this.
> Anything that adds an extra fragment (in this case IPoIB) should allow
> for the skb already having the maximum number of fragments.
> Fully linearising the skb is overkill, but I think the first fragment
> can be added to the linear part of the skb.
>
> 	David
>
>
When IPoIB handles an skb request it converts fragments to SGEs to
be handled by an HCA.
The problem arises when the HCA has a limited number of SGEs, fewer than
MAX_SKB_FRAGS.
(It gets a little worse since IPoIB needs yet another segment.)
I have not found any easy way of fixing this with the current codebase.

Hans

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
  2016-01-06 14:05   ` Eric Dumazet
@ 2016-01-08 10:01     ` Hans Westgaard Ry
  -1 siblings, 0 replies; 54+ messages in thread
From: Hans Westgaard Ry @ 2016-01-08 10:01 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Alexei Starovoitov,
	Jiri Pirko, Eric Dumazet, Daniel Borkmann, Nicolas Dichtel,
	Eric W. Biederman ,
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman,
	"hannes@stressinduktion.org",
	Edward Jee, Julia Lawall, netdev, linux-kernel, Haakon Bugge,
	Knut Omang, Wei Lin Guay, Santosh Shilimkar, Yuval Shaia



On 01/06/2016 03:05 PM, Eric Dumazet wrote:
> On Wed, 2016-01-06 at 14:16 +0100, Hans Westgaard Ry wrote:
>> Devices may have limits on the number of fragments in an skb they
>> support. Current codebase uses a constant as maximum for number of
>> fragments (MAX_SKB_FRAGS) one skb can hold and use.
>>
>> When enabling scatter/gather and running traffic with many small
>> messages the codebase uses the maximum number of fragments and thereby
>> violates the max for certain devices.
>>
>> An example of such a violation is when running IPoIB on a HCA
>> supporting 16 SGE on an architecture with 4K pagesize. The
>> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
>> segment we end up with send_requests with 18 SGE resulting in
>> kernel-panic.
>>
>> The patch allows the device to limit the maximum number fragments used
>> in one skb.
>>
>> The functionality corresponds to gso_max_size/gso_max_segs for gso.
> Unfortunately this is not the right place to fix this issue.
>
> Think about forwarding workloads, where the SKB is cooked by GRO engine.
>
> Anyway, local TCP stack uses 32KB page fragments, so typical skb has no
> more than 3 frags.
>
> Look at ndo_features_check(), where the problematic device driver can
> add its logic.
>
>
>
I've had a look at ndo_features_check and understand that I could supply 
my own
version of the routine, but I wasn't able to figure out how that would 
solve my problem.
As far as I can see the routine is not called in the part of code 
handling scatter/gather.
Could you help out with more info?


Hans

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-08 10:01     ` Hans Westgaard Ry
  0 siblings, 0 replies; 54+ messages in thread
From: Hans Westgaard Ry @ 2016-01-08 10:01 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Alexei Starovoitov,
	Jiri Pirko, Eric Dumazet, Daniel Borkmann, Nicolas Dichtel,
	Eric W. Biederman ,
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman,
	"hannes@stressinduktion.org",
	Edward Jee, Julia Lawall, netdev, linux-kernel



On 01/06/2016 03:05 PM, Eric Dumazet wrote:
> On Wed, 2016-01-06 at 14:16 +0100, Hans Westgaard Ry wrote:
>> Devices may have limits on the number of fragments in an skb they
>> support. Current codebase uses a constant as maximum for number of
>> fragments (MAX_SKB_FRAGS) one skb can hold and use.
>>
>> When enabling scatter/gather and running traffic with many small
>> messages the codebase uses the maximum number of fragments and thereby
>> violates the max for certain devices.
>>
>> An example of such a violation is when running IPoIB on a HCA
>> supporting 16 SGE on an architecture with 4K pagesize. The
>> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
>> segment we end up with send_requests with 18 SGE resulting in
>> kernel-panic.
>>
>> The patch allows the device to limit the maximum number fragments used
>> in one skb.
>>
>> The functionality corresponds to gso_max_size/gso_max_segs for gso.
> Unfortunately this is not the right place to fix this issue.
>
> Think about forwarding workloads, where the SKB is cooked by GRO engine.
>
> Anyway, local TCP stack uses 32KB page fragments, so typical skb has no
> more than 3 frags.
>
> Look at ndo_features_check(), where the problematic device driver can
> add its logic.
>
>
>
I've had a look at ndo_features_check and understand that I could supply 
my own
version of the routine, but I wasn't able to figure out how that would 
solve my problem.
As far as I can see the routine is not called in the part of code 
handling scatter/gather.
Could you help out with more info?


Hans

^ permalink raw reply	[flat|nested] 54+ messages in thread

* RE: [PATCH] net: add per device sg_max_frags for skb
  2016-01-08  9:55     ` Hans Westgaard Ry
@ 2016-01-08 10:33       ` David Laight
  -1 siblings, 0 replies; 54+ messages in thread
From: David Laight @ 2016-01-08 10:33 UTC (permalink / raw)
  To: 'Hans Westgaard Ry', David S. Miller
  Cc: Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Eric Dumazet,
	Daniel Borkmann, Nicolas Dichtel, Eric W. Biederman ,
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman,
	"hannes@stressinduktion.org",
	Edward Jee, Julia Lawall, netdev, linux-kernel, Haakon Bugge,
	Knut Omang, Wei Lin Guay, Santosh Shilimkar, Yuval Shaia

From: Hans Westgaard
> Sent: 08 January 2016 09:56
...
> >> The patch allows the device to limit the maximum number fragments used
> >> in one skb.
> >
> > This doesn't seem to me to be the correct way to fix this.
> > Anything that adds an extra fragment (in this case IPoIB) should allow
> > for the skb already having the maximum number of fragments.
> > Fully linearising the skb is overkill, but I think the first fragment
> > can be added to the linear part of the skb.
> >
> > 	David
> >
> >
> When IpoIB handles a skb-request it converts fragments to SGEs to
> be handled by a HCA.
> The problem arises when the HCA have a limited number of SGEs less than
> MAX_SKB_FRAGS.
> (it gets a little worse since IPoIB need to yet another segment)
> I have not found any easy way of fixing this with currenct codebase.

I think one of the xen ethernet interfaces had a similar problem.

Just reduce the number of fragments by copying two (or more) of them
into a single fragment.
In effect, anything that reduces the number of fragments will do a copy.

	David

^ permalink raw reply	[flat|nested] 54+ messages in thread

* RE: [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-08 10:33       ` David Laight
  0 siblings, 0 replies; 54+ messages in thread
From: David Laight @ 2016-01-08 10:33 UTC (permalink / raw)
  To: 'Hans Westgaard Ry', David S. Miller
  Cc: Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Eric Dumazet,
	Daniel Borkmann, Nicolas Dichtel, Eric W. Biederman ,
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman,
	"hannes@stressinduktion.org",
	Edward Jee, Julia Lawall, netdev, linux-kernel@vger.kernel.org

From: Hans Westgaard
> Sent: 08 January 2016 09:56
...
> >> The patch allows the device to limit the maximum number fragments used
> >> in one skb.
> >
> > This doesn't seem to me to be the correct way to fix this.
> > Anything that adds an extra fragment (in this case IPoIB) should allow
> > for the skb already having the maximum number of fragments.
> > Fully linearising the skb is overkill, but I think the first fragment
> > can be added to the linear part of the skb.
> >
> > 	David
> >
> >
> When IpoIB handles a skb-request it converts fragments to SGEs to
> be handled by a HCA.
> The problem arises when the HCA have a limited number of SGEs less than
> MAX_SKB_FRAGS.
> (it gets a little worse since IPoIB need to yet another segment)
> I have not found any easy way of fixing this with currenct codebase.

I think one of the xen ethernet interfaces had a similar problem.

Just reduce the number of fragments by copying two (or more) of them
into a single fragment.
In effect, anything that reduces the number of fragments will do a copy.

	David


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
  2016-01-08  9:55     ` Hans Westgaard Ry
@ 2016-01-08 11:47       ` Hannes Frederic Sowa
  -1 siblings, 0 replies; 54+ messages in thread
From: Hannes Frederic Sowa @ 2016-01-08 11:47 UTC (permalink / raw)
  To: Hans Westgaard Ry, David Laight, David S. Miller
  Cc: Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Eric Dumazet,
	Daniel Borkmann, Nicolas Dichtel, Eric W. Biederman ,
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman, Edward Jee,
	Julia Lawall, netdev, linux-kernel, Haakon Bugge, Knut Omang,
	Wei Lin Guay, Santosh Shilimkar, Yuval Shaia

On 08.01.2016 10:55, Hans Westgaard Ry wrote:
>
>
> On 01/06/2016 02:59 PM, David Laight wrote:
>> From: Hans Westgaard Ry
>>> Sent: 06 January 2016 13:16
>>> Devices may have limits on the number of fragments in an skb they
>>> support. Current codebase uses a constant as maximum for number of
>>> fragments (MAX_SKB_FRAGS) one skb can hold and use.
>>>
>>> When enabling scatter/gather and running traffic with many small
>>> messages the codebase uses the maximum number of fragments and thereby
>>> violates the max for certain devices.
>>>
>>> An example of such a violation is when running IPoIB on a HCA
>>> supporting 16 SGE on an architecture with 4K pagesize. The
>>> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
>>> segment we end up with send_requests with 18 SGE resulting in
>>> kernel-panic.
>>>
>>> The patch allows the device to limit the maximum number fragments used
>>> in one skb.
>> This doesn't seem to me to be the correct way to fix this.
>> Anything that adds an extra fragment (in this case IPoIB) should allow
>> for the skb already having the maximum number of fragments.
>> Fully linearising the skb is overkill, but I think the first fragment
>> can be added to the linear part of the skb.
>>
>>     David
>>
>>
> When IpoIB handles a skb-request it converts fragments to SGEs to
> be handled by a HCA.
> The problem arises when the HCA have a limited number of SGEs less than
> MAX_SKB_FRAGS.
> (it gets a little worse since IPoIB needs to add yet another segment)
> I have not found any easy way of fixing this with the current codebase.

I think because of the complex forwarding nature, a global counter which 
drivers can reduce during initialization time is the only solution I 
see right now without changing the layout of the skb later on.

Unfortunately this doesn't resolve the cases where virtual machines 
inject gso skbs, for those there still needs to be a slow path to do the 
reformatting of the skb. :/

Bye,
Hannes

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-08 11:47       ` Hannes Frederic Sowa
  0 siblings, 0 replies; 54+ messages in thread
From: Hannes Frederic Sowa @ 2016-01-08 11:47 UTC (permalink / raw)
  To: Hans Westgaard Ry, David Laight, David S. Miller
  Cc: Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Eric Dumazet,
	Daniel Borkmann, Nicolas Dichtel, Eric W. Biederman ,
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman, Edward Jee,
	Julia Lawall, netdev, linux-kernel, Haakon Bugge

On 08.01.2016 10:55, Hans Westgaard Ry wrote:
>
>
> On 01/06/2016 02:59 PM, David Laight wrote:
>> From: Hans Westgaard Ry
>>> Sent: 06 January 2016 13:16
>>> Devices may have limits on the number of fragments in an skb they
>>> support. Current codebase uses a constant as maximum for number of
>>> fragments (MAX_SKB_FRAGS) one skb can hold and use.
>>>
>>> When enabling scatter/gather and running traffic with many small
>>> messages the codebase uses the maximum number of fragments and thereby
>>> violates the max for certain devices.
>>>
>>> An example of such a violation is when running IPoIB on a HCA
>>> supporting 16 SGE on an architecture with 4K pagesize. The
>>> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
>>> segment we end up with send_requests with 18 SGE resulting in
>>> kernel-panic.
>>>
>>> The patch allows the device to limit the maximum number fragments used
>>> in one skb.
>> This doesn't seem to me to be the correct way to fix this.
>> Anything that adds an extra fragment (in this case IPoIB) should allow
>> for the skb already having the maximum number of fragments.
>> Fully linearising the skb is overkill, but I think the first fragment
>> can be added to the linear part of the skb.
>>
>>     David
>>
>>
> When IpoIB handles a skb-request it converts fragments to SGEs to
> be handled by a HCA.
> The problem arises when the HCA have a limited number of SGEs less than
> MAX_SKB_FRAGS.
> (it gets a little worse since IPoIB needs to add yet another segment)
> I have not found any easy way of fixing this with the current codebase.

I think because of the complex forwarding nature, a global counter which 
drivers can reduce during initialization time is the only solution I 
see right now without changing the layout of the skb later on.

Unfortunately this doesn't resolve the cases where virtual machines 
inject gso skbs, for those there still needs to be a slow path to do the 
reformatting of the skb. :/

Bye,
Hannes

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
  2016-01-08 11:47       ` Hannes Frederic Sowa
@ 2016-01-13 13:57         ` Hans Westgaard Ry
  -1 siblings, 0 replies; 54+ messages in thread
From: Hans Westgaard Ry @ 2016-01-13 13:57 UTC (permalink / raw)
  To: Hannes Frederic Sowa, David Laight, David S. Miller
  Cc: Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Eric Dumazet,
	Daniel Borkmann, Nicolas Dichtel, Eric W. Biederman ,
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman, Edward Jee,
	Julia Lawall, netdev, linux-kernel, Haakon Bugge, Knut Omang,
	Wei Lin Guay, Santosh Shilimkar, Yuval Shaia



On 01/08/2016 12:47 PM, Hannes Frederic Sowa wrote:
> On 08.01.2016 10:55, Hans Westgaard Ry wrote:
>>
>>
>> On 01/06/2016 02:59 PM, David Laight wrote:
>>> From: Hans Westgaard Ry
>>>> Sent: 06 January 2016 13:16
>>>> Devices may have limits on the number of fragments in an skb they
>>>> support. Current codebase uses a constant as maximum for number of
>>>> fragments (MAX_SKB_FRAGS) one skb can hold and use.
>>>>
>>>> When enabling scatter/gather and running traffic with many small
>>>> messages the codebase uses the maximum number of fragments and thereby
>>>> violates the max for certain devices.
>>>>
>>>> An example of such a violation is when running IPoIB on a HCA
>>>> supporting 16 SGE on an architecture with 4K pagesize. The
>>>> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
>>>> segment we end up with send_requests with 18 SGE resulting in
>>>> kernel-panic.
>>>>
>>>> The patch allows the device to limit the maximum number fragments used
>>>> in one skb.
>>> This doesn't seem to me to be the correct way to fix this.
>>> Anything that adds an extra fragment (in this case IPoIB) should allow
>>> for the skb already having the maximum number of fragments.
>>> Fully linearising the skb is overkill, but I think the first fragment
>>> can be added to the linear part of the skb.
>>>
>>>     David
>>>
>>>
>> When IpoIB handles a skb-request it converts fragments to SGEs to
>> be handled by a HCA.
>> The problem arises when the HCA have a limited number of SGEs less than
>> MAX_SKB_FRAGS.
>> (it gets a little worse since IPoIB needs to add yet another segment)
>> I have not found any easy way of fixing this with the current codebase.
>
> I think because of the complex forwarding nature, a global counter 
> which driver's can reduce during initialization time is the only 
> solution I see right now without changing the layout of the skb later on.
>
> Unfortunately this doesn't resolve the cases were virtual machines 
> inject gso skbs, for those there still needs to be a slow path to do 
> the reformatting of the skb. :/
>
> Bye,
> Hannes
>
>
The use-case for this patch is an application which sends many small 
messages, by write(2) on a TCP socket which has Nagle enabled. A 
scatter-gather capable NIC (potentially also supporting tso) will then 
be asked to send an skb containing up to MAX_SKB_FRAGS worth of 
fragments (17 considering a 4kb page size, hypothetically 65 considering 
an arch supporting 1kb page size).

Now, if the NIC hardware supports less _gather-fragments_, said hardware 
must run with scatter-gather disabled - or - the NIC driver has to 
implement a partial linearization of the skb to reduce #frags to what 
the hardware supports. The latter is far from elegant, and must be 
implemented in all NIC drivers which have this restriction.

This patch provides the flexibility to choose the maximum number of 
fragments that can be passed down to the NIC in order to
utilize the NIC SG hardware features.


In our view we are discussing two different issues:

    1. Is it reasonable that a NIC can restrict #frags in an skb when 
transmitting?
    2. If yes to the above, how is this implemented the best possible way.

Thanks a lot for feedback on the implementation from David Laight, Eric 
Dumazet and Hannes Frederic Sowa.

What do you think?

        Hans

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-13 13:57         ` Hans Westgaard Ry
  0 siblings, 0 replies; 54+ messages in thread
From: Hans Westgaard Ry @ 2016-01-13 13:57 UTC (permalink / raw)
  To: Hannes Frederic Sowa, David Laight, David S. Miller
  Cc: Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Eric Dumazet,
	Daniel Borkmann, Nicolas Dichtel, Eric W. Biederman ,
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman, Edward Jee,
	Julia Lawall, netdev, linux-kernel, Haakon Bugge



On 01/08/2016 12:47 PM, Hannes Frederic Sowa wrote:
> On 08.01.2016 10:55, Hans Westgaard Ry wrote:
>>
>>
>> On 01/06/2016 02:59 PM, David Laight wrote:
>>> From: Hans Westgaard Ry
>>>> Sent: 06 January 2016 13:16
>>>> Devices may have limits on the number of fragments in an skb they
>>>> support. Current codebase uses a constant as maximum for number of
>>>> fragments (MAX_SKB_FRAGS) one skb can hold and use.
>>>>
>>>> When enabling scatter/gather and running traffic with many small
>>>> messages the codebase uses the maximum number of fragments and thereby
>>>> violates the max for certain devices.
>>>>
>>>> An example of such a violation is when running IPoIB on a HCA
>>>> supporting 16 SGE on an architecture with 4K pagesize. The
>>>> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
>>>> segment we end up with send_requests with 18 SGE resulting in
>>>> kernel-panic.
>>>>
>>>> The patch allows the device to limit the maximum number fragments used
>>>> in one skb.
>>> This doesn't seem to me to be the correct way to fix this.
>>> Anything that adds an extra fragment (in this case IPoIB) should allow
>>> for the skb already having the maximum number of fragments.
>>> Fully linearising the skb is overkill, but I think the first fragment
>>> can be added to the linear part of the skb.
>>>
>>>     David
>>>
>>>
>> When IpoIB handles a skb-request it converts fragments to SGEs to
>> be handled by a HCA.
>> The problem arises when the HCA have a limited number of SGEs less than
>> MAX_SKB_FRAGS.
>> (it gets a little worse since IPoIB needs to add yet another segment)
>> I have not found any easy way of fixing this with the current codebase.
>
> I think because of the complex forwarding nature, a global counter 
> which driver's can reduce during initialization time is the only 
> solution I see right now without changing the layout of the skb later on.
>
> Unfortunately this doesn't resolve the cases were virtual machines 
> inject gso skbs, for those there still needs to be a slow path to do 
> the reformatting of the skb. :/
>
> Bye,
> Hannes
>
>
The use-case for this patch is an application which sends many small 
messages, by write(2) on a TCP socket which has Nagle enabled. A 
scatter-gather capable NIC (potentially also supporting tso) will then 
be asked to send an skb containing up to MAX_SKB_FRAGS worth of 
fragments (17 considering a 4kb page size, hypothetically 65 considering 
an arch supporting 1kb page size).

Now, if the NIC hardware supports less _gather-fragments_, said hardware 
must run with scatter-gather disabled - or - the NIC driver has to 
implement a partial linearization of the skb to reduce #frags to what 
the hardware supports. The latter is far from elegant, and must be 
implemented in all NIC drivers which have this restriction.

This patch provides the flexibility to choose the maximum number of 
fragments that can be passed down to the NIC in order to
utilize the NIC SG hardware features.


In our view we are discussing two different issues:

    1. Is it reasonable that a NIC can restrict #frags in an skb when 
transmitting?
    2. If yes to the above, how is this implemented the best possible way.

Thanks a lot for feedback on the implementation from David Laight, Eric 
Dumazet and Hannes Frederic Sowa.

What do you think?

        Hans

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
  2016-01-13 13:57         ` Hans Westgaard Ry
@ 2016-01-13 14:19           ` Eric Dumazet
  -1 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-01-13 14:19 UTC (permalink / raw)
  To: Hans Westgaard Ry
  Cc: Hannes Frederic Sowa, David Laight, David S. Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Daniel Borkmann,
	Nicolas Dichtel, Eric W. Biederman, Salam Noureddine,
	Jarod Wilson, Toshiaki Makita, Julian Anastasov, Ying Xue,
	Craig Gallek, Mel Gorman, Edward Jee, Julia Lawall, netdev,
	linux-kernel, Haakon Bugge, Knut Omang, Wei Lin Guay,
	Santosh Shilimkar, Yuval Shaia

1) There are no arch with 1K page sizes. Most certainly, if we had
MAX_SKB_FRAGS=65 some assumptions in the stack would fail.

2) TCP stack has coalescing support. write(2) or sendmsg(2) should
append data into the last skb in write queue, and still use 32 KB
frags.
    You get pathological skb when using sendpage() or when one thread
writes data into _multiple_ TCP sockets, since TCP stack uses
    a per thread 32 KB reserve (
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=5640f7685831e088fe6c2e1f863a6805962f8e81
)

2) As I said, implementing a limit in TCP stack is not enough. Your
patch is therefore adding complexity for all users, but is not a
general solution.

   GRO, tun device, many things can still cook 'big skbs'

    You need to properly implement a fallback, possibly using
ndo_features_check(), or directly from your ndo_start_xmit()

3) We currently have a very dumb way to fallback, forcing a linearize
call, likely to fail if memory is fragmented and skb big.

    You could instead provide a smart helper, trying to reduce the
number of frags in a skb by choosing adjacent frags and
re-allocating/merging them.

    By choosing, I mean trying to pick smallest ones to minimize copy
cost, to get one skb with X less fragment. (X=1 in your case ?)

   I know for example that bnx2x could benefit from such a helper, as
it has a 13 frags limits.
   (bnx2x_pkt_req_lin(), called from bnx2x ndo_start_xmit()


On Wed, Jan 13, 2016 at 5:57 AM, Hans Westgaard Ry
<hans.westgaard.ry@oracle.com> wrote:
>
>
> On 01/08/2016 12:47 PM, Hannes Frederic Sowa wrote:
>>
>> On 08.01.2016 10:55, Hans Westgaard Ry wrote:
>>>
>>>
>>>
>>> On 01/06/2016 02:59 PM, David Laight wrote:
>>>>
>>>> From: Hans Westgaard Ry
>>>>>
>>>>> Sent: 06 January 2016 13:16
>>>>> Devices may have limits on the number of fragments in an skb they
>>>>> support. Current codebase uses a constant as maximum for number of
>>>>> fragments (MAX_SKB_FRAGS) one skb can hold and use.
>>>>>
>>>>> When enabling scatter/gather and running traffic with many small
>>>>> messages the codebase uses the maximum number of fragments and thereby
>>>>> violates the max for certain devices.
>>>>>
>>>>> An example of such a violation is when running IPoIB on a HCA
>>>>> supporting 16 SGE on an architecture with 4K pagesize. The
>>>>> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
>>>>> segment we end up with send_requests with 18 SGE resulting in
>>>>> kernel-panic.
>>>>>
>>>>> The patch allows the device to limit the maximum number fragments used
>>>>> in one skb.
>>>>
>>>> This doesn't seem to me to be the correct way to fix this.
>>>> Anything that adds an extra fragment (in this case IPoIB) should allow
>>>> for the skb already having the maximum number of fragments.
>>>> Fully linearising the skb is overkill, but I think the first fragment
>>>> can be added to the linear part of the skb.
>>>>
>>>>     David
>>>>
>>>>
>>> When IpoIB handles a skb-request it converts fragments to SGEs to
>>> be handled by a HCA.
>>> The problem arises when the HCA have a limited number of SGEs less than
>>> MAX_SKB_FRAGS.
>>> (it gets a little worse since IPoIB needs to add yet another segment)
>>> I have not found any easy way of fixing this with the current codebase.
>>
>>
>> I think because of the complex forwarding nature, a global counter which
>> driver's can reduce during initialization time is the only solution I see
>> right now without changing the layout of the skb later on.
>>
>> Unfortunately this doesn't resolve the cases were virtual machines inject
>> gso skbs, for those there still needs to be a slow path to do the
>> reformatting of the skb. :/
>>
>> Bye,
>> Hannes
>>
>>
> The use-case for this patch is an application which sends many small
> messages, by write(2) on a TCP socket which has Nagle enabled. A
> scatter-gather capable NIC (potentially also supporting tso) will then be
> asked to send an skb containing up to MAX_SKB_FRAGS worth of fragments (17
> considering a 4kb page size, hypothetically 65 considering an arch
> supporting 1kb page size).
>
> Now, if the NIC hardware supports less _gather-fragments_, said hardware
> must run with scatter-gather disabled - or - the NIC driver has to implement
> a partial linearization of the skb to reduce #frags to what the hardware
> supports. The latter is far from elegant, and must be implemented in all NIC
> drivers which have this restriction.
>
> This patch provides the flexibility to choose the maximum number of
> fragments that can be passed down to the NIC in order to
> utilize the NIC SG hardware features.
>
>
> In our view we are discussing two different issues:
>
>    1. Is it reasonable that a NIC can restrict #frags in an skb when
> transmitting?
>    2. If yes to the above, how is this implemented the best possible way.
>
> Thanks a lot for feedback on the implementation from David Laight, Eric
> Dumazet and Hannes Frederic Sowa.
>
> What do you think?
>
>        Hans
>

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-13 14:19           ` Eric Dumazet
  0 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-01-13 14:19 UTC (permalink / raw)
  To: Hans Westgaard Ry
  Cc: Hannes Frederic Sowa, David Laight, David S. Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Daniel Borkmann,
	Nicolas Dichtel, Eric W. Biederman, Salam Noureddine,
	Jarod Wilson, Toshiaki Makita, Julian Anastasov, Ying Xue,
	Craig Gallek, Mel Gorman, Edward Jee, Julia Lawall, netdev,

1) There are no arch with 1K page sizes. Most certainly, if we had
MAX_SKB_FRAGS=65 some assumptions in the stack would fail.

2) TCP stack has coalescing support. write(2) or sendmsg(2) should
append data into the last skb in write queue, and still use 32 KB
frags.
    You get pathological skb when using sendpage() or when one thread
writes data into _multiple_ TCP sockets, since TCP stack uses
    a per thread 32 KB reserve (
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=5640f7685831e088fe6c2e1f863a6805962f8e81
)

2) As I said, implementing a limit in TCP stack is not enough. Your
patch is therefore adding complexity for all users, but is not a
general solution.

   GRO, tun device, many things can still cook 'big skbs'

    You need to properly implement a fallback, possibly using
ndo_features_check(), or directly from your ndo_start_xmit()

3) We currently have a very dumb way to fallback, forcing a linearize
call, likely to fail if memory is fragmented and skb big.

    You could instead provide a smart helper, trying to reduce the
number of frags in a skb by choosing adjacent frags and
re-allocating/merging them.

    By choosing, I mean trying to pick smallest ones to minimize copy
cost, to get one skb with X less fragment. (X=1 in your case ?)

   I know for example that bnx2x could benefit from such a helper, as
it has a 13 frags limits.
   (bnx2x_pkt_req_lin(), called from bnx2x ndo_start_xmit()


On Wed, Jan 13, 2016 at 5:57 AM, Hans Westgaard Ry
<hans.westgaard.ry@oracle.com> wrote:
>
>
> On 01/08/2016 12:47 PM, Hannes Frederic Sowa wrote:
>>
>> On 08.01.2016 10:55, Hans Westgaard Ry wrote:
>>>
>>>
>>>
>>> On 01/06/2016 02:59 PM, David Laight wrote:
>>>>
>>>> From: Hans Westgaard Ry
>>>>>
>>>>> Sent: 06 January 2016 13:16
>>>>> Devices may have limits on the number of fragments in an skb they
>>>>> support. Current codebase uses a constant as maximum for number of
>>>>> fragments (MAX_SKB_FRAGS) one skb can hold and use.
>>>>>
>>>>> When enabling scatter/gather and running traffic with many small
>>>>> messages the codebase uses the maximum number of fragments and thereby
>>>>> violates the max for certain devices.
>>>>>
>>>>> An example of such a violation is when running IPoIB on a HCA
>>>>> supporting 16 SGE on an architecture with 4K pagesize. The
>>>>> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
>>>>> segment we end up with send_requests with 18 SGE resulting in
>>>>> kernel-panic.
>>>>>
>>>>> The patch allows the device to limit the maximum number fragments used
>>>>> in one skb.
>>>>
>>>> This doesn't seem to me to be the correct way to fix this.
>>>> Anything that adds an extra fragment (in this case IPoIB) should allow
>>>> for the skb already having the maximum number of fragments.
>>>> Fully linearising the skb is overkill, but I think the first fragment
>>>> can be added to the linear part of the skb.
>>>>
>>>>     David
>>>>
>>>>
>>> When IpoIB handles a skb-request it converts fragments to SGEs to
>>> be handled by a HCA.
>>> The problem arises when the HCA have a limited number of SGEs less than
>>> MAX_SKB_FRAGS.
>>> (it gets a little worse since IPoIB needs to add yet another segment)
>>> I have not found any easy way of fixing this with the current codebase.
>>
>>
>> I think because of the complex forwarding nature, a global counter which
>> driver's can reduce during initialization time is the only solution I see
>> right now without changing the layout of the skb later on.
>>
>> Unfortunately this doesn't resolve the cases were virtual machines inject
>> gso skbs, for those there still needs to be a slow path to do the
>> reformatting of the skb. :/
>>
>> Bye,
>> Hannes
>>
>>
> The use-case for this patch is an application which sends many small
> messages, by write(2) on a TCP socket which has Nagle enabled. A
> scatter-gather capable NIC (potentially also supporting tso) will then be
> asked to send an skb containing up to MAX_SKB_FRAGS worth of fragments (17
> considering a 4kb page size, hypothetically 65 considering an arch
> supporting 1kb page size).
>
> Now, if the NIC hardware supports less _gather-fragments_, said hardware
> must run with scatter-gather disabled - or - the NIC driver has to implement
> a partial linearization of the skb to reduce #frags to what the hardware
> supports. The latter is far from elegant, and must be implemented in all NIC
> drivers which have this restriction.
>
> This patch provides the flexibility to choose the maximum number of
> fragments that can be passed down to the NIC in order to
> utilize the NIC SG hardware features.
>
>
> In our view we are discussing two different issues:
>
>    1. Is it reasonable that a NIC can restrict #frags in an skb when
> transmitting?
>    2. If yes to the above, how is this implemented the best possible way.
>
> Thanks a lot for feedback on the implementation from David Laight, Eric
> Dumazet and Hannes Frederic Sowa.
>
> What do you think?
>
>        Hans
>

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
  2016-01-13 14:19           ` Eric Dumazet
@ 2016-01-13 14:20             ` Eric Dumazet
  -1 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-01-13 14:20 UTC (permalink / raw)
  To: Hans Westgaard Ry
  Cc: Hannes Frederic Sowa, David Laight, David S. Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Daniel Borkmann,
	Nicolas Dichtel, Eric W. Biederman, Salam Noureddine,
	Jarod Wilson, Toshiaki Makita, Julian Anastasov, Ying Xue,
	Craig Gallek, Mel Gorman, Edward Jee, Julia Lawall, netdev,
	linux-kernel, Haakon Bugge, Knut Omang, Wei Lin Guay,
	Santosh Shilimkar, Yuval Shaia

Apologies from my prior top post, I used my corporate webmail instead
of my usual email client.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-13 14:20             ` Eric Dumazet
  0 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-01-13 14:20 UTC (permalink / raw)
  To: Hans Westgaard Ry
  Cc: Hannes Frederic Sowa, David Laight, David S. Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Daniel Borkmann,
	Nicolas Dichtel, Eric W. Biederman, Salam Noureddine,
	Jarod Wilson, Toshiaki Makita, Julian Anastasov, Ying Xue,
	Craig Gallek, Mel Gorman, Edward Jee, Julia Lawall, netdev,

Apologies from my prior top post, I used my corporate webmail instead
of my usual email client.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
  2016-01-13 14:19           ` Eric Dumazet
@ 2016-01-13 15:07             ` Hannes Frederic Sowa
  -1 siblings, 0 replies; 54+ messages in thread
From: Hannes Frederic Sowa @ 2016-01-13 15:07 UTC (permalink / raw)
  To: Eric Dumazet, Hans Westgaard Ry
  Cc: David Laight, David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Alexei Starovoitov,
	Jiri Pirko, Daniel Borkmann, Nicolas Dichtel, Eric W. Biederman,
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman, Edward Jee,
	Julia Lawall, netdev, linux-kernel, Haakon Bugge, Knut Omang,
	Wei Lin Guay, Santosh Shilimkar, Yuval Shaia

On 13.01.2016 15:19, Eric Dumazet wrote:
> 1) There are no arch with 1K page sizes. Most certainly, if we had
> MAX_SKB_FRAGS=65 some assumptions in the stack would fail.
>
> 2) TCP stack has coalescing support. write(2) or sendmsg(2) should
> append data into the last skb in write queue, and still use 32 KB
> frags.
>      You get pathological skb when using sendpage() or when one thread
> writes data into _multiple_ TCP sockets, since TCP stack uses
>      a per thread 32 KB reserve (
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=5640f7685831e088fe6c2e1f863a6805962f8e81
> )
>
> 2) As I said, implementing a limit in TCP stack is not enough. Your
> patch is therefore adding complexity for all users, but is not a
> general solution.
>
>     GRO, tun device, many things can still cook 'big skbs'
>
>      You need to properly implement a fallback, possibly using
> ndo_features_check(), or directly from your ndo_start_xmit()
>
> 3) We currently have a very dumb way to fallback, forcing a linearize
> call, likely to fail if memory is fragmented and skb big.
>
>      You could instead provide a smart helper, trying to reduce the
> number of frags in a skb by choosing adjacent frags and
> re-allocating/merging them.
>
>      By choosing, I mean trying to pick smallest ones to minimize copy
> cost, to get one skb with X less fragment. (X=1 in your case ?)
>
>     I know for example that bnx2x could benefit from such a helper, as
> it has a 13 frags limits.
>     (bnx2x_pkt_req_lin(), called from bnx2x ndo_start_xmit()

As I proposed, we could globally (or per netns) limit the maximum, I 
think this would be okay and could be the best alternative to install 
slow-paths which could be hit quite constantly.

Otherwise, the fallbacks like Eric proposed them are needed. I do not 
see any other choice.

Thanks,
Hannes

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-13 15:07             ` Hannes Frederic Sowa
  0 siblings, 0 replies; 54+ messages in thread
From: Hannes Frederic Sowa @ 2016-01-13 15:07 UTC (permalink / raw)
  To: Eric Dumazet, Hans Westgaard Ry
  Cc: David Laight, David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Alexei Starovoitov,
	Jiri Pirko, Daniel Borkmann, Nicolas Dichtel, Eric W. Biederman,
	Salam Noureddine, Jarod Wilson, Toshiaki Makita,
	Julian Anastasov, Ying Xue, Craig Gallek, Mel Gorman, Edward Jee,
	Julia Lawall, netdev, linux-kernel@vger.kernel.org

On 13.01.2016 15:19, Eric Dumazet wrote:
> 1) There are no arch with 1K page sizes. Most certainly, if we had
> MAX_SKB_FRAGS=65 some assumptions in the stack would fail.
>
> 2) TCP stack has coalescing support. write(2) or sendmsg(2) should
> append data into the last skb in write queue, and still use 32 KB
> frags.
>      You get pathological skb when using sendpage() or when one thread
> writes data into _multiple_ TCP sockets, since TCP stack uses
>      a per thread 32 KB reserve (
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=5640f7685831e088fe6c2e1f863a6805962f8e81
> )
>
> 2) As I said, implementing a limit in TCP stack is not enough. Your
> patch is therefore adding complexity for all users, but is not a
> general solution.
>
>     GRO, tun device, many things can still cook 'big skbs'
>
>      You need to properly implement a fallback, possibly using
> ndo_features_check(), or directly from your ndo_start_xmit()
>
> 3) We currently have a very dumb way to fallback, forcing a linearize
> call, likely to fail if memory is fragmented and skb big.
>
>      You could instead provide a smart helper, trying to reduce the
> number of frags in a skb by choosing adjacent frags and
> re-allocating/merging them.
>
>      By choosing, I mean trying to pick smallest ones to minimize copy
> cost, to get one skb with X less fragment. (X=1 in your case ?)
>
>     I know for example that bnx2x could benefit from such a helper, as
> it has a 13 frags limits.
>     (bnx2x_pkt_req_lin(), called from bnx2x ndo_start_xmit()

As I proposed, we could globally (or per netns) limit the maximum, I 
think this would be okay and could be the best alternative to install 
slow-paths which could be hit quite constantly.

Otherwise, the fallbacks like Eric proposed them are needed. I do not 
see any other choice.

Thanks,
Hannes

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
  2016-01-13 14:19           ` Eric Dumazet
                             ` (2 preceding siblings ...)
  (?)
@ 2016-01-13 15:38           ` David Miller
  2016-01-13 15:44               ` Eric Dumazet
  -1 siblings, 1 reply; 54+ messages in thread
From: David Miller @ 2016-01-13 15:38 UTC (permalink / raw)
  To: edumazet
  Cc: hans.westgaard.ry, hannes, David.Laight, kuznet, jmorris,
	yoshfuji, kaber, ast, jiri, daniel, nicolas.dichtel, ebiederm,
	noureddine, jarod, makita.toshiaki, ja, ying.xue, kraig, mgorman,
	edjee, julia.lawall, netdev, linux-kernel, haakon.bugge,
	knut.omang, wei.lin.guay, santosh.shilimkar, yuval.shaia

From: Eric Dumazet <edumazet@google.com>
Date: Wed, 13 Jan 2016 06:19:11 -0800

> 2) TCP stack has coalescing support. write(2) or sendmsg(2) should
> append data into the last skb in write queue, and still use 32 KB
> frags.

Another way to get pathological SKBs is to do lots of tiny sendpage()
calls over discontiguous areas of the file.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
  2016-01-13 15:38           ` David Miller
@ 2016-01-13 15:44               ` Eric Dumazet
  0 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-01-13 15:44 UTC (permalink / raw)
  To: David Miller
  Cc: Hans Westgaard Ry, Hannes Frederic Sowa, David Laight,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Daniel Borkmann,
	Nicolas Dichtel, Eric W. Biederman, Salam Noureddine,
	Jarod Wilson, Toshiaki Makita, Julian Anastasov, Ying Xue,
	Craig Gallek, Mel Gorman, Edward Jee, Julia Lawall, netdev, LKML,
	Haakon Bugge, Knut Omang, Wei Lin Guay, Santosh Shilimkar,
	Yuval Shaia

On Wed, Jan 13, 2016 at 7:38 AM, David Miller <davem@davemloft.net> wrote:
> From: Eric Dumazet <edumazet@google.com>
> Date: Wed, 13 Jan 2016 06:19:11 -0800
>
>> 2) TCP stack has coalescing support. write(2) or sendmsg(2) should
>> append data into the last skb in write queue, and still use 32 KB
>> frags.
>
> Another way to get pathological SKBs is to do lots of tiny sendpage()
> calls over discontiguous areas of the file.

Yes, this was what I mentioned in the following sentence.
"You get pathological skb when using sendpage() or ..."

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-13 15:44               ` Eric Dumazet
  0 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-01-13 15:44 UTC (permalink / raw)
  To: David Miller
  Cc: Hans Westgaard Ry, Hannes Frederic Sowa, David Laight,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Daniel Borkmann,
	Nicolas Dichtel, Eric W. Biederman, Salam Noureddine,
	Jarod Wilson, Toshiaki Makita, Julian Anastasov, Ying Xue,
	Craig Gallek, Mel Gorman, Edward Jee, Julia Lawall, netdev, LKML

On Wed, Jan 13, 2016 at 7:38 AM, David Miller <davem@davemloft.net> wrote:
> From: Eric Dumazet <edumazet@google.com>
> Date: Wed, 13 Jan 2016 06:19:11 -0800
>
>> 2) TCP stack has coalescing support. write(2) or sendmsg(2) should
>> append data into the last skb in write queue, and still use 32 KB
>> frags.
>
> Another way to get pathological SKBs is to do lots of tiny sendpage()
> calls over discontiguous areas of the file.

Yes, this was what I mentioned in the following sentence.
"You get pathological skb when using sendpage() or ..."

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
  2016-01-13 13:57         ` Hans Westgaard Ry
@ 2016-01-13 21:07           ` Eric W. Biederman
  -1 siblings, 0 replies; 54+ messages in thread
From: Eric W. Biederman @ 2016-01-13 21:07 UTC (permalink / raw)
  To: Hans Westgaard Ry
  Cc: Hannes Frederic Sowa, David Laight, David S. Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Eric Dumazet,
	Daniel Borkmann, Nicolas Dichtel, Salam Noureddine, Jarod Wilson,
	Toshiaki Makita, Julian Anastasov, Ying Xue, Craig Gallek,
	Mel Gorman, Edward Jee, Julia Lawall, netdev, linux-kernel,
	Haakon Bugge, Knut Omang, Wei Lin Guay, Santosh Shilimkar,
	Yuval Shaia

Hans Westgaard Ry <hans.westgaard.ry@oracle.com> writes:

> On 01/08/2016 12:47 PM, Hannes Frederic Sowa wrote:
>> On 08.01.2016 10:55, Hans Westgaard Ry wrote:
>>>
>>>
>>> On 01/06/2016 02:59 PM, David Laight wrote:
>>>> From: Hans Westgaard Ry
>>>>> Sent: 06 January 2016 13:16
>>>>> Devices may have limits on the number of fragments in an skb they
>>>>> support. Current codebase uses a constant as maximum for number of
>>>>> fragments (MAX_SKB_FRAGS) one skb can hold and use.
>>>>>
>>>>> When enabling scatter/gather and running traffic with many small
>>>>> messages the codebase uses the maximum number of fragments and thereby
>>>>> violates the max for certain devices.
>>>>>
>>>>> An example of such a violation is when running IPoIB on a HCA
>>>>> supporting 16 SGE on an architecture with 4K pagesize. The
>>>>> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
>>>>> segment we end up with send_requests with 18 SGE resulting in
>>>>> kernel-panic.
>>>>>
>>>>> The patch allows the device to limit the maximum number fragments used
>>>>> in one skb.
>>>> This doesn't seem to me to be the correct way to fix this.
>>>> Anything that adds an extra fragment (in this case IPoIB) should allow
>>>> for the skb already having the maximum number of fragments.
>>>> Fully linearising the skb is overkill, but I think the first fragment
>>>> can be added to the linear part of the skb.
>>>>
>>>>     David
>>>>
>>>>
>>> When IpoIB handles a skb-request it converts fragments to SGEs to
>>> be handled by a HCA.
>>> The problem arises when the HCA have a limited number of SGEs less than
>>> MAX_SKB_FRAGS.
>>> (it gets a little worse since IPoIB needs yet another segment)
>>> I have not found any easy way of fixing this with current codebase.
>>
>> I think because of the complex forwarding nature, a global counter which
>> drivers can reduce during initialization time is the only solution I see
>> right now without changing the layout of the skb later on.
>>
>> Unfortunately this doesn't resolve the cases where virtual machines inject gso
>> skbs, for those there still needs to be a slow path to do the reformatting of
>> the skb. :/
>>
>> Bye,
>> Hannes
>>
>>
> The use-case for this patch is an application which sends many small messages,
> by write(2) on a TCP socket which has Nagle enabled. A scatter-gather capable
> NIC (potentially also supporting tso) will then be asked to send an skb
> containing up to MAX_SKB_FRAGS worth of fragments (17 considering a 4kb page
> size, hypothetically 65 considering an arch supporting 1kb page size).
>
> Now, if the NIC hardware supports less _gather-fragments_, said hardware must
> run with scatter-gather disabled - or - the NIC driver has to implement a
> partial linearization of the skb to reduce #frags to what the hardware
> supports. The latter is far from elegant, and must be implemented in all NIC
> drivers which have this restriction.
>
> This patch provides the flexibility to choose the maximum number of fragments
> that can be passed down to the NIC in order to
> utilize the NIC SG hardware features.
>
>
> In our view we are discussing two different issues:
>
>    1. Is it reasonable that a NIC can restrict #frags in an skb when
> transmitting?
>    2. If yes to the above, how is this implemented the best possible way.
>
> Thanks a lot for feedback on the implementation from David Laight, Eric Dumazet
> and Hannes Frederic Sowa.
>
> What do you think?

*Scratches my head*  Why doesn't someone fix the infiniband firmware so
that it supports more scatter gather entries?

Last I looked everything like this in infiniband was all implemented in
firmware and there is only one vendor to pick on, so it should be
comparatively easy to just fix the hardware so it does not have this
limitation.

Eric

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH] net: add per device sg_max_frags for skb
@ 2016-01-13 21:07           ` Eric W. Biederman
  0 siblings, 0 replies; 54+ messages in thread
From: Eric W. Biederman @ 2016-01-13 21:07 UTC (permalink / raw)
  To: Hans Westgaard Ry
  Cc: Hannes Frederic Sowa, David Laight, David S. Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Alexei Starovoitov, Jiri Pirko, Eric Dumazet,
	Daniel Borkmann, Nicolas Dichtel, Salam Noureddine, Jarod Wilson,
	Toshiaki Makita, Julian Anastasov, Ying Xue, Craig Gallek,
	Mel Gorman, Edward Jee, Julia Lawall, netdev,

Hans Westgaard Ry <hans.westgaard.ry@oracle.com> writes:

> On 01/08/2016 12:47 PM, Hannes Frederic Sowa wrote:
>> On 08.01.2016 10:55, Hans Westgaard Ry wrote:
>>>
>>>
>>> On 01/06/2016 02:59 PM, David Laight wrote:
>>>> From: Hans Westgaard Ry
>>>>> Sent: 06 January 2016 13:16
>>>>> Devices may have limits on the number of fragments in an skb they
>>>>> support. Current codebase uses a constant as maximum for number of
>>>>> fragments (MAX_SKB_FRAGS) one skb can hold and use.
>>>>>
>>>>> When enabling scatter/gather and running traffic with many small
>>>>> messages the codebase uses the maximum number of fragments and thereby
>>>>> violates the max for certain devices.
>>>>>
>>>>> An example of such a violation is when running IPoIB on a HCA
>>>>> supporting 16 SGE on an architecture with 4K pagesize. The
>>>>> MAX_SKB_FRAGS will be 17 (64K/4K+1) and because IPoIB adds yet another
>>>>> segment we end up with send_requests with 18 SGE resulting in
>>>>> kernel-panic.
>>>>>
>>>>> The patch allows the device to limit the maximum number fragments used
>>>>> in one skb.
>>>> This doesn't seem to me to be the correct way to fix this.
>>>> Anything that adds an extra fragment (in this case IPoIB) should allow
>>>> for the skb already having the maximum number of fragments.
>>>> Fully linearising the skb is overkill, but I think the first fragment
>>>> can be added to the linear part of the skb.
>>>>
>>>>     David
>>>>
>>>>
>>> When IpoIB handles a skb-request it converts fragments to SGEs to
>>> be handled by a HCA.
>>> The problem arises when the HCA have a limited number of SGEs less than
>>> MAX_SKB_FRAGS.
>>> (it gets a little worse since IPoIB needs yet another segment)
>>> I have not found any easy way of fixing this with current codebase.
>>
>> I think because of the complex forwarding nature, a global counter which
>> drivers can reduce during initialization time is the only solution I see
>> right now without changing the layout of the skb later on.
>>
>> Unfortunately this doesn't resolve the cases where virtual machines inject gso
>> skbs, for those there still needs to be a slow path to do the reformatting of
>> the skb. :/
>>
>> Bye,
>> Hannes
>>
>>
> The use-case for this patch is an application which sends many small messages,
> by write(2) on a TCP socket which has Nagle enabled. A scatter-gather capable
> NIC (potentially also supporting tso) will then be asked to send an skb
> containing up to MAX_SKB_FRAGS worth of fragments (17 considering a 4kb page
> size, hypothetically 65 considering an arch supporting 1kb page size).
>
> Now, if the NIC hardware supports less _gather-fragments_, said hardware must
> run with scatter-gather disabled - or - the NIC driver has to implement a
> partial linearization of the skb to reduce #frags to what the hardware
> supports. The latter is far from elegant, and must be implemented in all NIC
> drivers which have this restriction.
>
> This patch provides the flexibility to choose the maximum number of fragments
> that can be passed down to the NIC in order to
> utilize the NIC SG hardware features.
>
>
> In our view we are discussing two different issues:
>
>    1. Is it reasonable that a NIC can restrict #frags in an skb when
> transmitting?
>    2. If yes to the above, how is this implemented the best possible way.
>
> Thanks a lot for feedback on the implementation from David Laight, Eric Dumazet
> and Hannes Frederic Sowa.
>
> What do you think?

*Scratches my head*  Why doesn't someone fix the infiniband firmware so
that it supports more scatter gather entries?

Last I looked everything like this in infiniband was all implemented in
firmware and there is only one vendor to pick on, so it should be
comparatively easy to just fix the hardware so it does not have this
limitation.

Eric

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH v2] net:Add sysctl_tcp_sg_max_skb_frags
  2016-01-08  9:55     ` Hans Westgaard Ry
                       ` (2 preceding siblings ...)
  (?)
@ 2016-01-27 13:20     ` Hans Westgaard Ry
  2016-01-27 15:15       ` Eric Dumazet
  2016-01-27 20:13       ` David Miller
  -1 siblings, 2 replies; 54+ messages in thread
From: Hans Westgaard Ry @ 2016-01-27 13:20 UTC (permalink / raw)
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy,
	"hannes@stressinduktion.org",
	open list:NETWORKING [GENERAL],
	open list, Haakon Bugge

Devices may have limits on the number of fragments in an skb they support.
Current codebase uses a constant as maximum for number of fragments one
skb can hold and use.
When enabling scatter/gather and running traffic with many small messages
the codebase uses the maximum number of fragments and may thereby violate
the max for certain devices.
The patch introduces a global variable as max number of fragments in
scatter/gather.

Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>

---
 include/net/tcp.h          |  2 ++
 net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++
 net/ipv4/tcp.c             |  8 +++++---
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f80e74c..8d18df3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -285,6 +285,8 @@ extern int sysctl_tcp_invalid_ratelimit;
 extern int sysctl_tcp_pacing_ss_ratio;
 extern int sysctl_tcp_pacing_ca_ratio;
 
+extern int sysctl_tcp_sg_max_skb_frags;
+
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
 extern int tcp_memory_pressure;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a0bd7a5..498dcf9 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -40,6 +40,7 @@ static int ip_ttl_min = 1;
 static int ip_ttl_max = 255;
 static int tcp_syn_retries_min = 1;
 static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
+static int tcp_sg_max_skb_frags = MAX_SKB_FRAGS;
 static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 
@@ -761,6 +762,15 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_ms_jiffies,
 	},
 	{
+		.procname	= "tcp_sg_max_skb_frags",
+		.data		= &sysctl_tcp_sg_max_skb_frags,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &tcp_sg_max_skb_frags,
+	},
+	{
 		.procname	= "icmp_msgs_per_sec",
 		.data		= &sysctl_icmp_msgs_per_sec,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c82cca1..928d2c7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -300,7 +300,9 @@ EXPORT_SYMBOL(sysctl_tcp_wmem);
 
 atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
-
+/* maximum number of fragments for tcp scatter/gather */
+int sysctl_tcp_sg_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
+EXPORT_SYMBOL(sysctl_tcp_sg_max_skb_frags);
 /*
  * Current number of TCP sockets.
  */
@@ -938,7 +940,7 @@ new_segment:
 
 		i = skb_shinfo(skb)->nr_frags;
 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
-		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+		if (!can_coalesce && i >= sysctl_tcp_sg_max_skb_frags) {
 			tcp_mark_push(tp, skb);
 			goto new_segment;
 		}
@@ -1211,7 +1213,7 @@ new_segment:
 
 			if (!skb_can_coalesce(skb, i, pfrag->page,
 					      pfrag->offset)) {
-				if (i == MAX_SKB_FRAGS || !sg) {
+				if (i == sysctl_tcp_sg_max_skb_frags || !sg) {
 					tcp_mark_push(tp, skb);
 					goto new_segment;
 				}
-- 
2.4.3

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v2] net:Add sysctl_tcp_sg_max_skb_frags
  2016-01-27 13:20     ` [PATCH v2] net:Add sysctl_tcp_sg_max_skb_frags Hans Westgaard Ry
@ 2016-01-27 15:15       ` Eric Dumazet
  2016-01-27 18:12         ` Hannes Frederic Sowa
  2016-01-27 20:13       ` David Miller
  1 sibling, 1 reply; 54+ messages in thread
From: Eric Dumazet @ 2016-01-27 15:15 UTC (permalink / raw)
  To: Hans Westgaard Ry
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy,
	"hannes@stressinduktion.org",
	open list:NETWORKING [GENERAL],
	open list, Haakon Bugge

On Wed, 2016-01-27 at 14:20 +0100, Hans Westgaard Ry wrote:
> Devices may have limits on the number of fragments in an skb they support.
> Current codebase uses a constant as maximum for number of fragments one
> skb can hold and use.
> When enabling scatter/gather and running traffic with many small messages
> the codebase uses the maximum number of fragments and may thereby violate
> the max for certain devices.
> The patch introduces a global variable as max number of fragments in
> scatter/gather.


Principle looks good, but we have to ask if other skb providers [1] will
add other sysctl, or if we could share a common one ?

If it is a common one, it should be /proc/sys/net/core/... instead
of /proc/sys/net/ipv4/tcp_....



Other providers include :

1) GRO stack
2) callers of sock_alloc_send_pskb(), alloc_skb_with_frags(),
sock_alloc_send_skb() ..

Thanks !

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v2] net:Add sysctl_tcp_sg_max_skb_frags
  2016-01-27 15:15       ` Eric Dumazet
@ 2016-01-27 18:12         ` Hannes Frederic Sowa
  2016-02-01 13:12           ` Hans Westgaard Ry
  0 siblings, 1 reply; 54+ messages in thread
From: Hannes Frederic Sowa @ 2016-01-27 18:12 UTC (permalink / raw)
  To: Eric Dumazet, Hans Westgaard Ry
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy,
	open list:NETWORKING [GENERAL],
	open list, Haakon Bugge

On 27.01.2016 16:15, Eric Dumazet wrote:
> On Wed, 2016-01-27 at 14:20 +0100, Hans Westgaard Ry wrote:
>> Devices may have limits on the number of fragments in an skb they support.
>> Current codebase uses a constant as maximum for number of fragments one
>> skb can hold and use.
>> When enabling scatter/gather and running traffic with many small messages
>> the codebase uses the maximum number of fragments and may thereby violate
>> the max for certain devices.
>> The patch introduces a global variable as max number of fragments in
>> scatter/gather.
>
>
> Principle looks good, but we have to ask if other skb providers [1] will
> add other sysctl, or if we could share a common one ?
>
> If it is a common one, it should be /proc/sys/net/core/... instead
> of /proc/sys/net/ipv4/tcp_....
 >
> Other providers include :
>
> 1) GRO stack
> 2) callers of sock_alloc_send_pskb(), alloc_skb_with_frags(),
> sock_alloc_send_skb() ..

I agree, this knob should get a generic name and live in a generic net/ 
directory to control this globally, so things don't break during 
forwarding etc.

It does not solve the problem completely, e.g. when VMs send gso packets 
through a vhost-net onto IPoIB, no?

Thanks,
Hannes

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v2] net:Add sysctl_tcp_sg_max_skb_frags
  2016-01-27 13:20     ` [PATCH v2] net:Add sysctl_tcp_sg_max_skb_frags Hans Westgaard Ry
  2016-01-27 15:15       ` Eric Dumazet
@ 2016-01-27 20:13       ` David Miller
  1 sibling, 0 replies; 54+ messages in thread
From: David Miller @ 2016-01-27 20:13 UTC (permalink / raw)
  To: hans.westgaard.ry
  Cc: kuznet, jmorris, yoshfuji, kaber, hannes, netdev, linux-kernel,
	haakon.bugge

From: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
Date: Wed, 27 Jan 2016 14:20:29 +0100

> Devices may have limits on the number of fragments in an skb they support.

Then this belongs as a global networking setting not a protocol
specific one.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v2] net:Add sysctl_tcp_sg_max_skb_frags
  2016-01-27 18:12         ` Hannes Frederic Sowa
@ 2016-02-01 13:12           ` Hans Westgaard Ry
  0 siblings, 0 replies; 54+ messages in thread
From: Hans Westgaard Ry @ 2016-02-01 13:12 UTC (permalink / raw)
  To: Hannes Frederic Sowa, Eric Dumazet
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy,
	open list:NETWORKING [GENERAL],
	open list, Haakon Bugge



On 01/27/2016 07:12 PM, Hannes Frederic Sowa wrote:
> On 27.01.2016 16:15, Eric Dumazet wrote:
>>
>> If it is a common one, it should be /proc/sys/net/core/... instead
>> of /proc/sys/net/ipv4/tcp_....
> >
>> Other providers include :
>>
>> 1) GRO stack
>> 2) callers of sock_alloc_send_pskb(), alloc_skb_with_frags(),
>> sock_alloc_send_skb() ..
>
> I agree, this knob should get a generic name and live in a generic 
> net/ directory to control this globally, so things don't break during 
> forwarding etc.
>
> It does not solve the problem completely, e.g. when VMs send gso 
> packets through a vhost-net onto IPoIB, no?
>
> Thanks,
> Hannes
>
I have understood more of the problem Hannes raises and I realize that 
we need a slowpath in IPoIB to handle the problem
in question. I will make a new version of this patch with a more correct 
position/name of the sysctl-variable.
I'll also make a separate patch adding the slowpath in IPoIB.

Hans

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH v3] net:Add sysctl_max_skb_frags
  2016-01-08  9:55     ` Hans Westgaard Ry
@ 2016-02-03  8:26       ` Hans Westgaard Ry
  -1 siblings, 0 replies; 54+ messages in thread
From: Hans Westgaard Ry @ 2016-02-03  8:26 UTC (permalink / raw)
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list,
	open list:NETWORKING [GENERAL]

Devices may have limits on the number of fragments in an skb they support.
Current codebase uses a constant as maximum for number of fragments one
skb can hold and use.
When enabling scatter/gather and running traffic with many small messages
the codebase uses the maximum number of fragments and may thereby violate
the max for certain devices.
The patch introduces a global variable as max number of fragments.

Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>

---
 include/linux/skbuff.h     |  1 +
 net/core/skbuff.c          |  2 ++
 net/core/sysctl_net_core.c | 10 ++++++++++
 net/ipv4/tcp.c             |  4 ++--
 4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4355129..fe47ad3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -219,6 +219,7 @@ struct sk_buff;
 #else
 #define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
 #endif
+extern int sysctl_max_skb_frags;
 
 typedef struct skb_frag_struct skb_frag_t;
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 152b9c7..c336b97 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -79,6 +79,8 @@
 
 struct kmem_cache *skbuff_head_cache __read_mostly;
 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
+EXPORT_SYMBOL(sysctl_max_skb_frags);
 
 /**
  *	skb_panic - private function for out-of-line support
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 95b6139..a6beb7b 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -26,6 +26,7 @@ static int zero = 0;
 static int one = 1;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
+static int max_skb_frags = MAX_SKB_FRAGS;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
@@ -392,6 +393,15 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "max_skb_frags",
+		.data		= &sysctl_max_skb_frags,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &max_skb_frags,
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c82cca1..3dc7a2fd 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -938,7 +938,7 @@ new_segment:
 
 		i = skb_shinfo(skb)->nr_frags;
 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
-		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+		if (!can_coalesce && i >= sysctl_max_skb_frags) {
 			tcp_mark_push(tp, skb);
 			goto new_segment;
 		}
@@ -1211,7 +1211,7 @@ new_segment:
 
 			if (!skb_can_coalesce(skb, i, pfrag->page,
 					      pfrag->offset)) {
-				if (i == MAX_SKB_FRAGS || !sg) {
+				if (i == sysctl_max_skb_frags || !sg) {
 					tcp_mark_push(tp, skb);
 					goto new_segment;
 				}
-- 
2.4.3

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH v3] net:Add sysctl_max_skb_frags
@ 2016-02-03  8:26       ` Hans Westgaard Ry
  0 siblings, 0 replies; 54+ messages in thread
From: Hans Westgaard Ry @ 2016-02-03  8:26 UTC (permalink / raw)
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list

Devices may have limits on the number of fragments in an skb they support.
Current codebase uses a constant as maximum for number of fragments one
skb can hold and use.
When enabling scatter/gather and running traffic with many small messages
the codebase uses the maximum number of fragments and may thereby violate
the max for certain devices.
The patch introduces a global variable as max number of fragments.

Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>

---
 include/linux/skbuff.h     |  1 +
 net/core/skbuff.c          |  2 ++
 net/core/sysctl_net_core.c | 10 ++++++++++
 net/ipv4/tcp.c             |  4 ++--
 4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4355129..fe47ad3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -219,6 +219,7 @@ struct sk_buff;
 #else
 #define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
 #endif
+extern int sysctl_max_skb_frags;
 
 typedef struct skb_frag_struct skb_frag_t;
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 152b9c7..c336b97 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -79,6 +79,8 @@
 
 struct kmem_cache *skbuff_head_cache __read_mostly;
 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
+EXPORT_SYMBOL(sysctl_max_skb_frags);
 
 /**
  *	skb_panic - private function for out-of-line support
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 95b6139..a6beb7b 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -26,6 +26,7 @@ static int zero = 0;
 static int one = 1;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
+static int max_skb_frags = MAX_SKB_FRAGS;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
@@ -392,6 +393,15 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "max_skb_frags",
+		.data		= &sysctl_max_skb_frags,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &max_skb_frags,
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c82cca1..3dc7a2fd 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -938,7 +938,7 @@ new_segment:
 
 		i = skb_shinfo(skb)->nr_frags;
 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
-		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+		if (!can_coalesce && i >= sysctl_max_skb_frags) {
 			tcp_mark_push(tp, skb);
 			goto new_segment;
 		}
@@ -1211,7 +1211,7 @@ new_segment:
 
 			if (!skb_can_coalesce(skb, i, pfrag->page,
 					      pfrag->offset)) {
-				if (i == MAX_SKB_FRAGS || !sg) {
+				if (i == sysctl_max_skb_frags || !sg) {
 					tcp_mark_push(tp, skb);
 					goto new_segment;
 				}
-- 
2.4.3

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03  8:26       ` Hans Westgaard Ry
  (?)
@ 2016-02-03 11:25       ` Herbert Xu
  2016-02-03 11:36         ` Hannes Frederic Sowa
  -1 siblings, 1 reply; 54+ messages in thread
From: Herbert Xu @ 2016-02-03 11:25 UTC (permalink / raw)
  To: Hans Westgaard Ry
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Tejun Heo, Andrew Morton, Alexey Kodanev,
	Håkon Bugge, open list, open list:NETWORKING [GENERAL]

On Wed, Feb 03, 2016 at 09:26:57AM +0100, Hans Westgaard Ry wrote:
> Devices may have limits on the number of fragments in an skb they support.
> Current codebase uses a constant as maximum for number of fragments one
> skb can hold and use.
> When enabling scatter/gather and running traffic with many small messages
> the codebase uses the maximum number of fragments and may thereby violate
> the max for certain devices.
> The patch introduces a global variable as max number of fragments.
> 
> Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
> Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>

I have to say this seems rather dirty.  I mean if taken to the
extreme wouldn't this mean that we should disable frags altogether
if some NIC can't handle them at all?

Someone suggested earlier to partially linearise the skb, why
couldn't we do that? IOW let's handle this craziness in the crazy
drivers and not in the general stack.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03 11:25       ` Herbert Xu
@ 2016-02-03 11:36         ` Hannes Frederic Sowa
  2016-02-03 12:20           ` Herbert Xu
  0 siblings, 1 reply; 54+ messages in thread
From: Hannes Frederic Sowa @ 2016-02-03 11:36 UTC (permalink / raw)
  To: Herbert Xu, Hans Westgaard Ry
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing, Tejun Heo,
	Andrew Morton, Alexey Kodanev, Håkon Bugge, open list,
	open list:NETWORKING [GENERAL]

On 03.02.2016 12:25, Herbert Xu wrote:
> On Wed, Feb 03, 2016 at 09:26:57AM +0100, Hans Westgaard Ry wrote:
>> Devices may have limits on the number of fragments in an skb they support.
>> Current codebase uses a constant as maximum for number of fragments one
>> skb can hold and use.
>> When enabling scatter/gather and running traffic with many small messages
>> the codebase uses the maximum number of fragments and may thereby violate
>> the max for certain devices.
>> The patch introduces a global variable as max number of fragments.
>>
>> Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
>> Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>
>
> I have to say this seems rather dirty.  I mean if taken to the
> extreme wouldn't this mean that we should disable frags altogether
> if some NIC can't handle them at all?
>
> Someone suggested earlier to partially linearise the skb, why
> couldn't we do that? IOW let's handle this craziness in the crazy
> drivers and not in the general stack.

Agreed that it feels like a hack, but a rather simple one. I would
consider this to be just a performance improvement. We certainly need
a slow-path when virtio drivers submit gso packets to the stack (and
already discussed with Hans). The sysctl can't help here. But without
the sysctl the packets would constantly hit the slow-path in case of
e.g. IPoIB and that would also be rather bad.

Thanks,
Hannes

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03 11:36         ` Hannes Frederic Sowa
@ 2016-02-03 12:20           ` Herbert Xu
  2016-02-03 14:03             ` Hannes Frederic Sowa
                               ` (2 more replies)
  0 siblings, 3 replies; 54+ messages in thread
From: Herbert Xu @ 2016-02-03 12:20 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing, Tejun Heo,
	Andrew Morton, Alexey Kodanev, Håkon Bugge, open list,
	open list:NETWORKING [GENERAL]

On Wed, Feb 03, 2016 at 12:36:21PM +0100, Hannes Frederic Sowa wrote:
>
> Agreed that it feels like a hack, but a rather simple one. I would
> consider this to be just a performance improvement. We certainly need
> a slow-path when virtio drivers submit gso packets to the stack (and
> already discussed with Hans). The sysctl can't help here. But without
> the sysctl the packets would constantly hit the slow-path in case of
> e.g. IPoIB and that would also be rather bad.

So you want to penalise every NIC in the system if just one of
them is broken? This is insane.  Just do the partial linearisation
in that one driver that needs it and not only won't you have to
penalise anyone else but you still get the best result for that
driver that needs it.

Besides, you have to implement the linearisation anyway because
of virtualisation.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03 12:20           ` Herbert Xu
@ 2016-02-03 14:03             ` Hannes Frederic Sowa
  2016-02-03 14:30               ` Eric Dumazet
  2016-02-03 17:36             ` David Laight
  2 siblings, 0 replies; 54+ messages in thread
From: Hannes Frederic Sowa @ 2016-02-03 14:03 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing, Tejun Heo,
	Andrew Morton, Alexey Kodanev, Håkon Bugge, open list,
	open list:NETWORKING [GENERAL]

On 03.02.2016 13:20, Herbert Xu wrote:
> On Wed, Feb 03, 2016 at 12:36:21PM +0100, Hannes Frederic Sowa wrote:
>>
>> Agreed that it feels like a hack, but a rather simple one. I would
>> consider this to be just a performance improvement. We certainly need
>> a slow-path when virtio drivers submit gso packets to the stack (and
>> already discussed with Hans). The sysctl can't help here. But without
>> the sysctl the packets would constantly hit the slow-path in case of
>> e.g. IPoIB and that would also be rather bad.
>
> So you want to penalise every NIC in the system if just one of
> them is broken? This is insane.  Just do the partial linearisation
> in that one driver that needs it and not only won't you have to
> penalise anyone else but you still get the best result for that
> driver that needs it.

Most normal Ethernet systems and drivers currently don't need tweaking 
this knob at all, only some special kinds of installations. This patch 
referred to IPoIB as a possible user whose drivers/firmware/cards seem 
to have this problem. Current behavior just leaves everything as-is.

If you use IPoIB you probably use it quite regularly, and linearizing 
skbs *always* seems to be much more work than simply capping the number 
of frags globally.

> Besides, you have to implement the linearisation anyway because
> of virtualisation.

Yes, the slow-path is necessary. But instead of writing a new 
complicated linearizing function to just reduce the fragments we could 
also simply linearize it completely and ask the admin to also tune the 
vm guests.

I only see this kind of tuning in very specific environments where the 
admins know what they do.

Bye,
Hannes

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03 12:20           ` Herbert Xu
@ 2016-02-03 14:30               ` Eric Dumazet
  2016-02-03 14:30               ` Eric Dumazet
  2016-02-03 17:36             ` David Laight
  2 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-02-03 14:30 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Hannes Frederic Sowa, Hans Westgaard Ry, David S. Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Tom Herbert, Pablo Neira Ayuso, Eric Dumazet,
	Florian Westphal, Jiri Pirko, Alexander Duyck, Michal Hocko,
	Linus Lüssing, Tejun Heo, Andrew Morton, Alexey Kodanev,
	Håkon Bugge, open list, open list:NETWORKING [GENERAL]

On Wed, 2016-02-03 at 20:20 +0800, Herbert Xu wrote:
> On Wed, Feb 03, 2016 at 12:36:21PM +0100, Hannes Frederic Sowa wrote:
> >
> > Agreed that it feels like a hack, but a rather simple one. I would
> > consider this to be just a performance improvement. We certainly need
> > a slow-path when virtio drivers submit gso packets to the stack (and
> > already discussed with Hans). The sysctl can't help here. But without
> > the sysctl the packets would constantly hit the slow-path in case of
> > e.g. IPoIB and that would also be rather bad.
> 
> So you want to penalise every NIC in the system if just one of
> them is broken? This is insane.  Just do the partial linearisation
> in that one driver that needs it and not only won't you have to
> penalise anyone else but you still get the best result for that
> driver that needs it.

No penalization :

- default is the optimal value

- TCP stack tends to build skb with 32KB frags anyway. It is very rare
to actually get to 17 frags per skb (pathological sendpage() with tiny
parts, or tiny write() on many sockets from one thread). 

> 
> Besides, you have to implement the linearisation anyway because
> of virtualisation.

Sure.

We use a similar patch here at Google, since bnx2x has in some cases a
limit of 13 frags per skb. This driver calls linearize which can fail
under memory fragmentation. TCP usually retransmits, so only effect of
failures is extra latencies.

I am actually okay with this patch.

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
@ 2016-02-03 14:30               ` Eric Dumazet
  0 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-02-03 14:30 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Hannes Frederic Sowa, Hans Westgaard Ry, David S. Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Tom Herbert, Pablo Neira Ayuso, Eric Dumazet,
	Florian Westphal, Jiri Pirko, Alexander Duyck, Michal Hocko,
	Linus Lüssing, Tejun Heo, Andrew Morton, Alexey Kodanev,
	Håkon Bugge, open list, open list:NETWORKING [GENERAL]

On Wed, 2016-02-03 at 20:20 +0800, Herbert Xu wrote:
> On Wed, Feb 03, 2016 at 12:36:21PM +0100, Hannes Frederic Sowa wrote:
> >
> > Agreed that it feels like a hack, but a rather simple one. I would
> > consider this to be just a performance improvement. We certainly need
> > a slow-path when virtio drivers submit gso packets to the stack (and
> > already discussed with Hans). The sysctl can't help here. But without
> > the sysctl the packets would constantly hit the slow-path in case of
> > e.g. IPoIB and that would also be rather bad.
> 
> So you want to penalise every NIC in the system if just one of
> them is broken? This is insane.  Just do the partial linearisation
> in that one driver that needs it and not only won't you have to
> penalise anyone else but you still get the best result for that
> driver that needs it.

No penalization :

- default is the optimal value

- TCP stack tends to build skb with 32KB frags anyway. It is very rare
to actually get to 17 frags per skb (pathological sendpage() with tiny
parts, or tiny write() on many sockets from one thread). 

> 
> Besides, you have to implement the linearisation anyway because
> of virtualisation.

Sure.

We use a similar patch here at Google, since bnx2x has in some cases a
limit of 13 frags per skb. This driver calls linearize which can fail
under memory fragmentation. TCP usually retransmits, so only effect of
failures is extra latencies.

I am actually okay with this patch.

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03  8:26       ` Hans Westgaard Ry
  (?)
  (?)
@ 2016-02-03 15:58       ` Alexander Duyck
  2016-02-03 16:07           ` Eric Dumazet
  -1 siblings, 1 reply; 54+ messages in thread
From: Alexander Duyck @ 2016-02-03 15:58 UTC (permalink / raw)
  To: Hans Westgaard Ry
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list,
	open list:NETWORKING [GENERAL]

On Wed, Feb 3, 2016 at 12:26 AM, Hans Westgaard Ry
<hans.westgaard.ry@oracle.com> wrote:
> Devices may have limits on the number of fragments in an skb they support.
> Current codebase uses a constant as maximum for number of fragments one
> skb can hold and use.
> When enabling scatter/gather and running traffic with many small messages
> the codebase uses the maximum number of fragments and may thereby violate
> the max for certain devices.
> The patch introduces a global variable as max number of fragments.
>
> Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
> Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>
>
> ---
>  include/linux/skbuff.h     |  1 +
>  net/core/skbuff.c          |  2 ++
>  net/core/sysctl_net_core.c | 10 ++++++++++
>  net/ipv4/tcp.c             |  4 ++--
>  4 files changed, 15 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 4355129..fe47ad3 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -219,6 +219,7 @@ struct sk_buff;
>  #else
>  #define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
>  #endif
> +extern int sysctl_max_skb_frags;
>
>  typedef struct skb_frag_struct skb_frag_t;
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 152b9c7..c336b97 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -79,6 +79,8 @@
>
>  struct kmem_cache *skbuff_head_cache __read_mostly;
>  static struct kmem_cache *skbuff_fclone_cache __read_mostly;
> +int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
> +EXPORT_SYMBOL(sysctl_max_skb_frags);
>
>  /**
>   *     skb_panic - private function for out-of-line support
> diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
> index 95b6139..a6beb7b 100644
> --- a/net/core/sysctl_net_core.c
> +++ b/net/core/sysctl_net_core.c

I really don't think these changes belong in the core. Below you only
modify the TCP code path so this more likely belongs in the TCP path
unless you are going to guarantee that all other code paths obey the
sysctl.  It probably belongs in net/ipv4/sysctl_net_ipv4.c

> @@ -26,6 +26,7 @@ static int zero = 0;
>  static int one = 1;
>  static int min_sndbuf = SOCK_MIN_SNDBUF;
>  static int min_rcvbuf = SOCK_MIN_RCVBUF;
> +static int max_skb_frags = MAX_SKB_FRAGS;
>
>  static int net_msg_warn;       /* Unused, but still a sysctl */
>
> @@ -392,6 +393,15 @@ static struct ctl_table net_core_table[] = {
>                 .mode           = 0644,
>                 .proc_handler   = proc_dointvec
>         },
> +       {
> +               .procname       = "max_skb_frags",
> +               .data           = &sysctl_max_skb_frags,
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
> +               .proc_handler   = proc_dointvec_minmax,
> +               .extra1         = &one,
> +               .extra2         = &max_skb_frags,
> +       },
>         { }
>  };

I'm not really a fan of this name either.  Maybe it should be
something like tcp_max_gso_frags.

> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index c82cca1..3dc7a2fd 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -938,7 +938,7 @@ new_segment:
>
>                 i = skb_shinfo(skb)->nr_frags;
>                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
> -               if (!can_coalesce && i >= MAX_SKB_FRAGS) {
> +               if (!can_coalesce && i >= sysctl_max_skb_frags) {
>                         tcp_mark_push(tp, skb);
>                         goto new_segment;
>                 }
> @@ -1211,7 +1211,7 @@ new_segment:
>
>                         if (!skb_can_coalesce(skb, i, pfrag->page,
>                                               pfrag->offset)) {
> -                               if (i == MAX_SKB_FRAGS || !sg) {
> +                               if (i == sysctl_max_skb_frags || !sg) {
>                                         tcp_mark_push(tp, skb);
>                                         goto new_segment;
>                                 }

This bit looks good.

I was wondering.  Have you considered looking at something like what
was done with gso_max_size?  It seems like it is meant to address a
problem similar to what you have described where the NICs only support
a certain layout for the GSO frame.  Though now that I look over the
code it seems like it might be flawed in that I don't see bridges or
tunnels really respecting the value so it seems like they could cause
issues.

- Alex

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03 15:58       ` Alexander Duyck
@ 2016-02-03 16:07           ` Eric Dumazet
  0 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-02-03 16:07 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list,
	open list:NETWORKING [GENERAL]

On Wed, 2016-02-03 at 07:58 -0800, Alexander Duyck wrote:
> > +++ b/net/core/sysctl_net_core.c
> 
> I really don't think these changes belong in the core. Below you only
> modify the TCP code path so this more likely belongs in the TCP path
> unless you are going to guarantee that all other code paths obey the
> sysctl.  It probably belongs in net/ipv4/sysctl_net_ipv4.c


Alexander, this is a v3.

We rejected prior attempts doing exactly what you suggest.

Think about GRO : These people also need to use the same sysctl in GRO
to limit number of frags.

Limiting the stuff at the egress is useless in forwarding setups.
It will be too late as they'll need to linearize -> huge performance
drop.

This is why we wanted a global setup so that these guys can tweak the
default limit.

Please read netdev history about this stuff.

Plan of action :

1) This patch, adding a core sysctl.
2) Use it in TCP (already done in this patch)
3) Use it in GRO

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
@ 2016-02-03 16:07           ` Eric Dumazet
  0 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-02-03 16:07 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list

On Wed, 2016-02-03 at 07:58 -0800, Alexander Duyck wrote:
> > +++ b/net/core/sysctl_net_core.c
> 
> I really don't think these changes belong in the core. Below you only
> modify the TCP code path so this more likely belongs in the TCP path
> unless you are going to guarantee that all other code paths obey the
> sysctl.  It probably belongs in net/ipv4/sysctl_net_ipv4.c


Alexander, this is a v3.

We rejected prior attempts doing exactly what you suggest.

Think about GRO : These people also need to use the same sysctl in GRO
to limit number of frags.

Limiting the stuff at the egress is useless in forwarding setups.
It will be too late as they'll need to linearize -> huge performance
drop.

This is why we wanted a global setup so that these guys can tweak the
default limit.

Please read netdev history about this stuff.

Plan of action :

1) This patch, adding a core sysctl.
2) Use it in TCP (already done in this patch)
3) Use it in GRO

^ permalink raw reply	[flat|nested] 54+ messages in thread

* RE: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03 12:20           ` Herbert Xu
  2016-02-03 14:03             ` Hannes Frederic Sowa
  2016-02-03 14:30               ` Eric Dumazet
@ 2016-02-03 17:36             ` David Laight
  2 siblings, 0 replies; 54+ messages in thread
From: David Laight @ 2016-02-03 17:36 UTC (permalink / raw)
  To: 'Herbert Xu', Hannes Frederic Sowa
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing, Tejun Heo,
	Andrew Morton, Alexey Kodanev, Håkon Bugge, open list,
	open list:NETWORKING [GENERAL]

From: Herbert Xu
> Sent: 03 February 2016 12:21
> On Wed, Feb 03, 2016 at 12:36:21PM +0100, Hannes Frederic Sowa wrote:
> >
> > Agreed that it feels like a hack, but a rather simple one. I would
> > consider this to be just a performance improvement. We certainly need
> > a slow-path when virtio drivers submit gso packets to the stack (and
> > already discussed with Hans). The sysctl can't help here. But without
> > the sysctl the packets would constantly hit the slow-path in case of
> > e.g. IPoIB and that would also be rather bad.
> 
> So you want to penalise every NIC in the system if just one of
> them is broken? This is insane.  Just do the partial linearisation
> in that one driver that needs it and not only won't you have to
> penalise anyone else but you still get the best result for that
> driver that needs it.
> 
> Besides, you have to implement the linearisation anyway because
> of virtualisation.

And if a MAC driver needs to linearize a tx frame it might as well
copy it into a separately allocated tx buffer area.
Indeed it can copy fragments until the number left is less than the
fragment limit.

	David

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03 16:07           ` Eric Dumazet
@ 2016-02-03 17:43             ` Alexander Duyck
  -1 siblings, 0 replies; 54+ messages in thread
From: Alexander Duyck @ 2016-02-03 17:43 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list,
	open list:NETWORKING [GENERAL]

On Wed, Feb 3, 2016 at 8:07 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2016-02-03 at 07:58 -0800, Alexander Duyck wrote:
>> > +++ b/net/core/sysctl_net_core.c
>>
>> I really don't think these changes belong in the core. Below you only
>> modify the TCP code path so this more likely belongs in the TCP path
>> unless you are going to guarantee that all other code paths obey the
>> sysctl.  It probably belongs in net/ipv4/sysctl_net_ipv4.c
>
>
> Alexander, this is a v3.

Well I guess that means that a v4 might be needed.  I get that others
have reviewed it but obviously their opinions differed from mine as I
have a few objections to parts of this patch.

> We rejected prior attempts doing exactly what you suggest.

Okay so it sounds like there are some other opinions on this then that
I am not aware of.

> Think about GRO : These people also need to use the same sysctl in GRO
> to limit number of frags.

Okay, well without the GRO changes this patch set is incomplete then.

> Limiting the stuff at the egress is useless in forwarding setups.
> It will be too late as they'll need to linearize -> huge performance
> drop.
>
> This is why we wanted a global setup so that these guys can tweak the
> default limit.
>
> Please read netdev history about this stuff.

Read the history.  I still say it is best if we don't accept a partial
solution.  If we are going to introduce the sysctl as a core item it
should function as a core item and not as something that belongs to
TCP only.

Also I wasn't saying to go the gso_max_size route.  As I commented I
think that probably needs to be fixed as well.  Maybe turned into a
sysctl as is being proposed here since I have found scenarios such as
tunnels where the gso_max_size may not be observed.

> Plan of action :
>
> 1) This patch, adding a core sysctl.
> 2) Use it in TCP (already done in this patch)
> 3) Use it in GRO

What you are talking about is TCP offloads, one on the transmit side
and one on the receive side.  The name max_skb_frags implies that this
value is going to cover ALL users of fragments and it doesn't.

If you are going to try and pass this off as a core how about covering
other cases such as __ip_append_data(), skb_append_datato_frags() and
the rest of the functions out there that will totally ignore this
current change and still put together a frame with MAX_SKB_FRAGS
instead of the sysctl value?

In addition it makes sense to have things setup so that you have both
the sysctl and the device value.  Then if someone wants to they can
leave the value set large and just let the one NIC sit there and
linearize frames because NETIF_F_SG gets cleared in netif_skb_features
if the number of frags used exceeds the value for max_frags reported
in the netdev.

- Alex

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
@ 2016-02-03 17:43             ` Alexander Duyck
  0 siblings, 0 replies; 54+ messages in thread
From: Alexander Duyck @ 2016-02-03 17:43 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list

On Wed, Feb 3, 2016 at 8:07 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2016-02-03 at 07:58 -0800, Alexander Duyck wrote:
>> > +++ b/net/core/sysctl_net_core.c
>>
>> I really don't think these changes belong in the core. Below you only
>> modify the TCP code path so this more likely belongs in the TCP path
>> unless you are going to guarantee that all other code paths obey the
>> sysctl.  It probably belongs in net/ipv4/sysctl_net_ipv4.c
>
>
> Alexander, this is a v3.

Well I guess that means that a v4 might be needed.  I get that others
have reviewed it but obviously their opinions differed from mine as I
have a few objections to parts of this patch.

> We rejected prior attempts doing exactly what you suggest.

Okay so it sounds like there are some other opinions on this then that
I am not aware of.

> Think about GRO : These people also need to use the same sysctl in GRO
> to limit number of frags.

Okay, well without the GRO changes this patch set is incomplete then.

> Limiting the stuff at the egress is useless in forwarding setups.
> It will be too late as they'll need to linearize -> huge performance
> drop.
>
> This is why we wanted a global setup so that these guys can tweak the
> default limit.
>
> Please read netdev history about this stuff.

Read the history.  I still say it is best if we don't accept a partial
solution.  If we are going to introduce the sysctl as a core item it
should function as a core item and not as something that belongs to
TCP only.

Also I wasn't saying to go the gso_max_size route.  As I commented I
think that probably needs to be fixed as well.  Maybe turned into a
sysctl as is being proposed here since I have found scenarios such as
tunnels where the gso_max_size may not be observed.

> Plan of action :
>
> 1) This patch, adding a core sysctl.
> 2) Use it in TCP (already done in this patch)
> 3) Use it in GRO

What you are talking about is TCP offloads, one on the transmit side
and one on the receive side.  The name max_skb_frags implies that this
value is going to cover ALL users of fragments and it doesn't.

If you are going to try and pass this off as a core how about covering
other cases such as __ip_append_data(), skb_append_datato_frags() and
the rest of the functions out there that will totally ignore this
current change and still put together a frame with MAX_SKB_FRAGS
instead of the sysctl value?

In addition it makes sense to have things setup so that you have both
the sysctl and the device value.  Then if someone wants to they can
leave the value set large and just let the one NIC sit there and
linearize frames because NETIF_F_SG gets cleared in netif_skb_features
if the number of frags used exceeds the value for max_frags reported
in the netdev.

- Alex

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03 17:43             ` Alexander Duyck
@ 2016-02-03 17:54               ` Eric Dumazet
  -1 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-02-03 17:54 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list,
	open list:NETWORKING [GENERAL]

On Wed, 2016-02-03 at 09:43 -0800, Alexander Duyck wrote:

> Read the history.  I still say it is best if we don't accept a partial
> solution.  If we are going to introduce the sysctl as a core item it
> should function as a core item and not as something that belongs to
> TCP only.


But this patch is the base, adding both the core sysctl and its first
usage.

Do we really need to split it in 2 patches ? Really ?

The goal is to use it in all skb providers where it might be a
performance gain, once they are identified.

Your points were already raised and will be addressed, by either me or
you. And maybe others.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
@ 2016-02-03 17:54               ` Eric Dumazet
  0 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-02-03 17:54 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list

On Wed, 2016-02-03 at 09:43 -0800, Alexander Duyck wrote:

> Read the history.  I still say it is best if we don't accept a partial
> solution.  If we are going to introduce the sysctl as a core item it
> should function as a core item and not as something that belongs to
> TCP only.


But this patch is the base, adding both the core sysctl and its first
usage.

Do we really need to split it in 2 patches ? Really ?

The goal is to use it in all skb providers where it might be a
performance gain, once they are identified.

Your points were already raised and will be addressed, by either me or
you. And maybe others.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03 17:54               ` Eric Dumazet
@ 2016-02-03 18:24                 ` Alexander Duyck
  -1 siblings, 0 replies; 54+ messages in thread
From: Alexander Duyck @ 2016-02-03 18:24 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list,
	open list:NETWORKING [GENERAL]

On Wed, Feb 3, 2016 at 9:54 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2016-02-03 at 09:43 -0800, Alexander Duyck wrote:
>
>> Read the history.  I still say it is best if we don't accept a partial
>> solution.  If we are going to introduce the sysctl as a core item it
>> should function as a core item and not as something that belongs to
>> TCP only.
>
>
> But this patch is the base, adding both the core sysctl and its first
> usage.
>
> Do we really need to split it in 2 patches ? Really ?
>
> The goal is to use it in all skb providers were it might be a
> performance gain, once they are identified.

That is what I thought.  So why are we trying to sell this as a core
change then.  All I am asking for is the sysctl to be moved and
renamed since based on all of your descriptions this clearly only
impacts TCP.

> Your points were already raised and will be addressed, by either me or
> you. And maybe others.

Please don't sign me up for work I didn't volunteer for.  I already
have enough broken code to try and fix.  I'm pretty sure I need to go
in and fix the gso_max_size code for starters.

If this is only meant to be a performance modification and is only
really targeted at TCP TSO/GRO then all I ask is that we use a name
like tcp_max_gso_frags and relocate the sysctl to the TCP section.
Otherwise if we are actually going to try to scope this out on a wider
level and limit all frags which is what the name implies then the
patch set needs to make a better attempt at covering all cases where
it may apply.

- Alex

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
@ 2016-02-03 18:24                 ` Alexander Duyck
  0 siblings, 0 replies; 54+ messages in thread
From: Alexander Duyck @ 2016-02-03 18:24 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list

On Wed, Feb 3, 2016 at 9:54 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2016-02-03 at 09:43 -0800, Alexander Duyck wrote:
>
>> Read the history.  I still say it is best if we don't accept a partial
>> solution.  If we are going to introduce the sysctl as a core item it
>> should function as a core item and not as something that belongs to
>> TCP only.
>
>
> But this patch is the base, adding both the core sysctl and its first
> usage.
>
> Do we really need to split it in 2 patches ? Really ?
>
> The goal is to use it in all skb providers where it might be a
> performance gain, once they are identified.

That is what I thought.  So why are we trying to sell this as a core
change then.  All I am asking for is the sysctl to be moved and
renamed since based on all of your descriptions this clearly only
impacts TCP.

> Your points were already raised and will be addressed, by either me or
> you. And maybe others.

Please don't sign me up for work I didn't volunteer for.  I already
have enough broken code to try and fix.  I'm pretty sure I need to go
in and fix the gso_max_size code for starters.

If this is only meant to be a performance modification and is only
really targeted at TCP TSO/GRO then all I ask is that we use a name
like tcp_max_gso_frags and relocate the sysctl to the TCP section.
Otherwise if we are actually going to try to scope this out on a wider
level and limit all frags which is what the name implies then the
patch set needs to make a better attempt at covering all cases where
it may apply.

- Alex

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03 18:24                 ` Alexander Duyck
@ 2016-02-03 19:23                   ` Eric Dumazet
  -1 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-02-03 19:23 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list,
	open list:NETWORKING [GENERAL]

On Wed, 2016-02-03 at 10:24 -0800, Alexander Duyck wrote:

> If this is only meant to be a performance modification and is only
> really targeted at TCP TSO/GRO then all I ask is that we use a name
> like tcp_max_gso_frags and relocate the sysctl to the TCP section.
> Otherwise if we are actually going to try to scope this out on a wider
> level and limit all frags which is what the name implies then the
> patch set needs to make a better attempt at covering all cases where
> it may apply.


This is the goal.

Other skb providers (like tun and af_packet) will also use this optional
limit.

I fail to see why Hans should send a complete patch series.

We will send followup patches, as we always did.

I will send the GRO change for example.

So please keep a sysctl name _without_ TCP in it, it really has nothing
to do with TCP.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
@ 2016-02-03 19:23                   ` Eric Dumazet
  0 siblings, 0 replies; 54+ messages in thread
From: Eric Dumazet @ 2016-02-03 19:23 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Alexander Duyck, Michal Hocko, Linus Lüssing,
	Hannes Frederic Sowa, Herbert Xu, Tejun Heo, Andrew Morton,
	Alexey Kodanev, Håkon Bugge, open list

On Wed, 2016-02-03 at 10:24 -0800, Alexander Duyck wrote:

> If this is only meant to be a performance modification and is only
> really targeted at TCP TSO/GRO then all I ask is that we use a name
> like tcp_max_gso_frags and relocate the sysctl to the TCP section.
> Otherwise if we are actually going to try to scope this out on a wider
> level and limit all frags which is what the name implies then the
> patch set needs to make a better attempt at covering all cases where
> it may apply.


This is the goal.

Other skb providers (like tun and af_packet) will also use this optional
limit.

I fail to see why Hans should send a complete patch series.

We will send followup patches, as we always did.

I will send the GRO change for example.

So please keep a sysctl name _without_ TCP in it, it really has nothing
to do with TCP.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03 19:23                   ` Eric Dumazet
  (?)
@ 2016-02-03 21:03                   ` Alexander Duyck
  -1 siblings, 0 replies; 54+ messages in thread
From: Alexander Duyck @ 2016-02-03 21:03 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Hans Westgaard Ry, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Tom Herbert,
	Pablo Neira Ayuso, Eric Dumazet, Florian Westphal, Jiri Pirko,
	Michal Hocko, Linus Lüssing, Hannes Frederic Sowa,
	Herbert Xu, Tejun Heo, Andrew Morton, Alexey Kodanev,
	Håkon Bugge, open list, open list:NETWORKING [GENERAL]

On Wed, Feb 3, 2016 at 11:23 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2016-02-03 at 10:24 -0800, Alexander Duyck wrote:
>
>> If this is only meant to be a performance modification and is only
>> really targeted at TCP TSO/GRO then all I ask is that we use a name
>> like tcp_max_gso_frags and relocate the sysctl to the TCP section.
>> Otherwise if we are actually going to try to scope this out on a wider
>> level and limit all frags which is what the name implies then the
>> patch set needs to make a better attempt at covering all cases where
>> it may apply.
>
>
> This is the goal.
>
> Other skb providers (like tun and af_packet) will also use this optional
> limit.
>
> I fail to see why Hans should send a complete patch series.

You realize that conflicts with what anybody else would be told.  What
was provided in this patch is a half solution, and it may cause bigger
messes since it is unclear exactly how this sysctl is meant to be
used.

> We will send followup patches, as we always did.
>
> I will send the GRO change for example.
>
> So please keep a sysctl name _without_ TCP in it, it really has nothing
> to do with TCP.

In the end I am not the one you have to convince.  I have simply
stated my opinion, and I guess we will have to agree to disagree.  It
is entirely up to Dave if he wants to apply it or not.  I have slides
I need to work on for next week.. :-)

- Alex

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH v3] net:Add sysctl_max_skb_frags
  2016-02-03  8:26       ` Hans Westgaard Ry
                         ` (2 preceding siblings ...)
  (?)
@ 2016-02-09  9:30       ` David Miller
  -1 siblings, 0 replies; 54+ messages in thread
From: David Miller @ 2016-02-09  9:30 UTC (permalink / raw)
  To: hans.westgaard.ry
  Cc: kuznet, jmorris, yoshfuji, kaber, tom, pablo, edumazet, fw, jiri,
	alexander.h.duyck, mhocko, linus.luessing, hannes, herbert, tj,
	akpm, alexey.kodanev, haakon.bugge, linux-kernel, netdev

From: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
Date: Wed,  3 Feb 2016 09:26:57 +0100

> Devices may have limits on the number of fragments in an skb they support.
> Current codebase uses a constant as maximum for number of fragments one
> skb can hold and use.
> When enabling scatter/gather and running traffic with many small messages
> the codebase uses the maximum number of fragments and may thereby violate
> the max for certain devices.
> The patch introduces a global variable as max number of fragments.
> 
> Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@oracle.com>
> Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>

I know some people don't like this patch, but no better solution exists
at this time.

Like others, I'd personally would rather this be a per-device attribute
but that currently would not work at all.

The device that TCP and other elements see when they build packets is
not necessarily the one that is going to send the frame.  Encapsulation
and other structures hide the truly transmitting device.

And we lack a foolproof way to propagate attributes like this through
the stack of devices up to the top.

So for now this is what we have to use, as unfortunate as it may be.

If someone is suitably angry about this state of affairs, I encourage
them to direct that energy at a better long term solution :-)

Applied and queued up for -stable, thanks.

^ permalink raw reply	[flat|nested] 54+ messages in thread

end of thread, other threads:[~2016-02-09  9:31 UTC | newest]

Thread overview: 54+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-06 13:16 [PATCH] net: add per device sg_max_frags for skb Hans Westgaard Ry
2016-01-06 13:16 ` Hans Westgaard Ry
2016-01-06 13:59 ` David Laight
2016-01-06 13:59   ` David Laight
2016-01-08  9:55   ` Hans Westgaard Ry
2016-01-08  9:55     ` Hans Westgaard Ry
2016-01-08 10:33     ` David Laight
2016-01-08 10:33       ` David Laight
2016-01-08 11:47     ` Hannes Frederic Sowa
2016-01-08 11:47       ` Hannes Frederic Sowa
2016-01-13 13:57       ` Hans Westgaard Ry
2016-01-13 13:57         ` Hans Westgaard Ry
2016-01-13 14:19         ` Eric Dumazet
2016-01-13 14:19           ` Eric Dumazet
2016-01-13 14:20           ` Eric Dumazet
2016-01-13 14:20             ` Eric Dumazet
2016-01-13 15:07           ` Hannes Frederic Sowa
2016-01-13 15:07             ` Hannes Frederic Sowa
2016-01-13 15:38           ` David Miller
2016-01-13 15:44             ` Eric Dumazet
2016-01-13 15:44               ` Eric Dumazet
2016-01-13 21:07         ` Eric W. Biederman
2016-01-13 21:07           ` Eric W. Biederman
2016-01-27 13:20     ` [PATCH v2] net:Add sysctl_tcp_sg_max_skb_frags Hans Westgaard Ry
2016-01-27 15:15       ` Eric Dumazet
2016-01-27 18:12         ` Hannes Frederic Sowa
2016-02-01 13:12           ` Hans Westgaard Ry
2016-01-27 20:13       ` David Miller
2016-02-03  8:26     ` [PATCH v3] net:Add sysctl_max_skb_frags Hans Westgaard Ry
2016-02-03  8:26       ` Hans Westgaard Ry
2016-02-03 11:25       ` Herbert Xu
2016-02-03 11:36         ` Hannes Frederic Sowa
2016-02-03 12:20           ` Herbert Xu
2016-02-03 14:03             ` Hannes Frederic Sowa
2016-02-03 14:30             ` Eric Dumazet
2016-02-03 14:30               ` Eric Dumazet
2016-02-03 17:36             ` David Laight
2016-02-03 15:58       ` Alexander Duyck
2016-02-03 16:07         ` Eric Dumazet
2016-02-03 16:07           ` Eric Dumazet
2016-02-03 17:43           ` Alexander Duyck
2016-02-03 17:43             ` Alexander Duyck
2016-02-03 17:54             ` Eric Dumazet
2016-02-03 17:54               ` Eric Dumazet
2016-02-03 18:24               ` Alexander Duyck
2016-02-03 18:24                 ` Alexander Duyck
2016-02-03 19:23                 ` Eric Dumazet
2016-02-03 19:23                   ` Eric Dumazet
2016-02-03 21:03                   ` Alexander Duyck
2016-02-09  9:30       ` David Miller
2016-01-06 14:05 ` [PATCH] net: add per device sg_max_frags for skb Eric Dumazet
2016-01-06 14:05   ` Eric Dumazet
2016-01-08 10:01   ` Hans Westgaard Ry
2016-01-08 10:01     ` Hans Westgaard Ry

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.