All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next v2 0/2] Accurate Memory Charging For MSG_ZEROCOPY
@ 2021-10-30  2:05 Talal Ahmad
  2021-10-30  2:05 ` [PATCH net-next v2 1/2] tcp: rename sk_wmem_free_skb Talal Ahmad
  2021-10-30  2:05 ` [PATCH net-next v2 2/2] net: avoid double accounting for pure zerocopy skbs Talal Ahmad
  0 siblings, 2 replies; 3+ messages in thread
From: Talal Ahmad @ 2021-10-30  2:05 UTC (permalink / raw)
  To: davem, netdev
  Cc: arjunroy, edumazet, soheil, willemb, dsahern, yoshfuji, kuba,
	cong.wang, haokexin, jonathan.lemon, alobakin, pabeni,
	ilias.apalodimas, memxor, elver, nogikh, vvs, Talal Ahmad

From: Talal Ahmad <talalahmad@google.com>

This series improves the accuracy of msg_zerocopy memory accounting.
At present, when msg_zerocopy is used memory is charged twice for the
data - once when user space allocates it, and then again within
__zerocopy_sg_from_iter. The memory charging in the kernel is excessive
because data is held in user pages and is never actually copied to skb
fragments. This leads to incorrectly inflated memory statistics for
programs passing MSG_ZEROCOPY.

We reduce this inaccuracy by introducing the notion of "pure" zerocopy
SKBs - where all the frags in the SKB are backed by pinned userspace
pages, and none are backed by copied pages. For such SKBs, tracked via
the new SKBFL_PURE_ZEROCOPY flag, we elide sk_mem_charge/uncharge
calls, leading to more accurate accounting.

However, SKBs can also be coalesced by the stack at present,
potentially leading to "impure" SKBs. We restrict this coalescing so
it can only happen within the sendmsg() system call itself, for the
most recently allocated SKB. While this can lead to a small degree of
double-charging of memory, this case does not arise often in practice
for workloads that set MSG_ZEROCOPY.

Testing verified that memory usage in the kernel is lowered.
Instrumentation with counters also showed that accounting at time
charging and uncharging is balanced.

Talal Ahmad (2):
  tcp: rename sk_wmem_free_skb
  net: avoid double accounting for pure zerocopy skbs

 include/linux/skbuff.h | 19 ++++++++++++++++++-
 include/net/sock.h     |  7 -------
 include/net/tcp.h      | 15 +++++++++++++--
 net/core/datagram.c    |  3 ++-
 net/core/skbuff.c      |  3 ++-
 net/ipv4/tcp.c         | 28 +++++++++++++++++++++++-----
 net/ipv4/tcp_output.c  |  9 ++++++---
 7 files changed, 64 insertions(+), 20 deletions(-)

-- 
2.33.1.1089.g2158813163f-goog


^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH net-next v2 1/2] tcp: rename sk_wmem_free_skb
  2021-10-30  2:05 [PATCH net-next v2 0/2] Accurate Memory Charging For MSG_ZEROCOPY Talal Ahmad
@ 2021-10-30  2:05 ` Talal Ahmad
  2021-10-30  2:05 ` [PATCH net-next v2 2/2] net: avoid double accounting for pure zerocopy skbs Talal Ahmad
  1 sibling, 0 replies; 3+ messages in thread
From: Talal Ahmad @ 2021-10-30  2:05 UTC (permalink / raw)
  To: davem, netdev
  Cc: arjunroy, edumazet, soheil, willemb, dsahern, yoshfuji, kuba,
	cong.wang, haokexin, jonathan.lemon, alobakin, pabeni,
	ilias.apalodimas, memxor, elver, nogikh, vvs, Talal Ahmad

From: Talal Ahmad <talalahmad@google.com>

sk_wmem_free_skb() is only used by TCP.

Rename it to make this clear, and move its declaration to
include/net/tcp.h

Signed-off-by: Talal Ahmad <talalahmad@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Arjun Roy <arjunroy@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/sock.h    | 7 -------
 include/net/tcp.h     | 9 ++++++++-
 net/ipv4/tcp.c        | 6 +++---
 net/ipv4/tcp_output.c | 2 +-
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 620de053002d..b32906e1ab55 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1603,13 +1603,6 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 		__sk_mem_reclaim(sk, SK_RECLAIM_CHUNK);
 }
 
-static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
-{
-	sk_wmem_queued_add(sk, -skb->truesize);
-	sk_mem_uncharge(sk, skb->truesize);
-	__kfree_skb(skb);
-}
-
 static inline void sock_release_ownership(struct sock *sk)
 {
 	if (sk->sk_lock.owned) {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8e8c5922a7b0..70972f3ac8fa 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -290,6 +290,13 @@ static inline bool tcp_out_of_memory(struct sock *sk)
 	return false;
 }
 
+static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
+{
+	sk_wmem_queued_add(sk, -skb->truesize);
+	sk_mem_uncharge(sk, skb->truesize);
+	__kfree_skb(skb);
+}
+
 void sk_forced_mem_schedule(struct sock *sk, int size);
 
 bool tcp_check_oom(struct sock *sk, int shift);
@@ -1875,7 +1882,7 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct soc
 {
 	list_del(&skb->tcp_tsorted_anchor);
 	tcp_rtx_queue_unlink(skb, sk);
-	sk_wmem_free_skb(sk, skb);
+	tcp_wmem_free_skb(sk, skb);
 }
 
 static inline void tcp_push_pending_frames(struct sock *sk)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a7b1138d619c..bc7f419184aa 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -932,7 +932,7 @@ void tcp_remove_empty_skb(struct sock *sk)
 		tcp_unlink_write_queue(skb, sk);
 		if (tcp_write_queue_empty(sk))
 			tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-		sk_wmem_free_skb(sk, skb);
+		tcp_wmem_free_skb(sk, skb);
 	}
 }
 
@@ -2893,7 +2893,7 @@ static void tcp_rtx_queue_purge(struct sock *sk)
 		 * list_del(&skb->tcp_tsorted_anchor)
 		 */
 		tcp_rtx_queue_unlink(skb, sk);
-		sk_wmem_free_skb(sk, skb);
+		tcp_wmem_free_skb(sk, skb);
 	}
 }
 
@@ -2904,7 +2904,7 @@ void tcp_write_queue_purge(struct sock *sk)
 	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
 	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
 		tcp_skb_tsorted_anchor_cleanup(skb);
-		sk_wmem_free_skb(sk, skb);
+		tcp_wmem_free_skb(sk, skb);
 	}
 	tcp_rtx_queue_purge(sk);
 	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6867e5db3e35..6fbbf1558033 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2412,7 +2412,7 @@ static int tcp_mtu_probe(struct sock *sk)
 			TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
 			tcp_skb_collapse_tstamp(nskb, skb);
 			tcp_unlink_write_queue(skb, sk);
-			sk_wmem_free_skb(sk, skb);
+			tcp_wmem_free_skb(sk, skb);
 		} else {
 			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
 						   ~(TCPHDR_FIN|TCPHDR_PSH);
-- 
2.33.1.1089.g2158813163f-goog


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH net-next v2 2/2] net: avoid double accounting for pure zerocopy skbs
  2021-10-30  2:05 [PATCH net-next v2 0/2] Accurate Memory Charging For MSG_ZEROCOPY Talal Ahmad
  2021-10-30  2:05 ` [PATCH net-next v2 1/2] tcp: rename sk_wmem_free_skb Talal Ahmad
@ 2021-10-30  2:05 ` Talal Ahmad
  1 sibling, 0 replies; 3+ messages in thread
From: Talal Ahmad @ 2021-10-30  2:05 UTC (permalink / raw)
  To: davem, netdev
  Cc: arjunroy, edumazet, soheil, willemb, dsahern, yoshfuji, kuba,
	cong.wang, haokexin, jonathan.lemon, alobakin, pabeni,
	ilias.apalodimas, memxor, elver, nogikh, vvs, Talal Ahmad

From: Talal Ahmad <talalahmad@google.com>

Track skbs with only zerocopy data and avoid charging them to kernel
memory to correctly account the memory utilization for msg_zerocopy.
All of the data in such skbs is held in user pages which are already
accounted to user. Before this change, they are charged again in
kernel in __zerocopy_sg_from_iter. The charging in kernel is
excessive because data is not being copied into skb frags. This
excessive charging can lead to kernel going into memory pressure
state which impacts all sockets in the system adversely. Mark pure
zerocopy skbs with a SKBFL_PURE_ZEROCOPY flag and remove
charge/uncharge for data in such skbs.

Initially, an skb is marked pure zerocopy when it is empty and in
zerocopy path. skb can then change from a pure zerocopy skb to mixed
data skb (zerocopy and copy data) if it is at tail of write queue and
there is room available in it and non-zerocopy data is being sent in
the next sendmsg call. At this time sk_mem_charge is done for the pure
zerocopied data and the pure zerocopy flag is unmarked. We found that
this happens very rarely on workloads that pass MSG_ZEROCOPY.

A pure zerocopy skb can later be coalesced into normal skb if they are
next to each other in queue but this patch prevents coalescing from
happening. This avoids complexity of charging when skb downgrades from
pure zerocopy to mixed. This is also rare.

In sk_wmem_free_skb, if it is a pure zerocopy skb, an sk_mem_uncharge
for SKB_TRUESIZE(MAX_TCP_HEADER) is done for sk_mem_charge in
tcp_skb_entail for an skb without data.

Testing with the msg_zerocopy.c benchmark between two hosts(100G nics)
with zerocopy showed that before this patch the 'sock' variable in
memory.stat for cgroup2 that tracks sum of sk_forward_alloc,
sk_rmem_alloc and sk_wmem_queued is around 1822720 and with this
change it is 0. This is due to no charge to sk_forward_alloc for
zerocopy data and shows memory utilization for kernel is lowered.

Signed-off-by: Talal Ahmad <talalahmad@google.com>
Acked-by: Arjun Roy <arjunroy@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/skbuff.h | 19 ++++++++++++++++++-
 include/net/tcp.h      |  8 ++++++--
 net/core/datagram.c    |  3 ++-
 net/core/skbuff.c      |  3 ++-
 net/ipv4/tcp.c         | 22 ++++++++++++++++++++--
 net/ipv4/tcp_output.c  |  7 +++++--
 6 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0bd6520329f6..10869906cc57 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -454,9 +454,15 @@ enum {
 	 * all frags to avoid possible bad checksum
 	 */
 	SKBFL_SHARED_FRAG = BIT(1),
+
+	/* segment contains only zerocopy data and should not be
+	 * charged to the kernel memory.
+	 */
+	SKBFL_PURE_ZEROCOPY = BIT(2),
 };
 
 #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
+#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
 
 /*
  * The callback notifies userspace to release buffers when skb DMA is done in
@@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
+static inline bool skb_zcopy_pure(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
+}
+
+static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
+				       const struct sk_buff *skb2)
+{
+	return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2);
+}
+
 static inline void net_zcopy_get(struct ubuf_info *uarg)
 {
 	refcount_inc(&uarg->refcnt);
@@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 		if (!skb_zcopy_is_nouarg(skb))
 			uarg->callback(skb, uarg, zerocopy_success);
 
-		skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG;
+		skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY;
 	}
 }
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 70972f3ac8fa..af91f370432e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -293,7 +293,10 @@ static inline bool tcp_out_of_memory(struct sock *sk)
 static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
 	sk_wmem_queued_add(sk, -skb->truesize);
-	sk_mem_uncharge(sk, skb->truesize);
+	if (!skb_zcopy_pure(skb))
+		sk_mem_uncharge(sk, skb->truesize);
+	else
+		sk_mem_uncharge(sk, SKB_TRUESIZE(MAX_TCP_HEADER));
 	__kfree_skb(skb);
 }
 
@@ -974,7 +977,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
 					const struct sk_buff *from)
 {
 	return likely(tcp_skb_can_collapse_to(to) &&
-		      mptcp_skb_can_collapse(to, from));
+		      mptcp_skb_can_collapse(to, from) &&
+		      skb_pure_zcopy_same(to, from));
 }
 
 /* Events passed to congestion control interface */
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 15ab9ffb27fe..ee290776c661 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -646,7 +646,8 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 		skb->truesize += truesize;
 		if (sk && sk->sk_type == SOCK_STREAM) {
 			sk_wmem_queued_add(sk, truesize);
-			sk_mem_charge(sk, truesize);
+			if (!skb_zcopy_pure(skb))
+				sk_mem_charge(sk, truesize);
 		} else {
 			refcount_add(truesize, &skb->sk->sk_wmem_alloc);
 		}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 67a9188d8a49..29e617d8d7fb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3433,8 +3433,9 @@ static inline void skb_split_no_header(struct sk_buff *skb,
 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 {
 	int pos = skb_headlen(skb);
+	const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
 
-	skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
+	skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
 	skb_zerocopy_clone(skb1, skb, 0);
 	if (len < pos)	/* Split line is inside header. */
 		skb_split_inside_header(skb, skb1, len, pos);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bc7f419184aa..2561c14a6e63 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -863,6 +863,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 	if (likely(skb)) {
 		bool mem_scheduled;
 
+		skb->truesize = SKB_TRUESIZE(size + MAX_TCP_HEADER);
 		if (force_schedule) {
 			mem_scheduled = true;
 			sk_forced_mem_schedule(sk, skb->truesize);
@@ -1319,6 +1320,15 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 
 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
+			/* skb changing from pure zc to mixed, must charge zc */
+			if (unlikely(skb_zcopy_pure(skb))) {
+				if (!sk_wmem_schedule(sk, skb->data_len))
+					goto wait_for_space;
+
+				sk_mem_charge(sk, skb->data_len);
+				skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
+			}
+
 			if (!sk_wmem_schedule(sk, copy))
 				goto wait_for_space;
 
@@ -1339,8 +1349,16 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			}
 			pfrag->offset += copy;
 		} else {
-			if (!sk_wmem_schedule(sk, copy))
-				goto wait_for_space;
+			/* First append to a fragless skb builds initial
+			 * pure zerocopy skb
+			 */
+			if (!skb->len)
+				skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
+
+			if (!skb_zcopy_pure(skb)) {
+				if (!sk_wmem_schedule(sk, copy))
+					goto wait_for_space;
+			}
 
 			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
 			if (err == -EMSGSIZE || err == -EEXIST) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6fbbf1558033..287b57aadc37 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1677,7 +1677,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	if (delta_truesize) {
 		skb->truesize	   -= delta_truesize;
 		sk_wmem_queued_add(sk, -delta_truesize);
-		sk_mem_uncharge(sk, delta_truesize);
+		if (!skb_zcopy_pure(skb))
+			sk_mem_uncharge(sk, delta_truesize);
 	}
 
 	/* Any change of skb->len requires recalculation of tso factor. */
@@ -2295,7 +2296,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
 		if (len <= skb->len)
 			break;
 
-		if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
+		if (unlikely(TCP_SKB_CB(skb)->eor) ||
+		    tcp_has_tx_tstamp(skb) ||
+		    !skb_pure_zcopy_same(skb, next))
 			return false;
 
 		len -= skb->len;
-- 
2.33.1.1089.g2158813163f-goog


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-10-30  2:06 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-30  2:05 [PATCH net-next v2 0/2] Accurate Memory Charging For MSG_ZEROCOPY Talal Ahmad
2021-10-30  2:05 ` [PATCH net-next v2 1/2] tcp: rename sk_wmem_free_skb Talal Ahmad
2021-10-30  2:05 ` [PATCH net-next v2 2/2] net: avoid double accounting for pure zerocopy skbs Talal Ahmad

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.