All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path
@ 2021-10-05 16:39 Paolo Abeni
  2021-10-05 19:49 ` kernel test robot
                   ` (5 more replies)
  0 siblings, 6 replies; 9+ messages in thread
From: Paolo Abeni @ 2021-10-05 16:39 UTC (permalink / raw)
  To: mptcp; +Cc: fwestpha

All the mptcp receive path is protected by the msk socket
spinlock. As a consequence, the tx path has to play a few tricks to allocate
the forward memory without acquiring the spinlock multiple times, making
the overall TX path quite complex.

This patch tries to clean-up a bit the tx path, using completely
separated fwd memory allocation, for the rx and the tx path.

The forward memory allocated in the rx path is now accounted in
msk->rmem_fwd_alloc and is (still) protected by the msk socket spinlock.

To cope with the above we provide a few MPTCP-specific variants of
the helpers to charge, uncharge, reclaim and free the forward
memory in the receive path.

msk->sk_forward_alloc now accounts only the forward memory for the tx path;
we can use the plain core sock helpers to manipulate it and drop quite a bit
of complexity.

On memory pressure reclaim both rx and tx fwd memory.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
Note: this would additionally require a core change to properly fetch
the forward allocated memory in inet_sk_diag_fill(). I think a new
'struct proto' ops would do:

struct proto {
	// ...
	int (*forward_alloc)(const struct sock *sk);
	// ...
};

/* Return the forward allocated memory for @sk.
 * Protocols that account forward memory outside sk->sk_forward_alloc
 * (e.g. MPTCP's separate rx accounting) supply a 'forward_alloc' op in
 * struct proto; fall back to the plain core field when the op is unset.
 */
static inline int sk_forward_alloc(const struct sock *sk)
{
	/* sk_prot is a struct proto pointer, so member access uses '->' */
	if (!sk->sk_prot->forward_alloc)
		return sk->sk_forward_alloc;
	return sk->sk_prot->forward_alloc(sk);
}
---
 net/mptcp/protocol.c | 219 ++++++++++++++++++-------------------------
 net/mptcp/protocol.h |  15 +--
 2 files changed, 90 insertions(+), 144 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 21716392e754..1d26462a1daf 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -126,6 +126,11 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
 	__kfree_skb(skb);
 }
 
+static void mptcp_rmem_charge(struct sock *sk, int size)
+{
+	mptcp_sk(sk)->rmem_fwd_alloc -= size;
+}
+
 static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
 			       struct sk_buff *from)
 {
@@ -142,7 +147,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
 	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
 	kfree_skb_partial(from, fragstolen);
 	atomic_add(delta, &sk->sk_rmem_alloc);
-	sk_mem_charge(sk, delta);
+	mptcp_rmem_charge(sk, delta);
 	return true;
 }
 
@@ -155,6 +160,50 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
 	return mptcp_try_coalesce((struct sock *)msk, to, from);
 }
 
+void __mptcp_rmem_reclaim(struct sock *sk, int amount)
+{
+	amount >>= SK_MEM_QUANTUM_SHIFT;
+	mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+	__sk_mem_reduce_allocated(sk, amount);
+}
+
+static void mptcp_rmem_uncharge(struct sock *sk, int size)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	int reclaimable;
+
+	msk->rmem_fwd_alloc += size;
+	reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
+
+	/* Avoid a possible overflow.
+	 * TCP send queues can make this happen, if sk_mem_reclaim()
+	 * is not called and more than 2 GBytes are released at once.
+	 *
+	 * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
+	* no need to hold that much forward allocation anyway.
+	*/
+	if (unlikely(reclaimable >= 1 << 21))
+		__mptcp_rmem_reclaim(sk, (1 << 20));
+}
+
+static void mptcp_rfree(struct sk_buff *skb)
+{
+	unsigned int len = skb->truesize;
+	struct sock *sk = skb->sk;
+
+	atomic_sub(len, &sk->sk_rmem_alloc);
+	mptcp_rmem_uncharge(sk, len);
+}
+
+static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = mptcp_rfree;
+	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+	mptcp_rmem_charge(sk, skb->truesize);
+}
+
 /* "inspired" by tcp_data_queue_ofo(), main differences:
  * - use mptcp seqs
  * - don't cope with sacks
@@ -267,7 +316,29 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
 
 end:
 	skb_condense(skb);
-	skb_set_owner_r(skb, sk);
+	mptcp_set_owner_r(skb, sk);
+}
+
+static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	int amt, amount;
+
+	if (size < msk->rmem_fwd_alloc)
+		return true;;
+
+	amt = sk_mem_pages(size);
+	amount = amt << SK_MEM_QUANTUM_SHIFT;
+	msk->rmem_fwd_alloc += amount;
+	if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
+		if (ssk->sk_forward_alloc < amount) {
+			msk->rmem_fwd_alloc -= amount;
+			return false;
+		}
+
+		ssk->sk_forward_alloc -= amount;
+	}
+	return true;
 }
 
 static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
@@ -285,15 +356,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
 	skb_orphan(skb);
 
 	/* try to fetch required memory from subflow */
-	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
-		int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT;
-
-		if (ssk->sk_forward_alloc < amount)
-			goto drop;
-
-		ssk->sk_forward_alloc -= amount;
-		sk->sk_forward_alloc += amount;
-	}
+	if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
+		goto drop;
 
 	has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
 
@@ -313,7 +377,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
 		if (tail && mptcp_try_coalesce(sk, tail, skb))
 			return true;
 
-		skb_set_owner_r(skb, sk);
+		mptcp_set_owner_r(skb, sk);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
 		return true;
 	} else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
@@ -908,122 +972,20 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
 		df->data_seq + df->data_len == msk->write_seq;
 }
 
-static int mptcp_wmem_with_overhead(int size)
-{
-	return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
-}
-
-static void __mptcp_wmem_reserve(struct sock *sk, int size)
-{
-	int amount = mptcp_wmem_with_overhead(size);
-	struct mptcp_sock *msk = mptcp_sk(sk);
-
-	WARN_ON_ONCE(msk->wmem_reserved);
-	if (WARN_ON_ONCE(amount < 0))
-		amount = 0;
-
-	if (amount <= sk->sk_forward_alloc)
-		goto reserve;
-
-	/* under memory pressure try to reserve at most a single page
-	 * otherwise try to reserve the full estimate and fallback
-	 * to a single page before entering the error path
-	 */
-	if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
-	    !sk_wmem_schedule(sk, amount)) {
-		if (amount <= PAGE_SIZE)
-			goto nomem;
-
-		amount = PAGE_SIZE;
-		if (!sk_wmem_schedule(sk, amount))
-			goto nomem;
-	}
-
-reserve:
-	msk->wmem_reserved = amount;
-	sk->sk_forward_alloc -= amount;
-	return;
-
-nomem:
-	/* we will wait for memory on next allocation */
-	msk->wmem_reserved = -1;
-}
-
-static void __mptcp_update_wmem(struct sock *sk)
+static void __mptcp_mem_reclaim_partial(struct sock *sk)
 {
-	struct mptcp_sock *msk = mptcp_sk(sk);
+	int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
 
 	lockdep_assert_held_once(&sk->sk_lock.slock);
 
-	if (!msk->wmem_reserved)
-		return;
-
-	if (msk->wmem_reserved < 0)
-		msk->wmem_reserved = 0;
-	if (msk->wmem_reserved > 0) {
-		sk->sk_forward_alloc += msk->wmem_reserved;
-		msk->wmem_reserved = 0;
-	}
-}
-
-static bool mptcp_wmem_alloc(struct sock *sk, int size)
-{
-	struct mptcp_sock *msk = mptcp_sk(sk);
-
-	/* check for pre-existing error condition */
-	if (msk->wmem_reserved < 0)
-		return false;
-
-	if (msk->wmem_reserved >= size)
-		goto account;
-
-	mptcp_data_lock(sk);
-	if (!sk_wmem_schedule(sk, size)) {
-		mptcp_data_unlock(sk);
-		return false;
-	}
-
-	sk->sk_forward_alloc -= size;
-	msk->wmem_reserved += size;
-	mptcp_data_unlock(sk);
-
-account:
-	msk->wmem_reserved -= size;
-	return true;
-}
-
-static void mptcp_wmem_uncharge(struct sock *sk, int size)
-{
-	struct mptcp_sock *msk = mptcp_sk(sk);
-
-	if (msk->wmem_reserved < 0)
-		msk->wmem_reserved = 0;
-	msk->wmem_reserved += size;
-}
-
-static void __mptcp_mem_reclaim_partial(struct sock *sk)
-{
-	lockdep_assert_held_once(&sk->sk_lock.slock);
-	__mptcp_update_wmem(sk);
+	__mptcp_rmem_reclaim(sk, reclaimable - 1);
 	sk_mem_reclaim_partial(sk);
 }
 
 static void mptcp_mem_reclaim_partial(struct sock *sk)
 {
-	struct mptcp_sock *msk = mptcp_sk(sk);
-
-	/* if we are experiencing a transint allocation error,
-	 * the forward allocation memory has been already
-	 * released
-	 */
-	if (msk->wmem_reserved < 0)
-		return;
-
 	mptcp_data_lock(sk);
-	sk->sk_forward_alloc += msk->wmem_reserved;
-	sk_mem_reclaim_partial(sk);
-	msk->wmem_reserved = sk->sk_forward_alloc;
-	sk->sk_forward_alloc = 0;
+	__mptcp_mem_reclaim_partial(sk);
 	mptcp_data_unlock(sk);
 }
 
@@ -1664,7 +1626,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
 	/* __mptcp_alloc_tx_skb could have released some wmem and we are
 	 * not going to flush it via release_sock()
 	 */
-	__mptcp_update_wmem(sk);
 	if (copied) {
 		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
 			 info.size_goal);
@@ -1701,7 +1662,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	/* silently ignore everything else */
 	msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
 
-	mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));
+	lock_sock(sk);
 
 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 
@@ -1749,17 +1710,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		psize = min_t(size_t, psize, msg_data_left(msg));
 		total_ts = psize + frag_truesize;
 
-		if (!mptcp_wmem_alloc(sk, total_ts))
+		if (!sk_wmem_schedule(sk, total_ts))
 			goto wait_for_memory;
 
 		if (copy_page_from_iter(dfrag->page, offset, psize,
 					&msg->msg_iter) != psize) {
-			mptcp_wmem_uncharge(sk, psize + frag_truesize);
 			ret = -EFAULT;
 			goto out;
 		}
 
 		/* data successfully copied into the write queue */
+		sk->sk_forward_alloc -= total_ts;
 		copied += psize;
 		dfrag->data_len += psize;
 		frag_truesize += psize;
@@ -1956,7 +1917,7 @@ static void __mptcp_update_rmem(struct sock *sk)
 		return;
 
 	atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
-	sk_mem_uncharge(sk, msk->rmem_released);
+	mptcp_rmem_uncharge(sk, msk->rmem_released);
 	WRITE_ONCE(msk->rmem_released, 0);
 }
 
@@ -2024,7 +1985,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	if (unlikely(flags & MSG_ERRQUEUE))
 		return inet_recv_error(sk, msg, len, addr_len);
 
-	mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
+	lock_sock(sk);
 	if (unlikely(sk->sk_state == TCP_LISTEN)) {
 		copied = -ENOTCONN;
 		goto out_err;
@@ -2504,7 +2465,6 @@ static int __mptcp_init_sock(struct sock *sk)
 	__skb_queue_head_init(&msk->receive_queue);
 	msk->out_of_order_queue = RB_ROOT;
 	msk->first_pending = NULL;
-	msk->wmem_reserved = 0;
 	WRITE_ONCE(msk->rmem_released, 0);
 	msk->timer_ival = TCP_RTO_MIN;
 
@@ -2719,7 +2679,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
 
 	sk->sk_prot->destroy(sk);
 
-	WARN_ON_ONCE(msk->wmem_reserved);
+	WARN_ON_ONCE(msk->rmem_fwd_alloc);
 	WARN_ON_ONCE(msk->rmem_released);
 	sk_stream_kill_queues(sk);
 	xfrm_sk_free_policy(sk);
@@ -2954,6 +2914,9 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
 
 	/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
 	skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
+	__skb_queue_purge(&sk->sk_receive_queue);
+	sk->sk_forward_alloc += msk->rmem_fwd_alloc;
+	msk->rmem_fwd_alloc = 0;
 
 	skb_rbtree_purge(&msk->out_of_order_queue);
 	mptcp_token_destroy(msk);
@@ -3037,10 +3000,6 @@ static void mptcp_release_cb(struct sock *sk)
 	if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
 		__mptcp_error_report(sk);
 
-	/* push_pending may touch wmem_reserved, ensure we do the cleanup
-	 * later
-	 */
-	__mptcp_update_wmem(sk);
 	__mptcp_update_rmem(sk);
 }
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 7379ab580a7e..cfb374634a83 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -227,7 +227,7 @@ struct mptcp_sock {
 	u64		ack_seq;
 	u64		rcv_wnd_sent;
 	u64		rcv_data_fin_seq;
-	int		wmem_reserved;
+	int		rmem_fwd_alloc;
 	struct sock	*last_snd;
 	int		snd_burst;
 	int		old_wspace;
@@ -272,19 +272,6 @@ struct mptcp_sock {
 	char		ca_name[TCP_CA_NAME_MAX];
 };
 
-#define mptcp_lock_sock(___sk, cb) do {					\
-	struct sock *__sk = (___sk); /* silence macro reuse warning */	\
-	might_sleep();							\
-	spin_lock_bh(&__sk->sk_lock.slock);				\
-	if (__sk->sk_lock.owned)					\
-		__lock_sock(__sk);					\
-	cb;								\
-	__sk->sk_lock.owned = 1;					\
-	spin_unlock(&__sk->sk_lock.slock);				\
-	mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_);		\
-	local_bh_enable();						\
-} while (0)
-
 #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
 #define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
 
-- 
2.26.3


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path
  2021-10-05 16:39 [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path Paolo Abeni
@ 2021-10-05 19:49 ` kernel test robot
  2021-10-05 23:07 ` Mat Martineau
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 9+ messages in thread
From: kernel test robot @ 2021-10-05 19:49 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2150 bytes --]

Hi Paolo,

[FYI, it's a private test report for your RFC patch.]
[auto build test WARNING on mptcp/export]
[also build test WARNING on linus/master v5.15-rc3 next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Paolo-Abeni/mptcp-allocate-fwd-memory-separatelly-on-the-rx-and-tx-path/20211006-004118
base:   https://github.com/multipath-tcp/mptcp_net-next.git export
config: ia64-defconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/a2933712a580c1907d1f4837a5deee90e8353c36
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Paolo-Abeni/mptcp-allocate-fwd-memory-separatelly-on-the-rx-and-tx-path/20211006-004118
        git checkout a2933712a580c1907d1f4837a5deee90e8353c36
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross ARCH=ia64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> net/mptcp/protocol.c:163:6: warning: no previous prototype for '__mptcp_rmem_reclaim' [-Wmissing-prototypes]
     163 | void __mptcp_rmem_reclaim(struct sock *sk, int amount)
         |      ^~~~~~~~~~~~~~~~~~~~


vim +/__mptcp_rmem_reclaim +163 net/mptcp/protocol.c

   162	
 > 163	void __mptcp_rmem_reclaim(struct sock *sk, int amount)
   164	{
   165		amount >>= SK_MEM_QUANTUM_SHIFT;
   166		mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
   167		__sk_mem_reduce_allocated(sk, amount);
   168	}
   169	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 19981 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path
  2021-10-05 16:39 [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path Paolo Abeni
  2021-10-05 19:49 ` kernel test robot
@ 2021-10-05 23:07 ` Mat Martineau
  2021-10-06  9:42   ` Paolo Abeni
  2021-10-06  1:30   ` kernel test robot
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 9+ messages in thread
From: Mat Martineau @ 2021-10-05 23:07 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: mptcp, fwestpha

On Tue, 5 Oct 2021, Paolo Abeni wrote:

> All the mptcp receive path is protected by the msk socket
> spinlock. As consequence, the tx path has to play a few to allocate
> the forward memory without acquiring the spinlock multile times, making
> the overall TX path quite complex.
>
> This patch tries to clean-up a bit the tx path, using completely
> separated fwd memory allocation, for the rx and the tx path.
>

Overall, it does seem like a cleaner approach than the existing 
wmem_reserved code. Do you think the existing code is related to any 
current bugs or performance issues?

> The forward memory allocated in the rx path is now accounted in
> msk->rmem_fwd_alloc and is (still) protected by the msk socket spinlock.
>
> To cope with the above we provied a few MPTCP-specific variant for
> the helpers to charge, uncharge, reclaim and free the forward
> memory in the receive path.
>
> msk->sk_forward_alloc now accounts only the forward memory for the tx path,
> we can the plain core sock helper to manipulate it and drop quite a bit
> of complexity.
>
> On memory pressure reclaim both rx and tx fwd memory.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> Note: this would additionally require a core change to properly fetch
> the forward allocated memory in inet_sk_diag_fill(). I think a new
> 'struct proto' ops would do:
>
> struct proto {
> 	// ...
> 	int (*forward_alloc)(const struct sock *sk);
> 	// ...
> };
>
> static inline sk_forward_alloc(const struct sock *sk)
> {
> 	if (!sk->sk_prot.forward_alloc)
> 		return sk->sk_forward_alloc;
> 	return sk->sk_prot.forward_alloc(sk);
> }

Seems ok to me to handle the diag info this way, hope upstream reviewers 
agree!

> ---
> net/mptcp/protocol.c | 219 ++++++++++++++++++-------------------------
> net/mptcp/protocol.h |  15 +--
> 2 files changed, 90 insertions(+), 144 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 21716392e754..1d26462a1daf 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -126,6 +126,11 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
> 	__kfree_skb(skb);
> }
>
> +static void mptcp_rmem_charge(struct sock *sk, int size)
> +{
> +	mptcp_sk(sk)->rmem_fwd_alloc -= size;
> +}
> +
> static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
> 			       struct sk_buff *from)
> {
> @@ -142,7 +147,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
> 	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
> 	kfree_skb_partial(from, fragstolen);
> 	atomic_add(delta, &sk->sk_rmem_alloc);
> -	sk_mem_charge(sk, delta);
> +	mptcp_rmem_charge(sk, delta);
> 	return true;
> }
>
> @@ -155,6 +160,50 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
> 	return mptcp_try_coalesce((struct sock *)msk, to, from);
> }
>
> +void __mptcp_rmem_reclaim(struct sock *sk, int amount)
> +{
> +	amount >>= SK_MEM_QUANTUM_SHIFT;
> +	mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
> +	__sk_mem_reduce_allocated(sk, amount);
> +}
> +
> +static void mptcp_rmem_uncharge(struct sock *sk, int size)
> +{
> +	struct mptcp_sock *msk = mptcp_sk(sk);
> +	int reclaimable;
> +
> +	msk->rmem_fwd_alloc += size;
> +	reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);

I was going to ask if this patch takes in to account the recent 
SO_RESERVE_MEM patchset's changes to sk_forward_alloc handling, but these 
changes mirror those in sk_mem_reclaim() so it seems the answer is "yes".

> +
> +	/* Avoid a possible overflow.
> +	 * TCP send queues can make this happen, if sk_mem_reclaim()
> +	 * is not called and more than 2 GBytes are released at once.
> +	 *
> +	 * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
> +	* no need to hold that much forward allocation anyway.
> +	*/

Minor indent issue in the above comment.

> +	if (unlikely(reclaimable >= 1 << 21))
> +		__mptcp_rmem_reclaim(sk, (1 << 20));

I realize these "1 << 21" and "1 << 20" values are copied from 
sk_mem_uncharge(). Should they be changed in both files to symbolic values 
defined in sock.h, to help keep the values matched if someone decides to 
adjust them in the future?

> +}
> +
> +static void mptcp_rfree(struct sk_buff *skb)
> +{
> +	unsigned int len = skb->truesize;
> +	struct sock *sk = skb->sk;
> +
> +	atomic_sub(len, &sk->sk_rmem_alloc);
> +	mptcp_rmem_uncharge(sk, len);
> +}
> +
> +static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
> +{
> +	skb_orphan(skb);
> +	skb->sk = sk;
> +	skb->destructor = mptcp_rfree;
> +	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
> +	mptcp_rmem_charge(sk, skb->truesize);
> +}
> +
> /* "inspired" by tcp_data_queue_ofo(), main differences:
>  * - use mptcp seqs
>  * - don't cope with sacks
> @@ -267,7 +316,29 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
>
> end:
> 	skb_condense(skb);
> -	skb_set_owner_r(skb, sk);
> +	mptcp_set_owner_r(skb, sk);
> +}
> +
> +static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
> +{
> +	struct mptcp_sock *msk = mptcp_sk(sk);
> +	int amt, amount;
> +
> +	if (size < msk->rmem_fwd_alloc)
> +		return true;;
> +
> +	amt = sk_mem_pages(size);
> +	amount = amt << SK_MEM_QUANTUM_SHIFT;
> +	msk->rmem_fwd_alloc += amount;
> +	if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
> +		if (ssk->sk_forward_alloc < amount) {
> +			msk->rmem_fwd_alloc -= amount;
> +			return false;
> +		}
> +
> +		ssk->sk_forward_alloc -= amount;
> +	}
> +	return true;
> }
>
> static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> @@ -285,15 +356,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> 	skb_orphan(skb);
>
> 	/* try to fetch required memory from subflow */
> -	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
> -		int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT;
> -
> -		if (ssk->sk_forward_alloc < amount)
> -			goto drop;
> -
> -		ssk->sk_forward_alloc -= amount;
> -		sk->sk_forward_alloc += amount;
> -	}
> +	if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
> +		goto drop;
>
> 	has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
>
> @@ -313,7 +377,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> 		if (tail && mptcp_try_coalesce(sk, tail, skb))
> 			return true;
>
> -		skb_set_owner_r(skb, sk);
> +		mptcp_set_owner_r(skb, sk);
> 		__skb_queue_tail(&sk->sk_receive_queue, skb);
> 		return true;
> 	} else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
> @@ -908,122 +972,20 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
> 		df->data_seq + df->data_len == msk->write_seq;
> }
>
> -static int mptcp_wmem_with_overhead(int size)
> -{
> -	return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
> -}
> -
> -static void __mptcp_wmem_reserve(struct sock *sk, int size)
> -{
> -	int amount = mptcp_wmem_with_overhead(size);
> -	struct mptcp_sock *msk = mptcp_sk(sk);
> -
> -	WARN_ON_ONCE(msk->wmem_reserved);
> -	if (WARN_ON_ONCE(amount < 0))
> -		amount = 0;
> -
> -	if (amount <= sk->sk_forward_alloc)
> -		goto reserve;
> -
> -	/* under memory pressure try to reserve at most a single page
> -	 * otherwise try to reserve the full estimate and fallback
> -	 * to a single page before entering the error path
> -	 */
> -	if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
> -	    !sk_wmem_schedule(sk, amount)) {
> -		if (amount <= PAGE_SIZE)
> -			goto nomem;
> -
> -		amount = PAGE_SIZE;
> -		if (!sk_wmem_schedule(sk, amount))
> -			goto nomem;
> -	}
> -
> -reserve:
> -	msk->wmem_reserved = amount;
> -	sk->sk_forward_alloc -= amount;
> -	return;
> -
> -nomem:
> -	/* we will wait for memory on next allocation */
> -	msk->wmem_reserved = -1;
> -}
> -
> -static void __mptcp_update_wmem(struct sock *sk)
> +static void __mptcp_mem_reclaim_partial(struct sock *sk)
> {
> -	struct mptcp_sock *msk = mptcp_sk(sk);
> +	int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
>
> 	lockdep_assert_held_once(&sk->sk_lock.slock);
>
> -	if (!msk->wmem_reserved)
> -		return;
> -
> -	if (msk->wmem_reserved < 0)
> -		msk->wmem_reserved = 0;
> -	if (msk->wmem_reserved > 0) {
> -		sk->sk_forward_alloc += msk->wmem_reserved;
> -		msk->wmem_reserved = 0;
> -	}
> -}
> -
> -static bool mptcp_wmem_alloc(struct sock *sk, int size)
> -{
> -	struct mptcp_sock *msk = mptcp_sk(sk);
> -
> -	/* check for pre-existing error condition */
> -	if (msk->wmem_reserved < 0)
> -		return false;
> -
> -	if (msk->wmem_reserved >= size)
> -		goto account;
> -
> -	mptcp_data_lock(sk);
> -	if (!sk_wmem_schedule(sk, size)) {
> -		mptcp_data_unlock(sk);
> -		return false;
> -	}
> -
> -	sk->sk_forward_alloc -= size;
> -	msk->wmem_reserved += size;
> -	mptcp_data_unlock(sk);
> -
> -account:
> -	msk->wmem_reserved -= size;
> -	return true;
> -}
> -
> -static void mptcp_wmem_uncharge(struct sock *sk, int size)
> -{
> -	struct mptcp_sock *msk = mptcp_sk(sk);
> -
> -	if (msk->wmem_reserved < 0)
> -		msk->wmem_reserved = 0;
> -	msk->wmem_reserved += size;
> -}
> -
> -static void __mptcp_mem_reclaim_partial(struct sock *sk)
> -{
> -	lockdep_assert_held_once(&sk->sk_lock.slock);
> -	__mptcp_update_wmem(sk);
> +	__mptcp_rmem_reclaim(sk, reclaimable - 1);
> 	sk_mem_reclaim_partial(sk);
> }
>
> static void mptcp_mem_reclaim_partial(struct sock *sk)
> {
> -	struct mptcp_sock *msk = mptcp_sk(sk);
> -
> -	/* if we are experiencing a transint allocation error,
> -	 * the forward allocation memory has been already
> -	 * released
> -	 */
> -	if (msk->wmem_reserved < 0)
> -		return;
> -
> 	mptcp_data_lock(sk);
> -	sk->sk_forward_alloc += msk->wmem_reserved;
> -	sk_mem_reclaim_partial(sk);
> -	msk->wmem_reserved = sk->sk_forward_alloc;
> -	sk->sk_forward_alloc = 0;
> +	__mptcp_mem_reclaim_partial(sk);
> 	mptcp_data_unlock(sk);
> }
>
> @@ -1664,7 +1626,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
> 	/* __mptcp_alloc_tx_skb could have released some wmem and we are
> 	 * not going to flush it via release_sock()
> 	 */
> -	__mptcp_update_wmem(sk);
> 	if (copied) {
> 		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
> 			 info.size_goal);
> @@ -1701,7 +1662,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> 	/* silently ignore everything else */
> 	msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
>
> -	mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));
> +	lock_sock(sk);
>
> 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
>
> @@ -1749,17 +1710,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> 		psize = min_t(size_t, psize, msg_data_left(msg));
> 		total_ts = psize + frag_truesize;
>
> -		if (!mptcp_wmem_alloc(sk, total_ts))
> +		if (!sk_wmem_schedule(sk, total_ts))
> 			goto wait_for_memory;
>
> 		if (copy_page_from_iter(dfrag->page, offset, psize,
> 					&msg->msg_iter) != psize) {
> -			mptcp_wmem_uncharge(sk, psize + frag_truesize);
> 			ret = -EFAULT;
> 			goto out;
> 		}
>
> 		/* data successfully copied into the write queue */
> +		sk->sk_forward_alloc -= total_ts;
> 		copied += psize;
> 		dfrag->data_len += psize;
> 		frag_truesize += psize;
> @@ -1956,7 +1917,7 @@ static void __mptcp_update_rmem(struct sock *sk)
> 		return;
>
> 	atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
> -	sk_mem_uncharge(sk, msk->rmem_released);
> +	mptcp_rmem_uncharge(sk, msk->rmem_released);
> 	WRITE_ONCE(msk->rmem_released, 0);
> }
>
> @@ -2024,7 +1985,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> 	if (unlikely(flags & MSG_ERRQUEUE))
> 		return inet_recv_error(sk, msg, len, addr_len);
>
> -	mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
> +	lock_sock(sk);
> 	if (unlikely(sk->sk_state == TCP_LISTEN)) {
> 		copied = -ENOTCONN;
> 		goto out_err;
> @@ -2504,7 +2465,6 @@ static int __mptcp_init_sock(struct sock *sk)
> 	__skb_queue_head_init(&msk->receive_queue);
> 	msk->out_of_order_queue = RB_ROOT;
> 	msk->first_pending = NULL;
> -	msk->wmem_reserved = 0;

msk->rmem_fwd_alloc init needs to be added, right?

> 	WRITE_ONCE(msk->rmem_released, 0);
> 	msk->timer_ival = TCP_RTO_MIN;
>
> @@ -2719,7 +2679,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
>
> 	sk->sk_prot->destroy(sk);
>
> -	WARN_ON_ONCE(msk->wmem_reserved);
> +	WARN_ON_ONCE(msk->rmem_fwd_alloc);
> 	WARN_ON_ONCE(msk->rmem_released);
> 	sk_stream_kill_queues(sk);
> 	xfrm_sk_free_policy(sk);
> @@ -2954,6 +2914,9 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
>
> 	/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
> 	skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
> +	__skb_queue_purge(&sk->sk_receive_queue);
> +	sk->sk_forward_alloc += msk->rmem_fwd_alloc;
> +	msk->rmem_fwd_alloc = 0;
>
> 	skb_rbtree_purge(&msk->out_of_order_queue);
> 	mptcp_token_destroy(msk);
> @@ -3037,10 +3000,6 @@ static void mptcp_release_cb(struct sock *sk)
> 	if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
> 		__mptcp_error_report(sk);
>
> -	/* push_pending may touch wmem_reserved, ensure we do the cleanup
> -	 * later
> -	 */
> -	__mptcp_update_wmem(sk);
> 	__mptcp_update_rmem(sk);
> }
>
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 7379ab580a7e..cfb374634a83 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -227,7 +227,7 @@ struct mptcp_sock {
> 	u64		ack_seq;
> 	u64		rcv_wnd_sent;
> 	u64		rcv_data_fin_seq;
> -	int		wmem_reserved;
> +	int		rmem_fwd_alloc;
> 	struct sock	*last_snd;
> 	int		snd_burst;
> 	int		old_wspace;
> @@ -272,19 +272,6 @@ struct mptcp_sock {
> 	char		ca_name[TCP_CA_NAME_MAX];
> };
>
> -#define mptcp_lock_sock(___sk, cb) do {					\
> -	struct sock *__sk = (___sk); /* silence macro reuse warning */	\
> -	might_sleep();							\
> -	spin_lock_bh(&__sk->sk_lock.slock);				\
> -	if (__sk->sk_lock.owned)					\
> -		__lock_sock(__sk);					\
> -	cb;								\
> -	__sk->sk_lock.owned = 1;					\
> -	spin_unlock(&__sk->sk_lock.slock);				\
> -	mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_);		\
> -	local_bh_enable();						\
> -} while (0)
> -
> #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
> #define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
>

--
Mat Martineau
Intel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path
  2021-10-05 16:39 [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path Paolo Abeni
@ 2021-10-06  1:30   ` kernel test robot
  2021-10-05 23:07 ` Mat Martineau
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 9+ messages in thread
From: kernel test robot @ 2021-10-06  1:30 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: llvm, kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2392 bytes --]

Hi Paolo,

[FYI, it's a private test report for your RFC patch.]
[auto build test WARNING on mptcp/export]
[also build test WARNING on linus/master v5.15-rc3 next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Paolo-Abeni/mptcp-allocate-fwd-memory-separatelly-on-the-rx-and-tx-path/20211006-004118
base:   https://github.com/multipath-tcp/mptcp_net-next.git export
config: i386-randconfig-a001-20211004 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project c0039de2953d15815448b4b3c3bafb45607781e0)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/a2933712a580c1907d1f4837a5deee90e8353c36
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Paolo-Abeni/mptcp-allocate-fwd-memory-separatelly-on-the-rx-and-tx-path/20211006-004118
        git checkout a2933712a580c1907d1f4837a5deee90e8353c36
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> net/mptcp/protocol.c:163:6: warning: no previous prototype for function '__mptcp_rmem_reclaim' [-Wmissing-prototypes]
   void __mptcp_rmem_reclaim(struct sock *sk, int amount)
        ^
   net/mptcp/protocol.c:163:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void __mptcp_rmem_reclaim(struct sock *sk, int amount)
   ^
   static 
   1 warning generated.


vim +/__mptcp_rmem_reclaim +163 net/mptcp/protocol.c

   162	
 > 163	void __mptcp_rmem_reclaim(struct sock *sk, int amount)
   164	{
   165		amount >>= SK_MEM_QUANTUM_SHIFT;
   166		mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
   167		__sk_mem_reduce_allocated(sk, amount);
   168	}
   169	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 38698 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path
@ 2021-10-06  1:30   ` kernel test robot
  0 siblings, 0 replies; 9+ messages in thread
From: kernel test robot @ 2021-10-06  1:30 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2444 bytes --]

Hi Paolo,

[FYI, it's a private test report for your RFC patch.]
[auto build test WARNING on mptcp/export]
[also build test WARNING on linus/master v5.15-rc3 next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Paolo-Abeni/mptcp-allocate-fwd-memory-separatelly-on-the-rx-and-tx-path/20211006-004118
base:   https://github.com/multipath-tcp/mptcp_net-next.git export
config: i386-randconfig-a001-20211004 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project c0039de2953d15815448b4b3c3bafb45607781e0)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/a2933712a580c1907d1f4837a5deee90e8353c36
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Paolo-Abeni/mptcp-allocate-fwd-memory-separatelly-on-the-rx-and-tx-path/20211006-004118
        git checkout a2933712a580c1907d1f4837a5deee90e8353c36
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> net/mptcp/protocol.c:163:6: warning: no previous prototype for function '__mptcp_rmem_reclaim' [-Wmissing-prototypes]
   void __mptcp_rmem_reclaim(struct sock *sk, int amount)
        ^
   net/mptcp/protocol.c:163:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void __mptcp_rmem_reclaim(struct sock *sk, int amount)
   ^
   static 
   1 warning generated.


vim +/__mptcp_rmem_reclaim +163 net/mptcp/protocol.c

   162	
 > 163	void __mptcp_rmem_reclaim(struct sock *sk, int amount)
   164	{
   165		amount >>= SK_MEM_QUANTUM_SHIFT;
   166		mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
   167		__sk_mem_reduce_allocated(sk, amount);
   168	}
   169	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 38698 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path
  2021-10-05 23:07 ` Mat Martineau
@ 2021-10-06  9:42   ` Paolo Abeni
  0 siblings, 0 replies; 9+ messages in thread
From: Paolo Abeni @ 2021-10-06  9:42 UTC (permalink / raw)
  To: Mat Martineau; +Cc: mptcp, fwestpha

On Tue, 2021-10-05 at 16:07 -0700, Mat Martineau wrote:
> On Tue, 5 Oct 2021, Paolo Abeni wrote:
> 
> > All the mptcp receive path is protected by the msk socket
> > spinlock. As a consequence, the tx path has to play a few tricks to allocate
> > the forward memory without acquiring the spinlock multiple times, making
> > the overall TX path quite complex.
> > 
> > This patch tries to clean-up a bit the tx path, using completely
> > separated fwd memory allocation, for the rx and the tx path.
> > 
> 
> Overall, it does seem like a cleaner approach than the existing 
> wmem_reserved code. Do you think the existing code is related to any 
> current bugs or performance issues?

None that I know of, so far. But I fear (I'm almost sure) that the
current wmem_reserved is prone to problems with memory pressure and
could perform badly in some scenarios. Overall the 'wmem_reserved'
looks like the ugliest spot we currently have in the mptcp
implementation.

I "feel" that an approach like this one should be more robust in the
long term.

> > The forward memory allocated in the rx path is now accounted in
> > msk->rmem_fwd_alloc and is (still) protected by the msk socket spinlock.
> > 
> > To cope with the above we provided a few MPTCP-specific variants for
> > the helpers to charge, uncharge, reclaim and free the forward
> > memory in the receive path.
> > 
> > msk->sk_forward_alloc now accounts only the forward memory for the tx path,
> > we can use the plain core sock helpers to manipulate it and drop quite a bit
> > of complexity.
> > 
> > On memory pressure reclaim both rx and tx fwd memory.
> > 
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > Note: this would additionally require a core change to properly fetch
> > the forward allocated memory in inet_sk_diag_fill(). I think a new
> > 'struct proto' ops would do:
> > 
> > struct proto {
> > 	// ...
> > 	int (*forward_alloc)(const struct sock *sk);
> > 	// ...
> > };
> > 
> > static inline sk_forward_alloc(const struct sock *sk)
> > {
> > 	if (!sk->sk_prot.forward_alloc)
> > 		return sk->sk_forward_alloc;
> > 	return sk->sk_prot.forward_alloc(sk);
> > }
> 
> Seems ok to me to handle the diag info this way, hope upstream reviewers 
> agree!

:)

> > ---
> > net/mptcp/protocol.c | 219 ++++++++++++++++++-------------------------
> > net/mptcp/protocol.h |  15 +--
> > 2 files changed, 90 insertions(+), 144 deletions(-)
> > 
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index 21716392e754..1d26462a1daf 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -126,6 +126,11 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
> > 	__kfree_skb(skb);
> > }
> > 
> > +static void mptcp_rmem_charge(struct sock *sk, int size)
> > +{
> > +	mptcp_sk(sk)->rmem_fwd_alloc -= size;
> > +}
> > +
> > static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
> > 			       struct sk_buff *from)
> > {
> > @@ -142,7 +147,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
> > 	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
> > 	kfree_skb_partial(from, fragstolen);
> > 	atomic_add(delta, &sk->sk_rmem_alloc);
> > -	sk_mem_charge(sk, delta);
> > +	mptcp_rmem_charge(sk, delta);
> > 	return true;
> > }
> > 
> > @@ -155,6 +160,50 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
> > 	return mptcp_try_coalesce((struct sock *)msk, to, from);
> > }
> > 
> > +void __mptcp_rmem_reclaim(struct sock *sk, int amount)
> > +{
> > +	amount >>= SK_MEM_QUANTUM_SHIFT;
> > +	mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
> > +	__sk_mem_reduce_allocated(sk, amount);
> > +}
> > +
> > +static void mptcp_rmem_uncharge(struct sock *sk, int size)
> > +{
> > +	struct mptcp_sock *msk = mptcp_sk(sk);
> > +	int reclaimable;
> > +
> > +	msk->rmem_fwd_alloc += size;
> > +	reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
> 
> I was going to ask if this patch takes in to account the recent 
> SO_RESERVE_MEM patchset's changes to sk_forward_alloc handling, but these 
> changes mirror those in sk_mem_reclaim() so it seems the answer is "yes".

Indeed!

> > +
> > +	/* Avoid a possible overflow.
> > +	 * TCP send queues can make this happen, if sk_mem_reclaim()
> > +	 * is not called and more than 2 GBytes are released at once.
> > +	 *
> > +	 * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
> > +	* no need to hold that much forward allocation anyway.
> > +	*/
> 
> Minor indent issue in the above comment.
> 
> > +	if (unlikely(reclaimable >= 1 << 21))
> > +		__mptcp_rmem_reclaim(sk, (1 << 20));
> 
> I realize these "1 << 21" and "1 << 20" values are copied from 
> sk_mem_uncharge(). Should they be changed in both files to symbolic values 
> defined in sock.h, to help keep the values matched if someone decides to 
> adjust them in the future?

I'll define some macro for that.

> > +}
> > +
> > +static void mptcp_rfree(struct sk_buff *skb)
> > +{
> > +	unsigned int len = skb->truesize;
> > +	struct sock *sk = skb->sk;
> > +
> > +	atomic_sub(len, &sk->sk_rmem_alloc);
> > +	mptcp_rmem_uncharge(sk, len);
> > +}
> > +
> > +static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
> > +{
> > +	skb_orphan(skb);
> > +	skb->sk = sk;
> > +	skb->destructor = mptcp_rfree;
> > +	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
> > +	mptcp_rmem_charge(sk, skb->truesize);
> > +}
> > +
> > /* "inspired" by tcp_data_queue_ofo(), main differences:
> >  * - use mptcp seqs
> >  * - don't cope with sacks
> > @@ -267,7 +316,29 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
> > 
> > end:
> > 	skb_condense(skb);
> > -	skb_set_owner_r(skb, sk);
> > +	mptcp_set_owner_r(skb, sk);
> > +}
> > +
> > +static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
> > +{
> > +	struct mptcp_sock *msk = mptcp_sk(sk);
> > +	int amt, amount;
> > +
> > +	if (size < msk->rmem_fwd_alloc)
> > +		return true;;
> > +
> > +	amt = sk_mem_pages(size);
> > +	amount = amt << SK_MEM_QUANTUM_SHIFT;
> > +	msk->rmem_fwd_alloc += amount;
> > +	if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
> > +		if (ssk->sk_forward_alloc < amount) {
> > +			msk->rmem_fwd_alloc -= amount;
> > +			return false;
> > +		}
> > +
> > +		ssk->sk_forward_alloc -= amount;
> > +	}
> > +	return true;
> > }
> > 
> > static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> > @@ -285,15 +356,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> > 	skb_orphan(skb);
> > 
> > 	/* try to fetch required memory from subflow */
> > -	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
> > -		int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT;
> > -
> > -		if (ssk->sk_forward_alloc < amount)
> > -			goto drop;
> > -
> > -		ssk->sk_forward_alloc -= amount;
> > -		sk->sk_forward_alloc += amount;
> > -	}
> > +	if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
> > +		goto drop;
> > 
> > 	has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
> > 
> > @@ -313,7 +377,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> > 		if (tail && mptcp_try_coalesce(sk, tail, skb))
> > 			return true;
> > 
> > -		skb_set_owner_r(skb, sk);
> > +		mptcp_set_owner_r(skb, sk);
> > 		__skb_queue_tail(&sk->sk_receive_queue, skb);
> > 		return true;
> > 	} else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
> > @@ -908,122 +972,20 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
> > 		df->data_seq + df->data_len == msk->write_seq;
> > }
> > 
> > -static int mptcp_wmem_with_overhead(int size)
> > -{
> > -	return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
> > -}
> > -
> > -static void __mptcp_wmem_reserve(struct sock *sk, int size)
> > -{
> > -	int amount = mptcp_wmem_with_overhead(size);
> > -	struct mptcp_sock *msk = mptcp_sk(sk);
> > -
> > -	WARN_ON_ONCE(msk->wmem_reserved);
> > -	if (WARN_ON_ONCE(amount < 0))
> > -		amount = 0;
> > -
> > -	if (amount <= sk->sk_forward_alloc)
> > -		goto reserve;
> > -
> > -	/* under memory pressure try to reserve at most a single page
> > -	 * otherwise try to reserve the full estimate and fallback
> > -	 * to a single page before entering the error path
> > -	 */
> > -	if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
> > -	    !sk_wmem_schedule(sk, amount)) {
> > -		if (amount <= PAGE_SIZE)
> > -			goto nomem;
> > -
> > -		amount = PAGE_SIZE;
> > -		if (!sk_wmem_schedule(sk, amount))
> > -			goto nomem;
> > -	}
> > -
> > -reserve:
> > -	msk->wmem_reserved = amount;
> > -	sk->sk_forward_alloc -= amount;
> > -	return;
> > -
> > -nomem:
> > -	/* we will wait for memory on next allocation */
> > -	msk->wmem_reserved = -1;
> > -}
> > -
> > -static void __mptcp_update_wmem(struct sock *sk)
> > +static void __mptcp_mem_reclaim_partial(struct sock *sk)
> > {
> > -	struct mptcp_sock *msk = mptcp_sk(sk);
> > +	int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
> > 
> > 	lockdep_assert_held_once(&sk->sk_lock.slock);
> > 
> > -	if (!msk->wmem_reserved)
> > -		return;
> > -
> > -	if (msk->wmem_reserved < 0)
> > -		msk->wmem_reserved = 0;
> > -	if (msk->wmem_reserved > 0) {
> > -		sk->sk_forward_alloc += msk->wmem_reserved;
> > -		msk->wmem_reserved = 0;
> > -	}
> > -}
> > -
> > -static bool mptcp_wmem_alloc(struct sock *sk, int size)
> > -{
> > -	struct mptcp_sock *msk = mptcp_sk(sk);
> > -
> > -	/* check for pre-existing error condition */
> > -	if (msk->wmem_reserved < 0)
> > -		return false;
> > -
> > -	if (msk->wmem_reserved >= size)
> > -		goto account;
> > -
> > -	mptcp_data_lock(sk);
> > -	if (!sk_wmem_schedule(sk, size)) {
> > -		mptcp_data_unlock(sk);
> > -		return false;
> > -	}
> > -
> > -	sk->sk_forward_alloc -= size;
> > -	msk->wmem_reserved += size;
> > -	mptcp_data_unlock(sk);
> > -
> > -account:
> > -	msk->wmem_reserved -= size;
> > -	return true;
> > -}
> > -
> > -static void mptcp_wmem_uncharge(struct sock *sk, int size)
> > -{
> > -	struct mptcp_sock *msk = mptcp_sk(sk);
> > -
> > -	if (msk->wmem_reserved < 0)
> > -		msk->wmem_reserved = 0;
> > -	msk->wmem_reserved += size;
> > -}
> > -
> > -static void __mptcp_mem_reclaim_partial(struct sock *sk)
> > -{
> > -	lockdep_assert_held_once(&sk->sk_lock.slock);
> > -	__mptcp_update_wmem(sk);
> > +	__mptcp_rmem_reclaim(sk, reclaimable - 1);
> > 	sk_mem_reclaim_partial(sk);
> > }
> > 
> > static void mptcp_mem_reclaim_partial(struct sock *sk)
> > {
> > -	struct mptcp_sock *msk = mptcp_sk(sk);
> > -
> > -	/* if we are experiencing a transint allocation error,
> > -	 * the forward allocation memory has been already
> > -	 * released
> > -	 */
> > -	if (msk->wmem_reserved < 0)
> > -		return;
> > -
> > 	mptcp_data_lock(sk);
> > -	sk->sk_forward_alloc += msk->wmem_reserved;
> > -	sk_mem_reclaim_partial(sk);
> > -	msk->wmem_reserved = sk->sk_forward_alloc;
> > -	sk->sk_forward_alloc = 0;
> > +	__mptcp_mem_reclaim_partial(sk);
> > 	mptcp_data_unlock(sk);
> > }
> > 
> > @@ -1664,7 +1626,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
> > 	/* __mptcp_alloc_tx_skb could have released some wmem and we are
> > 	 * not going to flush it via release_sock()
> > 	 */
> > -	__mptcp_update_wmem(sk);
> > 	if (copied) {
> > 		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
> > 			 info.size_goal);
> > @@ -1701,7 +1662,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> > 	/* silently ignore everything else */
> > 	msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
> > 
> > -	mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));
> > +	lock_sock(sk);
> > 
> > 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
> > 
> > @@ -1749,17 +1710,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> > 		psize = min_t(size_t, psize, msg_data_left(msg));
> > 		total_ts = psize + frag_truesize;
> > 
> > -		if (!mptcp_wmem_alloc(sk, total_ts))
> > +		if (!sk_wmem_schedule(sk, total_ts))
> > 			goto wait_for_memory;
> > 
> > 		if (copy_page_from_iter(dfrag->page, offset, psize,
> > 					&msg->msg_iter) != psize) {
> > -			mptcp_wmem_uncharge(sk, psize + frag_truesize);
> > 			ret = -EFAULT;
> > 			goto out;
> > 		}
> > 
> > 		/* data successfully copied into the write queue */
> > +		sk->sk_forward_alloc -= total_ts;
> > 		copied += psize;
> > 		dfrag->data_len += psize;
> > 		frag_truesize += psize;
> > @@ -1956,7 +1917,7 @@ static void __mptcp_update_rmem(struct sock *sk)
> > 		return;
> > 
> > 	atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
> > -	sk_mem_uncharge(sk, msk->rmem_released);
> > +	mptcp_rmem_uncharge(sk, msk->rmem_released);
> > 	WRITE_ONCE(msk->rmem_released, 0);
> > }
> > 
> > @@ -2024,7 +1985,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> > 	if (unlikely(flags & MSG_ERRQUEUE))
> > 		return inet_recv_error(sk, msg, len, addr_len);
> > 
> > -	mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
> > +	lock_sock(sk);
> > 	if (unlikely(sk->sk_state == TCP_LISTEN)) {
> > 		copied = -ENOTCONN;
> > 		goto out_err;
> > @@ -2504,7 +2465,6 @@ static int __mptcp_init_sock(struct sock *sk)
> > 	__skb_queue_head_init(&msk->receive_queue);
> > 	msk->out_of_order_queue = RB_ROOT;
> > 	msk->first_pending = NULL;
> > -	msk->wmem_reserved = 0;
> 
> msk->rmem_fwd_alloc init needs to be added, right?

Right. Not sure why the self-tests did not show any issues?!

Thanks!

Paolo


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path
  2021-10-05 16:39 [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path Paolo Abeni
                   ` (2 preceding siblings ...)
  2021-10-06  1:30   ` kernel test robot
@ 2021-10-12  9:33 ` Matthieu Baerts
  2021-10-14 17:03 ` mptcp: allocate fwd memory separatelly on the rx and tx path: Build Failure MPTCP CI
  2021-10-14 17:53 ` mptcp: allocate fwd memory separatelly on the rx and tx path: Tests Results MPTCP CI
  5 siblings, 0 replies; 9+ messages in thread
From: Matthieu Baerts @ 2021-10-12  9:33 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: MPTCP Upstream, Davide Caratti

Hi Paolo,

On 05/10/2021 18:39, Paolo Abeni wrote:
> All the mptcp receive path is protected by the msk socket
> spinlock. As a consequence, the tx path has to play a few tricks to allocate
> the forward memory without acquiring the spinlock multiple times, making
> the overall TX path quite complex.
> 
> This patch tries to clean-up a bit the tx path, using completely
> separated fwd memory allocation, for the rx and the tx path.
> 
> The forward memory allocated in the rx path is now accounted in
> msk->rmem_fwd_alloc and is (still) protected by the msk socket spinlock.
> 
> To cope with the above we provided a few MPTCP-specific variants for
> the helpers to charge, uncharge, reclaim and free the forward
> memory in the receive path.
> 
> msk->sk_forward_alloc now accounts only the forward memory for the tx path,
> we can use the plain core sock helpers to manipulate it and drop quite a bit
> of complexity.
> 
> On memory pressure reclaim both rx and tx fwd memory.

Thank you for looking at that!

(...)

> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 21716392e754..1d26462a1daf 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c

(...)

> @@ -155,6 +160,50 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
>  	return mptcp_try_coalesce((struct sock *)msk, to, from);
>  }
>  
> +void __mptcp_rmem_reclaim(struct sock *sk, int amount)

The public CI complained about this:

ERROR: Unable to compile mptcp source code with W=1 C=1:

  net/mptcp/protocol.c:163:6: error: no previous prototype for
'__mptcp_rmem_reclaim' [-Werror=missing-prototypes]
    163 | void __mptcp_rmem_reclaim(struct sock *sk, int amount)
        |      ^~~~~~~~~~~~~~~~~~~~

https://github.com/multipath-tcp/mptcp_net-next/runs/3868191624?check_suite_focus=true

As confirmed by Davide on IRC, marking the function as 'static' fixes
the error.

Cheers,
Matt
-- 
Tessares | Belgium | Hybrid Access Solutions
www.tessares.net

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: mptcp: allocate fwd memory separatelly on the rx and tx path: Build Failure
  2021-10-05 16:39 [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path Paolo Abeni
                   ` (3 preceding siblings ...)
  2021-10-12  9:33 ` Matthieu Baerts
@ 2021-10-14 17:03 ` MPTCP CI
  2021-10-14 17:53 ` mptcp: allocate fwd memory separatelly on the rx and tx path: Tests Results MPTCP CI
  5 siblings, 0 replies; 9+ messages in thread
From: MPTCP CI @ 2021-10-14 17:03 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: mptcp

Hi Paolo Abeni,

Thank you for your modifications, that's great!

But sadly, our CI spotted some issues with it when trying to build it.

You can find more details there:

  https://github.com/multipath-tcp/mptcp_net-next/actions/runs/1342749505

Status: failure
Initiator: matttbe
Ref: patchew/cf3710062ecac93e469306b3ed59a70688d36620.1633451893.git.pabeni@redhat.com
Commits: https://github.com/multipath-tcp/mptcp_net-next/commits/patchew/cf3710062ecac93e469306b3ed59a70688d36620.1633451893.git.pabeni@redhat.com
Tree: https://github.com/multipath-tcp/mptcp_net-next/tree/patchew/cf3710062ecac93e469306b3ed59a70688d36620.1633451893.git.pabeni@redhat.com

Feel free to reply to this email if you cannot access logs, if you need
some support to fix the error, if this doesn't seem to be caused by your
modifications or if the error is a false positive one.

Cheers,
MPTCP GH Action bot

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: mptcp: allocate fwd memory separatelly on the rx and tx path: Tests Results
  2021-10-05 16:39 [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path Paolo Abeni
                   ` (4 preceding siblings ...)
  2021-10-14 17:03 ` mptcp: allocate fwd memory separatelly on the rx and tx path: Build Failure MPTCP CI
@ 2021-10-14 17:53 ` MPTCP CI
  5 siblings, 0 replies; 9+ messages in thread
From: MPTCP CI @ 2021-10-14 17:53 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: mptcp

Hi Paolo Abeni,

Thank you for your modifications, that's great!

Our CI did some validations and here is its report:

- KVM Validation: normal: Unstable: 2 failed test(s): selftest_mptcp_join selftest_simult_flows 🔴:
  - Task: https://cirrus-ci.com/task/4843090516115456
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/4843090516115456/summary/summary.txt

- KVM Validation: debug: Unstable: 3 failed test(s): selftest_diag selftest_mptcp_join selftest_simult_flows 🔴:
  - Task: https://cirrus-ci.com/task/5968990422958080
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/5968990422958080/summary/summary.txt

Initiator: ghost
Ref: patchew/cf3710062ecac93e469306b3ed59a70688d36620.1633451893.git.pabeni@redhat.com
Commits: https://github.com/multipath-tcp/mptcp_net-next/commits/patchew/cf3710062ecac93e469306b3ed59a70688d36620.1633451893.git.pabeni@redhat.com
Tree: https://github.com/multipath-tcp/mptcp_net-next/tree/patchew/cf3710062ecac93e469306b3ed59a70688d36620.1633451893.git.pabeni@redhat.com

Please note that some efforts might still be needed to stabilise the tests
suite when ran on a public CI like here. It is then possible some reported
issues are not due to your modifications. Do not hesitate to help us
improving that ;-)

Cheers,
MPTCP GH Action bot

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-10-14 17:53 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-05 16:39 [RFC PATCH] mptcp: allocate fwd memory separatelly on the rx and tx path Paolo Abeni
2021-10-05 19:49 ` kernel test robot
2021-10-05 23:07 ` Mat Martineau
2021-10-06  9:42   ` Paolo Abeni
2021-10-06  1:30 ` kernel test robot
2021-10-06  1:30   ` kernel test robot
2021-10-12  9:33 ` Matthieu Baerts
2021-10-14 17:03 ` mptcp: allocate fwd memory separatelly on the rx and tx path: Build Failure MPTCP CI
2021-10-14 17:53 ` mptcp: allocate fwd memory separatelly on the rx and tx path: Tests Results MPTCP CI

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.