All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net] net/smc: Fix expected buffersizes and sync logic
@ 2022-11-23 10:49 Jan Karcher
  2022-11-23 11:53 ` Tony Lu
  0 siblings, 1 reply; 13+ messages in thread
From: Jan Karcher @ 2022-11-23 10:49 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, Alexandra Winter,
	Wenjia Zhang, Thorsten Winkler, Stefan Raspl, Karsten Graul,
	Jan Karcher, Tony Lu

The fixed commit changed the expected behavior of buffersizes
set by the user using the setsockopt mechanism.
Before the fixed patch the logic for determining the buffersizes used
was the following:

default  = net.ipv4.tcp_{w|r}mem[1]
sockopt  = the setsockopt mechanism
val      = the value assigned in default or via setsockopt
sk_buf   = short for sk_{snd|rcv}buf
real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)

  exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
            |                  |                |
+---------+ |                  | +------------+ | +-------------------+
| default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
+---------+ |                  | +------------+ | +-------------------+
            |                  |                |    ^
            |                  |                |    |
+---------+ | +--------------+ |                |    |
| sockopt |---| sk_buf=val*2 |-----------------------|
+---------+ | +--------------+ |                |
            |                  |                |

The fixed patch introduced a dedicated sysctl for smc
and removed the /2 in smc_core.c resulting in the following flow:

default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
sockopt  = the setsockopt mechanism
val      = the value assigned in default or via setsockopt
sk_buf   = short for sk_{snd|rcv}buf
real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)

  exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
            |                  |                |
+---------+ |                  | +------------+ | +-----------------+
| default |----------------------| sk_buf=val |---| real_buf=sk_buf |
+---------+ |                  | +------------+ | +-----------------+
            |                  |                |    ^
            |                  |                |    |
+---------+ | +--------------+ |                |    |
| sockopt |---| sk_buf=val*2 |-----------------------|
+---------+ | +--------------+ |                |
            |                  |                |

This would result in double of memory used for existing configurations
that are using setsockopt.

SMC historically decided to use the explicit value given by the user
to allocate the memory. This is why we used the /2 in smc_core.c.
That logic was not applied to the default value.
Since we now have our own sysctl, which is also exposed to the user,
we should sync the logic in a way that both values are the real value
used by our code and shown by smc_stats. To achieve this this patch
changes the behavior to:

default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
sockopt  = the setsockopt mechanism
val      = the value assigned in default or via setsockopt
sk_buf   = short for sk_{snd|rcv}buf
real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)

  exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
            |                  |                 |
+---------+ |                  | +-------------+ | +-----------------+
| default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
+---------+ |                  | +-------------+ | +-----------------+
            |                  |                 |    ^
            |                  |                 |    |
+---------+ | +--------------+ |                 |    |
| sockopt |---| sk_buf=val*2 |------------------------|
+---------+ | +--------------+ |                 |
            |                  |                 |

This way both paths follow the same pattern and the expected behavior
is re-established.

Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
---
 net/smc/af_smc.c   | 9 +++++++--
 net/smc/smc_core.c | 8 ++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 036532cf39aa..a8c84e7bac99 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
 				   int protocol)
 {
+	int buffersize_without_overhead;
 	struct smc_sock *smc;
 	struct proto *prot;
 	struct sock *sk;
@@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
 	sk->sk_state = SMC_INIT;
 	sk->sk_destruct = smc_destruct;
 	sk->sk_protocol = protocol;
-	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
-	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
+	buffersize_without_overhead =
+		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
+	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
+	buffersize_without_overhead =
+		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
+	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
 	smc = smc_sk(sk);
 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 	INIT_WORK(&smc->connect_work, smc_connect_work);
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 00fb352c2765..36850a2ae167 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
 
 	if (is_rmb)
 		/* use socket recv buffer size (w/o overhead) as start value */
-		sk_buf_size = smc->sk.sk_rcvbuf;
+		sk_buf_size = smc->sk.sk_rcvbuf / 2;
 	else
 		/* use socket send buffer size (w/o overhead) as start value */
-		sk_buf_size = smc->sk.sk_sndbuf;
+		sk_buf_size = smc->sk.sk_sndbuf / 2;
 
 	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
 	     bufsize_short >= 0; bufsize_short--) {
@@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
 	if (is_rmb) {
 		conn->rmb_desc = buf_desc;
 		conn->rmbe_size_short = bufsize_short;
-		smc->sk.sk_rcvbuf = bufsize;
+		smc->sk.sk_rcvbuf = bufsize * 2;
 		atomic_set(&conn->bytes_to_rcv, 0);
 		conn->rmbe_update_limit =
 			smc_rmb_wnd_update_limit(buf_desc->len);
@@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
 			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
 	} else {
 		conn->sndbuf_desc = buf_desc;
-		smc->sk.sk_sndbuf = bufsize;
+		smc->sk.sk_sndbuf = bufsize * 2;
 		atomic_set(&conn->sndbuf_space, bufsize);
 	}
 	return 0;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH net] net/smc: Fix expected buffersizes and sync logic
  2022-11-23 10:49 [PATCH net] net/smc: Fix expected buffersizes and sync logic Jan Karcher
@ 2022-11-23 11:53 ` Tony Lu
  2022-11-23 13:13   ` Jan Karcher
  0 siblings, 1 reply; 13+ messages in thread
From: Tony Lu @ 2022-11-23 11:53 UTC (permalink / raw)
  To: Jan Karcher
  Cc: David Miller, Jakub Kicinski, netdev, linux-s390, Heiko Carstens,
	Alexandra Winter, Wenjia Zhang, Thorsten Winkler, Stefan Raspl,
	Karsten Graul

On Wed, Nov 23, 2022 at 11:49:07AM +0100, Jan Karcher wrote:
> The fixed commit changed the expected behavior of buffersizes
> set by the user using the setsockopt mechanism.
> Before the fixed patch the logic for determining the buffersizes used
> was the following:
> 
> default  = net.ipv4.tcp_{w|r}mem[1]
> sockopt  = the setsockopt mechanism
> val      = the value assigned in default or via setsockopt
> sk_buf   = short for sk_{snd|rcv}buf
> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> 
>   exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>             |                  |                |
> +---------+ |                  | +------------+ | +-------------------+
> | default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
> +---------+ |                  | +------------+ | +-------------------+
>             |                  |                |    ^
>             |                  |                |    |
> +---------+ | +--------------+ |                |    |
> | sockopt |---| sk_buf=val*2 |-----------------------|
> +---------+ | +--------------+ |                |
>             |                  |                |
> 
> The fixed patch introduced a dedicated sysctl for smc
> and removed the /2 in smc_core.c resulting in the following flow:
> 
> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
> sockopt  = the setsockopt mechanism
> val      = the value assigned in default or via setsockopt
> sk_buf   = short for sk_{snd|rcv}buf
> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> 
>   exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>             |                  |                |
> +---------+ |                  | +------------+ | +-----------------+
> | default |----------------------| sk_buf=val |---| real_buf=sk_buf |
> +---------+ |                  | +------------+ | +-----------------+
>             |                  |                |    ^
>             |                  |                |    |
> +---------+ | +--------------+ |                |    |
> | sockopt |---| sk_buf=val*2 |-----------------------|
> +---------+ | +--------------+ |                |
>             |                  |                |
> 
> This would result in double of memory used for existing configurations
> that are using setsockopt.

Firstly, thanks for your detailed diagrams :-)

And the original decision to use user-provided values rather than
value/2 to follow the instructions of the socket manual [1].

  SO_RCVBUF
         Sets or gets the maximum socket receive buffer in bytes.
         The kernel doubles this value (to allow space for
         bookkeeping overhead) when it is set using setsockopt(2),
         and this doubled value is returned by getsockopt(2).  The
         default value is set by the
         /proc/sys/net/core/rmem_default file, and the maximum
         allowed value is set by the /proc/sys/net/core/rmem_max
         file.  The minimum (doubled) value for this option is 256.

[1] https://man7.org/linux/man-pages/man7/socket.7.html

The user of SMC should know that setsockopt() with SO_{RCV|SND}BUF will
double the values in kernel, and getsockopt() will return the doubled
values. So that they should use half of the values which are passed to
setsockopt(). The original patch tries to make things easier in SMC and
let user-space to handle them following the socket manual.

> SMC historically decided to use the explicit value given by the user
> to allocate the memory. This is why we used the /2 in smc_core.c.
> That logic was not applied to the default value.

Yep, let back to the patch which introduced smc_{w|r}mem knobs, it's a
trade-off to follow original logic of SMC, or follow the socket manual.
We decides to follow the instruction of manuals in the end.

Cheers,
Tony Lu

> Since we now have our own sysctl, which is also exposed to the user,
> we should sync the logic in a way that both values are the real value
> used by our code and shown by smc_stats. To achieve this this patch
> changes the behavior to:
> 
> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
> sockopt  = the setsockopt mechanism
> val      = the value assigned in default or via setsockopt
> sk_buf   = short for sk_{snd|rcv}buf
> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> 
>   exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
>             |                  |                 |
> +---------+ |                  | +-------------+ | +-----------------+
> | default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
> +---------+ |                  | +-------------+ | +-----------------+
>             |                  |                 |    ^
>             |                  |                 |    |
> +---------+ | +--------------+ |                 |    |
> | sockopt |---| sk_buf=val*2 |------------------------|
> +---------+ | +--------------+ |                 |
>             |                  |                 |
> 
> This way both paths follow the same pattern and the expected behavior
> is re-established.
> 
> Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
> ---
>  net/smc/af_smc.c   | 9 +++++++--
>  net/smc/smc_core.c | 8 ++++----
>  2 files changed, 11 insertions(+), 6 deletions(-)
> 
> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> index 036532cf39aa..a8c84e7bac99 100644
> --- a/net/smc/af_smc.c
> +++ b/net/smc/af_smc.c
> @@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
>  static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>  				   int protocol)
>  {
> +	int buffersize_without_overhead;
>  	struct smc_sock *smc;
>  	struct proto *prot;
>  	struct sock *sk;
> @@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>  	sk->sk_state = SMC_INIT;
>  	sk->sk_destruct = smc_destruct;
>  	sk->sk_protocol = protocol;
> -	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
> -	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
> +	buffersize_without_overhead =
> +		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
> +	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
> +	buffersize_without_overhead =
> +		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
> +	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
>  	smc = smc_sk(sk);
>  	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
>  	INIT_WORK(&smc->connect_work, smc_connect_work);
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> index 00fb352c2765..36850a2ae167 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>  
>  	if (is_rmb)
>  		/* use socket recv buffer size (w/o overhead) as start value */
> -		sk_buf_size = smc->sk.sk_rcvbuf;
> +		sk_buf_size = smc->sk.sk_rcvbuf / 2;
>  	else
>  		/* use socket send buffer size (w/o overhead) as start value */
> -		sk_buf_size = smc->sk.sk_sndbuf;
> +		sk_buf_size = smc->sk.sk_sndbuf / 2;
>  
>  	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
>  	     bufsize_short >= 0; bufsize_short--) {
> @@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>  	if (is_rmb) {
>  		conn->rmb_desc = buf_desc;
>  		conn->rmbe_size_short = bufsize_short;
> -		smc->sk.sk_rcvbuf = bufsize;
> +		smc->sk.sk_rcvbuf = bufsize * 2;
>  		atomic_set(&conn->bytes_to_rcv, 0);
>  		conn->rmbe_update_limit =
>  			smc_rmb_wnd_update_limit(buf_desc->len);
> @@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>  			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
>  	} else {
>  		conn->sndbuf_desc = buf_desc;
> -		smc->sk.sk_sndbuf = bufsize;
> +		smc->sk.sk_sndbuf = bufsize * 2;
>  		atomic_set(&conn->sndbuf_space, bufsize);
>  	}
>  	return 0;
> -- 
> 2.34.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net] net/smc: Fix expected buffersizes and sync logic
  2022-11-23 11:53 ` Tony Lu
@ 2022-11-23 13:13   ` Jan Karcher
  2022-11-23 13:41     ` Tony Lu
  0 siblings, 1 reply; 13+ messages in thread
From: Jan Karcher @ 2022-11-23 13:13 UTC (permalink / raw)
  To: Tony Lu
  Cc: David Miller, Jakub Kicinski, netdev, linux-s390, Heiko Carstens,
	Alexandra Winter, Wenjia Zhang, Thorsten Winkler, Stefan Raspl,
	Karsten Graul



On 23/11/2022 12:53, Tony Lu wrote:
> On Wed, Nov 23, 2022 at 11:49:07AM +0100, Jan Karcher wrote:
>> The fixed commit changed the expected behavior of buffersizes
>> set by the user using the setsockopt mechanism.
>> Before the fixed patch the logic for determining the buffersizes used
>> was the following:
>>
>> default  = net.ipv4.tcp_{w|r}mem[1]
>> sockopt  = the setsockopt mechanism
>> val      = the value assigned in default or via setsockopt
>> sk_buf   = short for sk_{snd|rcv}buf
>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>
>>    exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>              |                  |                |
>> +---------+ |                  | +------------+ | +-------------------+
>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
>> +---------+ |                  | +------------+ | +-------------------+
>>              |                  |                |    ^
>>              |                  |                |    |
>> +---------+ | +--------------+ |                |    |
>> | sockopt |---| sk_buf=val*2 |-----------------------|
>> +---------+ | +--------------+ |                |
>>              |                  |                |
>>
>> The fixed patch introduced a dedicated sysctl for smc
>> and removed the /2 in smc_core.c resulting in the following flow:
>>
>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>> sockopt  = the setsockopt mechanism
>> val      = the value assigned in default or via setsockopt
>> sk_buf   = short for sk_{snd|rcv}buf
>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>
>>    exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>              |                  |                |
>> +---------+ |                  | +------------+ | +-----------------+
>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf |
>> +---------+ |                  | +------------+ | +-----------------+
>>              |                  |                |    ^
>>              |                  |                |    |
>> +---------+ | +--------------+ |                |    |
>> | sockopt |---| sk_buf=val*2 |-----------------------|
>> +---------+ | +--------------+ |                |
>>              |                  |                |
>>
>> This would result in double of memory used for existing configurations
>> that are using setsockopt.
> 
> Firstly, thanks for your detailed diagrams :-)
> 
> And the original decision to use user-provided values rather than
> value/2 to follow the instructions of the socket manual [1].
> 
>    SO_RCVBUF
>           Sets or gets the maximum socket receive buffer in bytes.
>           The kernel doubles this value (to allow space for
>           bookkeeping overhead) when it is set using setsockopt(2),
>           and this doubled value is returned by getsockopt(2).  The
>           default value is set by the
>           /proc/sys/net/core/rmem_default file, and the maximum
>           allowed value is set by the /proc/sys/net/core/rmem_max
>           file.  The minimum (doubled) value for this option is 256.
> 
> [1] https://man7.org/linux/man-pages/man7/socket.7.html
> 
> The user of SMC should know that setsockopt() with SO_{RCV|SND}BUF will

I totally agree that an educated user of SMC should know about that 
behavior if they decide to use it.
We do provide our users preload libraries where they can pass preferred 
buffersizes via arguments and we handle the Sockopts for them.

> double the values in kernel, and getsockopt() will return the doubled
> values. So that they should use half of the values which are passed to
> setsockopt(). The original patch tries to make things easier in SMC and
> let user-space to handle them following the socket manual.
> 
>> SMC historically decided to use the explicit value given by the user
>> to allocate the memory. This is why we used the /2 in smc_core.c.
>> That logic was not applied to the default value.
> 
> Yep, let back to the patch which introduced smc_{w|r}mem knobs, it's a
> trade-off to follow original logic of SMC, or follow the socket manual.
> We decides to follow the instruction of manuals in the end.

I understand the point. I spend a lot of time trying to decide what to do.

Since it was an intentional decision to not follow the general socket 
option, and we do not have anyone complaining we do not really have a 
reason to change it.
Changing it means that users with existing configurations would have to 
change their configs on an update or suddenly expect double the memory 
consumption.
That's why we in the end preffered to stay with the current logic.

I'm thinking that maybe - if we stay with the historic logic - we should 
document that desicion somewhere. So that in the future, if a user that 
expects the man page behavior, has a way to understand what SMC is 
doing. What do oyu think?

- Jan

> 
> Cheers,
> Tony Lu
> 
>> Since we now have our own sysctl, which is also exposed to the user,
>> we should sync the logic in a way that both values are the real value
>> used by our code and shown by smc_stats. To achieve this this patch
>> changes the behavior to:
>>
>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>> sockopt  = the setsockopt mechanism
>> val      = the value assigned in default or via setsockopt
>> sk_buf   = short for sk_{snd|rcv}buf
>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>
>>    exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
>>              |                  |                 |
>> +---------+ |                  | +-------------+ | +-----------------+
>> | default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
>> +---------+ |                  | +-------------+ | +-----------------+
>>              |                  |                 |    ^
>>              |                  |                 |    |
>> +---------+ | +--------------+ |                 |    |
>> | sockopt |---| sk_buf=val*2 |------------------------|
>> +---------+ | +--------------+ |                 |
>>              |                  |                 |
>>
>> This way both paths follow the same pattern and the expected behavior
>> is re-established.
>>
>> Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>> Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
>> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
>> ---
>>   net/smc/af_smc.c   | 9 +++++++--
>>   net/smc/smc_core.c | 8 ++++----
>>   2 files changed, 11 insertions(+), 6 deletions(-)
>>
>> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
>> index 036532cf39aa..a8c84e7bac99 100644
>> --- a/net/smc/af_smc.c
>> +++ b/net/smc/af_smc.c
>> @@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
>>   static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>   				   int protocol)
>>   {
>> +	int buffersize_without_overhead;
>>   	struct smc_sock *smc;
>>   	struct proto *prot;
>>   	struct sock *sk;
>> @@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>   	sk->sk_state = SMC_INIT;
>>   	sk->sk_destruct = smc_destruct;
>>   	sk->sk_protocol = protocol;
>> -	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
>> -	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
>> +	buffersize_without_overhead =
>> +		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
>> +	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
>> +	buffersize_without_overhead =
>> +		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
>> +	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
>>   	smc = smc_sk(sk);
>>   	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
>>   	INIT_WORK(&smc->connect_work, smc_connect_work);
>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>> index 00fb352c2765..36850a2ae167 100644
>> --- a/net/smc/smc_core.c
>> +++ b/net/smc/smc_core.c
>> @@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>   
>>   	if (is_rmb)
>>   		/* use socket recv buffer size (w/o overhead) as start value */
>> -		sk_buf_size = smc->sk.sk_rcvbuf;
>> +		sk_buf_size = smc->sk.sk_rcvbuf / 2;
>>   	else
>>   		/* use socket send buffer size (w/o overhead) as start value */
>> -		sk_buf_size = smc->sk.sk_sndbuf;
>> +		sk_buf_size = smc->sk.sk_sndbuf / 2;
>>   
>>   	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
>>   	     bufsize_short >= 0; bufsize_short--) {
>> @@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>   	if (is_rmb) {
>>   		conn->rmb_desc = buf_desc;
>>   		conn->rmbe_size_short = bufsize_short;
>> -		smc->sk.sk_rcvbuf = bufsize;
>> +		smc->sk.sk_rcvbuf = bufsize * 2;
>>   		atomic_set(&conn->bytes_to_rcv, 0);
>>   		conn->rmbe_update_limit =
>>   			smc_rmb_wnd_update_limit(buf_desc->len);
>> @@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>   			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
>>   	} else {
>>   		conn->sndbuf_desc = buf_desc;
>> -		smc->sk.sk_sndbuf = bufsize;
>> +		smc->sk.sk_sndbuf = bufsize * 2;
>>   		atomic_set(&conn->sndbuf_space, bufsize);
>>   	}
>>   	return 0;
>> -- 
>> 2.34.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net] net/smc: Fix expected buffersizes and sync logic
  2022-11-23 13:13   ` Jan Karcher
@ 2022-11-23 13:41     ` Tony Lu
  2022-11-24 13:00       ` Alexandra Winter
  0 siblings, 1 reply; 13+ messages in thread
From: Tony Lu @ 2022-11-23 13:41 UTC (permalink / raw)
  To: Jan Karcher
  Cc: David Miller, Jakub Kicinski, netdev, linux-s390, Heiko Carstens,
	Alexandra Winter, Wenjia Zhang, Thorsten Winkler, Stefan Raspl,
	Karsten Graul

On Wed, Nov 23, 2022 at 02:13:04PM +0100, Jan Karcher wrote:
> 
> 
> On 23/11/2022 12:53, Tony Lu wrote:
> > On Wed, Nov 23, 2022 at 11:49:07AM +0100, Jan Karcher wrote:
> > > The fixed commit changed the expected behavior of buffersizes
> > > set by the user using the setsockopt mechanism.
> > > Before the fixed patch the logic for determining the buffersizes used
> > > was the following:
> > > 
> > > default  = net.ipv4.tcp_{w|r}mem[1]
> > > sockopt  = the setsockopt mechanism
> > > val      = the value assigned in default or via setsockopt
> > > sk_buf   = short for sk_{snd|rcv}buf
> > > real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> > > 
> > >    exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
> > >              |                  |                |
> > > +---------+ |                  | +------------+ | +-------------------+
> > > | default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
> > > +---------+ |                  | +------------+ | +-------------------+
> > >              |                  |                |    ^
> > >              |                  |                |    |
> > > +---------+ | +--------------+ |                |    |
> > > | sockopt |---| sk_buf=val*2 |-----------------------|
> > > +---------+ | +--------------+ |                |
> > >              |                  |                |
> > > 
> > > The fixed patch introduced a dedicated sysctl for smc
> > > and removed the /2 in smc_core.c resulting in the following flow:
> > > 
> > > default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
> > > sockopt  = the setsockopt mechanism
> > > val      = the value assigned in default or via setsockopt
> > > sk_buf   = short for sk_{snd|rcv}buf
> > > real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> > > 
> > >    exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
> > >              |                  |                |
> > > +---------+ |                  | +------------+ | +-----------------+
> > > | default |----------------------| sk_buf=val |---| real_buf=sk_buf |
> > > +---------+ |                  | +------------+ | +-----------------+
> > >              |                  |                |    ^
> > >              |                  |                |    |
> > > +---------+ | +--------------+ |                |    |
> > > | sockopt |---| sk_buf=val*2 |-----------------------|
> > > +---------+ | +--------------+ |                |
> > >              |                  |                |
> > > 
> > > This would result in double of memory used for existing configurations
> > > that are using setsockopt.
> > 
> > Firstly, thanks for your detailed diagrams :-)
> > 
> > And the original decision to use user-provided values rather than
> > value/2 to follow the instructions of the socket manual [1].
> > 
> >    SO_RCVBUF
> >           Sets or gets the maximum socket receive buffer in bytes.
> >           The kernel doubles this value (to allow space for
> >           bookkeeping overhead) when it is set using setsockopt(2),
> >           and this doubled value is returned by getsockopt(2).  The
> >           default value is set by the
> >           /proc/sys/net/core/rmem_default file, and the maximum
> >           allowed value is set by the /proc/sys/net/core/rmem_max
> >           file.  The minimum (doubled) value for this option is 256.
> > 
> > [1] https://man7.org/linux/man-pages/man7/socket.7.html
> > 
> > The user of SMC should know that setsockopt() with SO_{RCV|SND}BUF will
> 
> I totally agree that an educated user of SMC should know about that behavior
> if they decide to use it.
> We do provide our users preload libraries where they can pass preferred
> buffersizes via arguments and we handle the Sockopts for them.
> 
> > double the values in kernel, and getsockopt() will return the doubled
> > values. So that they should use half of the values which are passed to
> > setsockopt(). The original patch tries to make things easier in SMC and
> > let user-space to handle them following the socket manual.
> > 
> > > SMC historically decided to use the explicit value given by the user
> > > to allocate the memory. This is why we used the /2 in smc_core.c.
> > > That logic was not applied to the default value.
> > 
> > Yep, let back to the patch which introduced smc_{w|r}mem knobs, it's a
> > trade-off to follow original logic of SMC, or follow the socket manual.
> > We decides to follow the instruction of manuals in the end.
> 
> I understand the point. I spend a lot of time trying to decide what to do.
> 
> Since it was an intentional decision to not follow the general socket
> option, and we do not have anyone complaining we do not really have a reason
> to change it.
> Changing it means that users with existing configurations would have to
> change their configs on an update or suddenly expect double the memory
> consumption.
> That's why we in the end preffered to stay with the current logic.

I can't agree with you more with the points to follow the historic logic
and not break the user-space applications.

> I'm thinking that maybe - if we stay with the historic logic - we should
> document that desicion somewhere. So that in the future, if a user that
> expects the man page behavior, has a way to understand what SMC is doing.
> What do oyu think?

Yep, we _really_ need to document it if we change the convention.
Actually, I spent a lot of time to find the history about the logic of
buffer (/2 and *2) in SMC. So I'm really in favor of adding
documentation, at least code comments to help others to understand them.

Cheers,
Tony Lu
 
> - Jan
> 
> > 
> > Cheers,
> > Tony Lu
> > 
> > > Since we now have our own sysctl, which is also exposed to the user,
> > > we should sync the logic in a way that both values are the real value
> > > used by our code and shown by smc_stats. To achieve this this patch
> > > changes the behavior to:
> > > 
> > > default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
> > > sockopt  = the setsockopt mechanism
> > > val      = the value assigned in default or via setsockopt
> > > sk_buf   = short for sk_{snd|rcv}buf
> > > real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> > > 
> > >    exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
> > >              |                  |                 |
> > > +---------+ |                  | +-------------+ | +-----------------+
> > > | default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
> > > +---------+ |                  | +-------------+ | +-----------------+
> > >              |                  |                 |    ^
> > >              |                  |                 |    |
> > > +---------+ | +--------------+ |                 |    |
> > > | sockopt |---| sk_buf=val*2 |------------------------|
> > > +---------+ | +--------------+ |                 |
> > >              |                  |                 |
> > > 
> > > This way both paths follow the same pattern and the expected behavior
> > > is re-established.
> > > 
> > > Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> > > Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
> > > Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
> > > ---
> > >   net/smc/af_smc.c   | 9 +++++++--
> > >   net/smc/smc_core.c | 8 ++++----
> > >   2 files changed, 11 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> > > index 036532cf39aa..a8c84e7bac99 100644
> > > --- a/net/smc/af_smc.c
> > > +++ b/net/smc/af_smc.c
> > > @@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
> > >   static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
> > >   				   int protocol)
> > >   {
> > > +	int buffersize_without_overhead;
> > >   	struct smc_sock *smc;
> > >   	struct proto *prot;
> > >   	struct sock *sk;
> > > @@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
> > >   	sk->sk_state = SMC_INIT;
> > >   	sk->sk_destruct = smc_destruct;
> > >   	sk->sk_protocol = protocol;
> > > -	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
> > > -	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
> > > +	buffersize_without_overhead =
> > > +		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
> > > +	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
> > > +	buffersize_without_overhead =
> > > +		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
> > > +	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
> > >   	smc = smc_sk(sk);
> > >   	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
> > >   	INIT_WORK(&smc->connect_work, smc_connect_work);
> > > diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> > > index 00fb352c2765..36850a2ae167 100644
> > > --- a/net/smc/smc_core.c
> > > +++ b/net/smc/smc_core.c
> > > @@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
> > >   	if (is_rmb)
> > >   		/* use socket recv buffer size (w/o overhead) as start value */
> > > -		sk_buf_size = smc->sk.sk_rcvbuf;
> > > +		sk_buf_size = smc->sk.sk_rcvbuf / 2;
> > >   	else
> > >   		/* use socket send buffer size (w/o overhead) as start value */
> > > -		sk_buf_size = smc->sk.sk_sndbuf;
> > > +		sk_buf_size = smc->sk.sk_sndbuf / 2;
> > >   	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
> > >   	     bufsize_short >= 0; bufsize_short--) {
> > > @@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
> > >   	if (is_rmb) {
> > >   		conn->rmb_desc = buf_desc;
> > >   		conn->rmbe_size_short = bufsize_short;
> > > -		smc->sk.sk_rcvbuf = bufsize;
> > > +		smc->sk.sk_rcvbuf = bufsize * 2;
> > >   		atomic_set(&conn->bytes_to_rcv, 0);
> > >   		conn->rmbe_update_limit =
> > >   			smc_rmb_wnd_update_limit(buf_desc->len);
> > > @@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
> > >   			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
> > >   	} else {
> > >   		conn->sndbuf_desc = buf_desc;
> > > -		smc->sk.sk_sndbuf = bufsize;
> > > +		smc->sk.sk_sndbuf = bufsize * 2;
> > >   		atomic_set(&conn->sndbuf_space, bufsize);
> > >   	}
> > >   	return 0;
> > > -- 
> > > 2.34.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net] net/smc: Fix expected buffersizes and sync logic
  2022-11-23 13:41     ` Tony Lu
@ 2022-11-24 13:00       ` Alexandra Winter
  2022-11-24 14:07         ` Alexandra Winter
  0 siblings, 1 reply; 13+ messages in thread
From: Alexandra Winter @ 2022-11-24 13:00 UTC (permalink / raw)
  To: Tony Lu, Jan Karcher
  Cc: David Miller, Jakub Kicinski, netdev, linux-s390, Heiko Carstens,
	Wenjia Zhang, Thorsten Winkler, Stefan Raspl, Karsten Graul



On 23.11.22 14:41, Tony Lu wrote:
> On Wed, Nov 23, 2022 at 02:13:04PM +0100, Jan Karcher wrote:
>>
>>
>> On 23/11/2022 12:53, Tony Lu wrote:
>>> On Wed, Nov 23, 2022 at 11:49:07AM +0100, Jan Karcher wrote:
>>>> The fixed commit changed the expected behavior of buffersizes
>>>> set by the user using the setsockopt mechanism.
>>>> Before the fixed patch the logic for determining the buffersizes used
>>>> was the following:
>>>>
>>>> default  = net.ipv4.tcp_{w|r}mem[1]
>>>> sockopt  = the setsockopt mechanism
>>>> val      = the value assigned in default or via setsockopt
>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>
>>>>    exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>>>              |                  |                |
>>>> +---------+ |                  | +------------+ | +-------------------+
>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
>>>> +---------+ |                  | +------------+ | +-------------------+
>>>>              |                  |                |    ^
>>>>              |                  |                |    |
>>>> +---------+ | +--------------+ |                |    |
>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
>>>> +---------+ | +--------------+ |                |
>>>>              |                  |                |
>>>>
>>>> The fixed patch introduced a dedicated sysctl for smc
>>>> and removed the /2 in smc_core.c resulting in the following flow:
>>>>
>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>>>> sockopt  = the setsockopt mechanism
>>>> val      = the value assigned in default or via setsockopt
>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>
>>>>    exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>>>              |                  |                |
>>>> +---------+ |                  | +------------+ | +-----------------+
>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf |
>>>> +---------+ |                  | +------------+ | +-----------------+
>>>>              |                  |                |    ^
>>>>              |                  |                |    |
>>>> +---------+ | +--------------+ |                |    |
>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
>>>> +---------+ | +--------------+ |                |
>>>>              |                  |                |
>>>>
>>>> This would result in double of memory used for existing configurations
>>>> that are using setsockopt.
>>>
>>> Firstly, thanks for your detailed diagrams :-)
>>>
>>> And the original decision to use user-provided values rather than
>>> value/2 to follow the instructions of the socket manual [1].
>>>
>>>    SO_RCVBUF
>>>           Sets or gets the maximum socket receive buffer in bytes.
>>>           The kernel doubles this value (to allow space for
>>>           bookkeeping overhead) when it is set using setsockopt(2),
>>>           and this doubled value is returned by getsockopt(2).  The
>>>           default value is set by the
>>>           /proc/sys/net/core/rmem_default file, and the maximum
>>>           allowed value is set by the /proc/sys/net/core/rmem_max
>>>           file.  The minimum (doubled) value for this option is 256.
>>>
>>> [1] https://man7.org/linux/man-pages/man7/socket.7.html
>>>
>>> The user of SMC should know that setsockopt() with SO_{RCV|SND}BUF will
>>
>> I totally agree that an educated user of SMC should know about that behavior
>> if they decide to use it.
>> We do provide our users preload libraries where they can pass preferred
>> buffersizes via arguments and we handle the Sockopts for them.
>>
>>> double the values in kernel, and getsockopt() will return the doubled
>>> values. So that they should use half of the values which are passed to
>>> setsockopt(). The original patch tries to make things easier in SMC and
>>> let user-space to handle them following the socket manual.
>>>
>>>> SMC historically decided to use the explicit value given by the user
>>>> to allocate the memory. This is why we used the /2 in smc_core.c.
>>>> That logic was not applied to the default value.
>>>
>>> Yep, let back to the patch which introduced smc_{w|r}mem knobs, it's a
>>> trade-off to follow original logic of SMC, or follow the socket manual.
>>> We decides to follow the instruction of manuals in the end.
>>
>> I understand the point. I spend a lot of time trying to decide what to do.
>>
>> Since it was an intentional decision to not follow the general socket
>> option, and we do not have anyone complaining we do not really have a reason
>> to change it.
>> Changing it means that users with existing configurations would have to
>> change their configs on an update or suddenly expect double the memory
>> consumption.
>> That's why we in the end preffered to stay with the current logic.
> 
> I can't agree with you more with the points to follow the historic logic
> and not break the user-space applications.
> 
>> I'm thinking that maybe - if we stay with the historic logic - we should
>> document that desicion somewhere. So that in the future, if a user that
>> expects the man page behavior, has a way to understand what SMC is doing.
>> What do oyu think?
> 
> Yep, we _really_ need to document it if we change the convention.
> Actually, I spent a lot of time to find the history about the logic of
> buffer (/2 and *2) in SMC. So I'm really in favor of adding
> documentation, at least code comments to help others to understand them.
> 
> Cheers,
> Tony Lu
Iiuc you are changing the default values in this a patch and your other patch:
Default values for real_buf for send and receive:

before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
    real_buf=net.ipv4.tcp_{w|r}mem[1]/2   send: 8k  recv: 64k 
    
after 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072) 

after net/smc: Fix expected buffersizes and sync logic
real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072) 

after net/smc: Unbind smc control from tcp control
real_buf=SMC_*BUF_INIT_SIZE   send: 16k (16384) recv: 64k (65536)

If my understanding is correct, then I nack this. 
Defaults should be restored to the values before 0227f058aa29.
Otherwise users will notice a change in memory usage that needs to
be avoided or announced more explicitely. (and don't change them twice)
>  
>> - Jan
>>
>>>
>>> Cheers,
>>> Tony Lu
>>>
>>>> Since we now have our own sysctl, which is also exposed to the user,
>>>> we should sync the logic in a way that both values are the real value
>>>> used by our code and shown by smc_stats. To achieve this this patch
>>>> changes the behavior to:
>>>>
>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>>>> sockopt  = the setsockopt mechanism
>>>> val      = the value assigned in default or via setsockopt
>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>
>>>>    exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
>>>>              |                  |                 |
>>>> +---------+ |                  | +-------------+ | +-----------------+
>>>> | default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
>>>> +---------+ |                  | +-------------+ | +-----------------+
>>>>              |                  |                 |    ^
>>>>              |                  |                 |    |
>>>> +---------+ | +--------------+ |                 |    |
>>>> | sockopt |---| sk_buf=val*2 |------------------------|
>>>> +---------+ | +--------------+ |                 |
>>>>              |                  |                 |
>>>>
>>>> This way both paths follow the same pattern and the expected behavior
>>>> is re-established.
>>>>
>>>> Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>>> Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
>>>> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
>>>> ---
>>>>   net/smc/af_smc.c   | 9 +++++++--
>>>>   net/smc/smc_core.c | 8 ++++----
>>>>   2 files changed, 11 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
>>>> index 036532cf39aa..a8c84e7bac99 100644
>>>> --- a/net/smc/af_smc.c
>>>> +++ b/net/smc/af_smc.c
>>>> @@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
>>>>   static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>>>   				   int protocol)
>>>>   {
>>>> +	int buffersize_without_overhead;
>>>>   	struct smc_sock *smc;
>>>>   	struct proto *prot;
>>>>   	struct sock *sk;
>>>> @@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>>>   	sk->sk_state = SMC_INIT;
>>>>   	sk->sk_destruct = smc_destruct;
>>>>   	sk->sk_protocol = protocol;
>>>> -	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
>>>> -	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
>>>> +	buffersize_without_overhead =
>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
>>>> +	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
>>>> +	buffersize_without_overhead =
>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
>>>> +	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
>>>>   	smc = smc_sk(sk);
>>>>   	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
>>>>   	INIT_WORK(&smc->connect_work, smc_connect_work);
>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>>> index 00fb352c2765..36850a2ae167 100644
>>>> --- a/net/smc/smc_core.c
>>>> +++ b/net/smc/smc_core.c
>>>> @@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>   	if (is_rmb)
>>>>   		/* use socket recv buffer size (w/o overhead) as start value */
>>>> -		sk_buf_size = smc->sk.sk_rcvbuf;
>>>> +		sk_buf_size = smc->sk.sk_rcvbuf / 2;
>>>>   	else
>>>>   		/* use socket send buffer size (w/o overhead) as start value */
>>>> -		sk_buf_size = smc->sk.sk_sndbuf;
>>>> +		sk_buf_size = smc->sk.sk_sndbuf / 2;
>>>>   	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
>>>>   	     bufsize_short >= 0; bufsize_short--) {
>>>> @@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>   	if (is_rmb) {
>>>>   		conn->rmb_desc = buf_desc;
>>>>   		conn->rmbe_size_short = bufsize_short;
>>>> -		smc->sk.sk_rcvbuf = bufsize;
>>>> +		smc->sk.sk_rcvbuf = bufsize * 2;
>>>>   		atomic_set(&conn->bytes_to_rcv, 0);
>>>>   		conn->rmbe_update_limit =
>>>>   			smc_rmb_wnd_update_limit(buf_desc->len);
>>>> @@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>   			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
>>>>   	} else {
>>>>   		conn->sndbuf_desc = buf_desc;
>>>> -		smc->sk.sk_sndbuf = bufsize;
>>>> +		smc->sk.sk_sndbuf = bufsize * 2;
>>>>   		atomic_set(&conn->sndbuf_space, bufsize);
>>>>   	}
>>>>   	return 0;
>>>> -- 
>>>> 2.34.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net] net/smc: Fix expected buffersizes and sync logic
  2022-11-24 13:00       ` Alexandra Winter
@ 2022-11-24 14:07         ` Alexandra Winter
  2022-11-25  6:15           ` Jan Karcher
  0 siblings, 1 reply; 13+ messages in thread
From: Alexandra Winter @ 2022-11-24 14:07 UTC (permalink / raw)
  To: Tony Lu, Jan Karcher
  Cc: David Miller, Jakub Kicinski, netdev, linux-s390, Heiko Carstens,
	Wenjia Zhang, Thorsten Winkler, Stefan Raspl, Karsten Graul



On 24.11.22 14:00, Alexandra Winter wrote:
> 
> 
> On 23.11.22 14:41, Tony Lu wrote:
>> On Wed, Nov 23, 2022 at 02:13:04PM +0100, Jan Karcher wrote:
>>>
>>>
>>> On 23/11/2022 12:53, Tony Lu wrote:
>>>> On Wed, Nov 23, 2022 at 11:49:07AM +0100, Jan Karcher wrote:
>>>>> The fixed commit changed the expected behavior of buffersizes
>>>>> set by the user using the setsockopt mechanism.
>>>>> Before the fixed patch the logic for determining the buffersizes used
>>>>> was the following:
>>>>>
>>>>> default  = net.ipv4.tcp_{w|r}mem[1]
Jan, you explained to me: "the minima is 16Kib. This is enforced in smc_compress_bufsize 
which would move any value <= 16Kib into bucket 0 - which is 16KiB "
net.ipv4.tcp_wmem[1] defaults to 8Kib. So in the default case (unchanged net.ipv4.tcp_wmem[1])
the default for the send path is not net.ipv4.tcp_wmem[1]. Should be clarified here.
>>>>> sockopt  = the setsockopt mechanism
>>>>> val      = the value assigned in default or via setsockopt
>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>
>>>>>    exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>>>>              |                  |                |
>>>>> +---------+ |                  | +------------+ | +-------------------+
>>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
>>>>> +---------+ |                  | +------------+ | +-------------------+
>>>>>              |                  |                |    ^
>>>>>              |                  |                |    |
>>>>> +---------+ | +--------------+ |                |    |
>>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
>>>>> +---------+ | +--------------+ |                |
>>>>>              |                  |                |
>>>>>
>>>>> The fixed patch introduced a dedicated sysctl for smc
>>>>> and removed the /2 in smc_core.c resulting in the following flow:
>>>>>
>>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>>>>> sockopt  = the setsockopt mechanism
>>>>> val      = the value assigned in default or via setsockopt
>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>
>>>>>    exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>>>>              |                  |                |
>>>>> +---------+ |                  | +------------+ | +-----------------+
>>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf |
>>>>> +---------+ |                  | +------------+ | +-----------------+
>>>>>              |                  |                |    ^
>>>>>              |                  |                |    |
>>>>> +---------+ | +--------------+ |                |    |
>>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
>>>>> +---------+ | +--------------+ |                |
>>>>>              |                  |                |
>>>>>
>>>>> This would result in double of memory used for existing configurations
>>>>> that are using setsockopt.
>>>>
>>>> Firstly, thanks for your detailed diagrams :-)
>>>>
>>>> And the original decision to use user-provided values rather than
>>>> value/2 to follow the instructions of the socket manual [1].
>>>>
>>>>    SO_RCVBUF
>>>>           Sets or gets the maximum socket receive buffer in bytes.
>>>>           The kernel doubles this value (to allow space for
>>>>           bookkeeping overhead) when it is set using setsockopt(2),
>>>>           and this doubled value is returned by getsockopt(2).  The
>>>>           default value is set by the
>>>>           /proc/sys/net/core/rmem_default file, and the maximum
>>>>           allowed value is set by the /proc/sys/net/core/rmem_max
>>>>           file.  The minimum (doubled) value for this option is 256.
>>>>
>>>> [1] https://man7.org/linux/man-pages/man7/socket.7.html
>>>>
>>>> The user of SMC should know that setsockopt() with SO_{RCV|SND}BUF will
>>>
>>> I totally agree that an educated user of SMC should know about that behavior
>>> if they decide to use it.
>>> We do provide our users preload libraries where they can pass preferred
>>> buffersizes via arguments and we handle the Sockopts for them.
>>>
>>>> double the values in kernel, and getsockopt() will return the doubled
>>>> values. So that they should use half of the values which are passed to
>>>> setsockopt(). The original patch tries to make things easier in SMC and
>>>> let user-space to handle them following the socket manual.
>>>>
>>>>> SMC historically decided to use the explicit value given by the user
>>>>> to allocate the memory. This is why we used the /2 in smc_core.c.
>>>>> That logic was not applied to the default value.
>>>>
>>>> Yep, let back to the patch which introduced smc_{w|r}mem knobs, it's a
>>>> trade-off to follow original logic of SMC, or follow the socket manual.
>>>> We decides to follow the instruction of manuals in the end.
>>>
>>> I understand the point. I spend a lot of time trying to decide what to do.
>>>
>>> Since it was an intentional decision to not follow the general socket
>>> option, and we do not have anyone complaining we do not really have a reason
>>> to change it.
>>> Changing it means that users with existing configurations would have to
>>> change their configs on an update or suddenly expect double the memory
>>> consumption.
>>> That's why we in the end preffered to stay with the current logic.
>>
>> I can't agree with you more with the points to follow the historic logic
>> and not break the user-space applications.
>>
>>> I'm thinking that maybe - if we stay with the historic logic - we should
>>> document that desicion somewhere. So that in the future, if a user that
>>> expects the man page behavior, has a way to understand what SMC is doing.
>>> What do oyu think?
>>
>> Yep, we _really_ need to document it if we change the convention.
>> Actually, I spent a lot of time to find the history about the logic of
>> buffer (/2 and *2) in SMC. So I'm really in favor of adding
>> documentation, at least code comments to help others to understand them.
>>
>> Cheers,
>> Tony Lu
> Iiuc you are changing the default values in this a patch and your other patch:
> Default values for real_buf for send and receive:
> 
> before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>     real_buf=net.ipv4.tcp_{w|r}mem[1]/2   send: 8k  recv: 64k 
      see above: 			    send: 16k recv: 64k
>     
> after 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072) 
> 
> after net/smc: Fix expected buffersizes and sync logic
> real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072) 
> 
> after net/smc: Unbind smc control from tcp control
> real_buf=SMC_*BUF_INIT_SIZE   send: 16k (16384) recv: 64k (65536)
> 
> If my understanding is correct, then I nack this. 
> Defaults should be restored to the values before 0227f058aa29.
> Otherwise users will notice a change in memory usage that needs to
> be avoided or announced more explicitely. (and don't change them twice)
See above, I stand corrected. However this patch fixes/restores the buffersize
for setsockopt, but not for the default recieve path. 
Could you please clarify that in the title and description?

Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
>>  
>>> - Jan
>>>
>>>>
>>>> Cheers,
>>>> Tony Lu
>>>>
>>>>> Since we now have our own sysctl, which is also exposed to the user,
>>>>> we should sync the logic in a way that both values are the real value
>>>>> used by our code and shown by smc_stats. To achieve this this patch
>>>>> changes the behavior to:
>>>>>
>>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>>>>> sockopt  = the setsockopt mechanism
>>>>> val      = the value assigned in default or via setsockopt
>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>
>>>>>    exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
>>>>>              |                  |                 |
>>>>> +---------+ |                  | +-------------+ | +-----------------+
>>>>> | default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
>>>>> +---------+ |                  | +-------------+ | +-----------------+
>>>>>              |                  |                 |    ^
>>>>>              |                  |                 |    |
>>>>> +---------+ | +--------------+ |                 |    |
>>>>> | sockopt |---| sk_buf=val*2 |------------------------|
>>>>> +---------+ | +--------------+ |                 |
>>>>>              |                  |                 |
>>>>>
>>>>> This way both paths follow the same pattern and the expected behavior
>>>>> is re-established.
>>>>>
>>>>> Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>>>> Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
>>>>> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
>>>>> ---
>>>>>   net/smc/af_smc.c   | 9 +++++++--
>>>>>   net/smc/smc_core.c | 8 ++++----
>>>>>   2 files changed, 11 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
>>>>> index 036532cf39aa..a8c84e7bac99 100644
>>>>> --- a/net/smc/af_smc.c
>>>>> +++ b/net/smc/af_smc.c
>>>>> @@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
>>>>>   static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>>>>   				   int protocol)
>>>>>   {
>>>>> +	int buffersize_without_overhead;
>>>>>   	struct smc_sock *smc;
>>>>>   	struct proto *prot;
>>>>>   	struct sock *sk;
>>>>> @@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>>>>   	sk->sk_state = SMC_INIT;
>>>>>   	sk->sk_destruct = smc_destruct;
>>>>>   	sk->sk_protocol = protocol;
>>>>> -	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
>>>>> -	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
>>>>> +	buffersize_without_overhead =
>>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
>>>>> +	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
>>>>> +	buffersize_without_overhead =
>>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
>>>>> +	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
>>>>>   	smc = smc_sk(sk);
>>>>>   	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
>>>>>   	INIT_WORK(&smc->connect_work, smc_connect_work);
>>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>>>> index 00fb352c2765..36850a2ae167 100644
>>>>> --- a/net/smc/smc_core.c
>>>>> +++ b/net/smc/smc_core.c
>>>>> @@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>   	if (is_rmb)
>>>>>   		/* use socket recv buffer size (w/o overhead) as start value */
>>>>> -		sk_buf_size = smc->sk.sk_rcvbuf;
>>>>> +		sk_buf_size = smc->sk.sk_rcvbuf / 2;
>>>>>   	else
>>>>>   		/* use socket send buffer size (w/o overhead) as start value */
>>>>> -		sk_buf_size = smc->sk.sk_sndbuf;
>>>>> +		sk_buf_size = smc->sk.sk_sndbuf / 2;
>>>>>   	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
>>>>>   	     bufsize_short >= 0; bufsize_short--) {
>>>>> @@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>   	if (is_rmb) {
>>>>>   		conn->rmb_desc = buf_desc;
>>>>>   		conn->rmbe_size_short = bufsize_short;
>>>>> -		smc->sk.sk_rcvbuf = bufsize;
>>>>> +		smc->sk.sk_rcvbuf = bufsize * 2;
>>>>>   		atomic_set(&conn->bytes_to_rcv, 0);
>>>>>   		conn->rmbe_update_limit =
>>>>>   			smc_rmb_wnd_update_limit(buf_desc->len);
>>>>> @@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>   			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
>>>>>   	} else {
>>>>>   		conn->sndbuf_desc = buf_desc;
>>>>> -		smc->sk.sk_sndbuf = bufsize;
>>>>> +		smc->sk.sk_sndbuf = bufsize * 2;
>>>>>   		atomic_set(&conn->sndbuf_space, bufsize);
>>>>>   	}
>>>>>   	return 0;
>>>>> -- 
>>>>> 2.34.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net] net/smc: Fix expected buffersizes and sync logic
  2022-11-24 14:07         ` Alexandra Winter
@ 2022-11-25  6:15           ` Jan Karcher
  2022-11-25  7:05             ` Tony Lu
  0 siblings, 1 reply; 13+ messages in thread
From: Jan Karcher @ 2022-11-25  6:15 UTC (permalink / raw)
  To: Alexandra Winter, Tony Lu
  Cc: David Miller, Jakub Kicinski, netdev, linux-s390, Heiko Carstens,
	Wenjia Zhang, Thorsten Winkler, Stefan Raspl, Karsten Graul



On 24/11/2022 15:07, Alexandra Winter wrote:
> 
> 
> On 24.11.22 14:00, Alexandra Winter wrote:
>>
>>
[ ... ]>>>>> On Wed, Nov 23, 2022 at 11:49:07AM +0100, Jan Karcher wrote:
>>>>>> The fixed commit changed the expected behavior of buffersizes
>>>>>> set by the user using the setsockopt mechanism.
>>>>>> Before the fixed patch the logic for determining the buffersizes used
>>>>>> was the following:
>>>>>>
>>>>>> default  = net.ipv4.tcp_{w|r}mem[1]
> Jan, you explained to me: "the minima is 16Kib. This is enforced in smc_compress_bufsize
> which would move any value <= 16Kib into bucket 0 - which is 16KiB "
> net.ipv4.tcp_wmem[1] defaults to 8Kib. So in the default case (unchanged net.ipv4.tcp_wmem[1])
> the default for the send path is not net.ipv4.tcp_wmem[1]. Should be clarified here.

The default value is still set to the net.ipv4.tcp_{w|r}mem[1]. This is 
a *very* top level overview about what is happening and *not* a 
documentation.
I don't really want to explain the full code flow here.

What we still should do - as Tony aggreed on - is documenting the SMC 
behavior. This is a follow up on my list.

>>>>>> sockopt  = the setsockopt mechanism
>>>>>> val      = the value assigned in default or via setsockopt
>>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>>
>>>>>>     exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>>>>>               |                  |                |
>>>>>> +---------+ |                  | +------------+ | +-------------------+
>>>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
>>>>>> +---------+ |                  | +------------+ | +-------------------+
>>>>>>               |                  |                |    ^
>>>>>>               |                  |                |    |
>>>>>> +---------+ | +--------------+ |                |    |
>>>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
>>>>>> +---------+ | +--------------+ |                |
>>>>>>               |                  |                |
>>>>>>
>>>>>> The fixed patch introduced a dedicated sysctl for smc
>>>>>> and removed the /2 in smc_core.c resulting in the following flow:
>>>>>>
>>>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>>>>>> sockopt  = the setsockopt mechanism
>>>>>> val      = the value assigned in default or via setsockopt
>>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>>
>>>>>>     exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>>>>>               |                  |                |
>>>>>> +---------+ |                  | +------------+ | +-----------------+
>>>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf |
>>>>>> +---------+ |                  | +------------+ | +-----------------+
>>>>>>               |                  |                |    ^
>>>>>>               |                  |                |    |
>>>>>> +---------+ | +--------------+ |                |    |
>>>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
>>>>>> +---------+ | +--------------+ |                |
>>>>>>               |                  |                |
>>>>>>
>>>>>> This would result in double of memory used for existing configurations
>>>>>> that are using setsockopt.
>>>>>
>>>>> Firstly, thanks for your detailed diagrams :-)
>>>>>
>>>>> And the original decision to use user-provided values rather than
>>>>> value/2 to follow the instructions of the socket manual [1].
>>>>>
>>>>>     SO_RCVBUF
>>>>>            Sets or gets the maximum socket receive buffer in bytes.
>>>>>            The kernel doubles this value (to allow space for
>>>>>            bookkeeping overhead) when it is set using setsockopt(2),
>>>>>            and this doubled value is returned by getsockopt(2).  The
>>>>>            default value is set by the
>>>>>            /proc/sys/net/core/rmem_default file, and the maximum
>>>>>            allowed value is set by the /proc/sys/net/core/rmem_max
>>>>>            file.  The minimum (doubled) value for this option is 256.
>>>>>
>>>>> [1] https://man7.org/linux/man-pages/man7/socket.7.html
>>>>>
>>>>> The user of SMC should know that setsockopt() with SO_{RCV|SND}BUF will
>>>>
>>>> I totally agree that an educated user of SMC should know about that behavior
>>>> if they decide to use it.
>>>> We do provide our users preload libraries where they can pass preferred
>>>> buffersizes via arguments and we handle the Sockopts for them.
>>>>
>>>>> double the values in kernel, and getsockopt() will return the doubled
>>>>> values. So that they should use half of the values which are passed to
>>>>> setsockopt(). The original patch tries to make things easier in SMC and
>>>>> let user-space to handle them following the socket manual.
>>>>>
>>>>>> SMC historically decided to use the explicit value given by the user
>>>>>> to allocate the memory. This is why we used the /2 in smc_core.c.
>>>>>> That logic was not applied to the default value.
>>>>>
>>>>> Yep, let back to the patch which introduced smc_{w|r}mem knobs, it's a
>>>>> trade-off to follow original logic of SMC, or follow the socket manual.
>>>>> We decides to follow the instruction of manuals in the end.
>>>>
>>>> I understand the point. I spend a lot of time trying to decide what to do.
>>>>
>>>> Since it was an intentional decision to not follow the general socket
>>>> option, and we do not have anyone complaining we do not really have a reason
>>>> to change it.
>>>> Changing it means that users with existing configurations would have to
>>>> change their configs on an update or suddenly expect double the memory
>>>> consumption.
>>>> That's why we in the end preffered to stay with the current logic.
>>>
>>> I can't agree with you more with the points to follow the historic logic
>>> and not break the user-space applications.
>>>
>>>> I'm thinking that maybe - if we stay with the historic logic - we should
>>>> document that desicion somewhere. So that in the future, if a user that
>>>> expects the man page behavior, has a way to understand what SMC is doing.
>>>> What do oyu think?
>>>
>>> Yep, we _really_ need to document it if we change the convention.
>>> Actually, I spent a lot of time to find the history about the logic of
>>> buffer (/2 and *2) in SMC. So I'm really in favor of adding
>>> documentation, at least code comments to help others to understand them.
>>>
>>> Cheers,
>>> Tony Lu
>> Iiuc you are changing the default values in this a patch and your other patch:
>> Default values for real_buf for send and receive:
>>
>> before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>      real_buf=net.ipv4.tcp_{w|r}mem[1]/2   send: 8k  recv: 64k
>        see above: 			    send: 16k recv: 64k
>>      
>> after 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>> real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
>>
>> after net/smc: Fix expected buffersizes and sync logic
>> real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
>>
>> after net/smc: Unbind smc control from tcp control
>> real_buf=SMC_*BUF_INIT_SIZE   send: 16k (16384) recv: 64k (65536)
>>
>> If my understanding is correct, then I nack this.
>> Defaults should be restored to the values before 0227f058aa29.
>> Otherwise users will notice a change in memory usage that needs to
>> be avoided or announced more explicitely. (and don't change them twice)
> See above, I stand corrected. However this patch fixes/restores the buffersize
> for setsockopt, but not for the default recieve path.
> Could you please clarify that in the title and description?
> 

I am trying to keep the commit title as crisp as possible while 
providing enough information and set the context in the commit message:

"The fixed commit changed the expected behavior of buffersizes set by 
the user using the setsockopt mechanism."

  + There is now a whole e-mail thread to consult in case of any further 
questions.

Thank you for your comments
- Jan

> Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
>>>   
>>>> - Jan
>>>>
>>>>>
>>>>> Cheers,
>>>>> Tony Lu
>>>>>
>>>>>> Since we now have our own sysctl, which is also exposed to the user,
>>>>>> we should sync the logic in a way that both values are the real value
>>>>>> used by our code and shown by smc_stats. To achieve this this patch
>>>>>> changes the behavior to:
>>>>>>
>>>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>>>>>> sockopt  = the setsockopt mechanism
>>>>>> val      = the value assigned in default or via setsockopt
>>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>>
>>>>>>     exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
>>>>>>               |                  |                 |
>>>>>> +---------+ |                  | +-------------+ | +-----------------+
>>>>>> | default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
>>>>>> +---------+ |                  | +-------------+ | +-----------------+
>>>>>>               |                  |                 |    ^
>>>>>>               |                  |                 |    |
>>>>>> +---------+ | +--------------+ |                 |    |
>>>>>> | sockopt |---| sk_buf=val*2 |------------------------|
>>>>>> +---------+ | +--------------+ |                 |
>>>>>>               |                  |                 |
>>>>>>
>>>>>> This way both paths follow the same pattern and the expected behavior
>>>>>> is re-established.
>>>>>>
>>>>>> Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>>>>> Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
>>>>>> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
>>>>>> ---
>>>>>>    net/smc/af_smc.c   | 9 +++++++--
>>>>>>    net/smc/smc_core.c | 8 ++++----
>>>>>>    2 files changed, 11 insertions(+), 6 deletions(-)
>>>>>>
>>>>>> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
>>>>>> index 036532cf39aa..a8c84e7bac99 100644
>>>>>> --- a/net/smc/af_smc.c
>>>>>> +++ b/net/smc/af_smc.c
>>>>>> @@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
>>>>>>    static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>>>>>    				   int protocol)
>>>>>>    {
>>>>>> +	int buffersize_without_overhead;
>>>>>>    	struct smc_sock *smc;
>>>>>>    	struct proto *prot;
>>>>>>    	struct sock *sk;
>>>>>> @@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>>>>>    	sk->sk_state = SMC_INIT;
>>>>>>    	sk->sk_destruct = smc_destruct;
>>>>>>    	sk->sk_protocol = protocol;
>>>>>> -	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
>>>>>> -	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
>>>>>> +	buffersize_without_overhead =
>>>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
>>>>>> +	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
>>>>>> +	buffersize_without_overhead =
>>>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
>>>>>> +	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
>>>>>>    	smc = smc_sk(sk);
>>>>>>    	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
>>>>>>    	INIT_WORK(&smc->connect_work, smc_connect_work);
>>>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>>>>> index 00fb352c2765..36850a2ae167 100644
>>>>>> --- a/net/smc/smc_core.c
>>>>>> +++ b/net/smc/smc_core.c
>>>>>> @@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>>    	if (is_rmb)
>>>>>>    		/* use socket recv buffer size (w/o overhead) as start value */
>>>>>> -		sk_buf_size = smc->sk.sk_rcvbuf;
>>>>>> +		sk_buf_size = smc->sk.sk_rcvbuf / 2;
>>>>>>    	else
>>>>>>    		/* use socket send buffer size (w/o overhead) as start value */
>>>>>> -		sk_buf_size = smc->sk.sk_sndbuf;
>>>>>> +		sk_buf_size = smc->sk.sk_sndbuf / 2;
>>>>>>    	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
>>>>>>    	     bufsize_short >= 0; bufsize_short--) {
>>>>>> @@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>>    	if (is_rmb) {
>>>>>>    		conn->rmb_desc = buf_desc;
>>>>>>    		conn->rmbe_size_short = bufsize_short;
>>>>>> -		smc->sk.sk_rcvbuf = bufsize;
>>>>>> +		smc->sk.sk_rcvbuf = bufsize * 2;
>>>>>>    		atomic_set(&conn->bytes_to_rcv, 0);
>>>>>>    		conn->rmbe_update_limit =
>>>>>>    			smc_rmb_wnd_update_limit(buf_desc->len);
>>>>>> @@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>>    			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
>>>>>>    	} else {
>>>>>>    		conn->sndbuf_desc = buf_desc;
>>>>>> -		smc->sk.sk_sndbuf = bufsize;
>>>>>> +		smc->sk.sk_sndbuf = bufsize * 2;
>>>>>>    		atomic_set(&conn->sndbuf_space, bufsize);
>>>>>>    	}
>>>>>>    	return 0;
>>>>>> -- 
>>>>>> 2.34.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net] net/smc: Fix expected buffersizes and sync logic
  2022-11-25  6:15           ` Jan Karcher
@ 2022-11-25  7:05             ` Tony Lu
  2022-11-25 10:59               ` Alexandra Winter
  0 siblings, 1 reply; 13+ messages in thread
From: Tony Lu @ 2022-11-25  7:05 UTC (permalink / raw)
  To: Jan Karcher, Alexandra Winter
  Cc: David Miller, Jakub Kicinski, netdev, linux-s390, Heiko Carstens,
	Wenjia Zhang, Thorsten Winkler, Stefan Raspl, Karsten Graul

On Fri, Nov 25, 2022 at 07:15:33AM +0100, Jan Karcher wrote:
> 
> 
> On 24/11/2022 15:07, Alexandra Winter wrote:
> > 
> > 
> > On 24.11.22 14:00, Alexandra Winter wrote:
> > > 
> > > 
> [ ... ]>>>>> On Wed, Nov 23, 2022 at 11:49:07AM +0100, Jan Karcher wrote:
> > > > > > > The fixed commit changed the expected behavior of buffersizes
> > > > > > > set by the user using the setsockopt mechanism.
> > > > > > > Before the fixed patch the logic for determining the buffersizes used
> > > > > > > was the following:
> > > > > > > 
> > > > > > > default  = net.ipv4.tcp_{w|r}mem[1]
> > Jan, you explained to me: "the minima is 16Kib. This is enforced in smc_compress_bufsize
> > which would move any value <= 16Kib into bucket 0 - which is 16KiB "
> > net.ipv4.tcp_wmem[1] defaults to 8Kib. So in the default case (unchanged net.ipv4.tcp_wmem[1])
> > the default for the send path is not net.ipv4.tcp_wmem[1]. Should be clarified here.
> 
> The default value is still set to the net.ipv4.tcp_{w|r}mem[1]. This is a
> *very* top level overview about what is happening and *not* a documentation.
> I don't really want to explain the full code flow here.
> 
> What we still should do - as Tony aggreed on - is documenting the SMC
> behavior. This is a follow up on my list.

Hello Jan and Alexandra,

It looks like the misalignment of information is causing some trouble,
which is introduced by my patch. Maybe we could have an off-maillist and
online meeting to discussion?

We have some progress updates of scalability, and we are really like the
extension of SMC-D. Also we have some ideas for SMC, in case of
misalignment of information, we'd like to put them on the table and
discuss them earlier. Maybe an online meeting is an efficient way. What
do you think?

If possible, I would prepared the meetings and topics and send them to
everyone first.

Cheers,
Tony Lu

> 
> > > > > > > sockopt  = the setsockopt mechanism
> > > > > > > val      = the value assigned in default or via setsockopt
> > > > > > > sk_buf   = short for sk_{snd|rcv}buf
> > > > > > > real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> > > > > > > 
> > > > > > >     exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
> > > > > > >               |                  |                |
> > > > > > > +---------+ |                  | +------------+ | +-------------------+
> > > > > > > | default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
> > > > > > > +---------+ |                  | +------------+ | +-------------------+
> > > > > > >               |                  |                |    ^
> > > > > > >               |                  |                |    |
> > > > > > > +---------+ | +--------------+ |                |    |
> > > > > > > | sockopt |---| sk_buf=val*2 |-----------------------|
> > > > > > > +---------+ | +--------------+ |                |
> > > > > > >               |                  |                |
> > > > > > > 
> > > > > > > The fixed patch introduced a dedicated sysctl for smc
> > > > > > > and removed the /2 in smc_core.c resulting in the following flow:
> > > > > > > 
> > > > > > > default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
> > > > > > > sockopt  = the setsockopt mechanism
> > > > > > > val      = the value assigned in default or via setsockopt
> > > > > > > sk_buf   = short for sk_{snd|rcv}buf
> > > > > > > real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> > > > > > > 
> > > > > > >     exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
> > > > > > >               |                  |                |
> > > > > > > +---------+ |                  | +------------+ | +-----------------+
> > > > > > > | default |----------------------| sk_buf=val |---| real_buf=sk_buf |
> > > > > > > +---------+ |                  | +------------+ | +-----------------+
> > > > > > >               |                  |                |    ^
> > > > > > >               |                  |                |    |
> > > > > > > +---------+ | +--------------+ |                |    |
> > > > > > > | sockopt |---| sk_buf=val*2 |-----------------------|
> > > > > > > +---------+ | +--------------+ |                |
> > > > > > >               |                  |                |
> > > > > > > 
> > > > > > > This would result in double of memory used for existing configurations
> > > > > > > that are using setsockopt.
> > > > > > 
> > > > > > Firstly, thanks for your detailed diagrams :-)
> > > > > > 
> > > > > > And the original decision to use user-provided values rather than
> > > > > > value/2 to follow the instructions of the socket manual [1].
> > > > > > 
> > > > > >     SO_RCVBUF
> > > > > >            Sets or gets the maximum socket receive buffer in bytes.
> > > > > >            The kernel doubles this value (to allow space for
> > > > > >            bookkeeping overhead) when it is set using setsockopt(2),
> > > > > >            and this doubled value is returned by getsockopt(2).  The
> > > > > >            default value is set by the
> > > > > >            /proc/sys/net/core/rmem_default file, and the maximum
> > > > > >            allowed value is set by the /proc/sys/net/core/rmem_max
> > > > > >            file.  The minimum (doubled) value for this option is 256.
> > > > > > 
> > > > > > [1] https://man7.org/linux/man-pages/man7/socket.7.html
> > > > > > 
> > > > > > The user of SMC should know that setsockopt() with SO_{RCV|SND}BUF will
> > > > > 
> > > > > I totally agree that an educated user of SMC should know about that behavior
> > > > > if they decide to use it.
> > > > > We do provide our users preload libraries where they can pass preferred
> > > > > buffersizes via arguments and we handle the Sockopts for them.
> > > > > 
> > > > > > double the values in kernel, and getsockopt() will return the doubled
> > > > > > values. So that they should use half of the values which are passed to
> > > > > > setsockopt(). The original patch tries to make things easier in SMC and
> > > > > > let user-space to handle them following the socket manual.
> > > > > > 
> > > > > > > SMC historically decided to use the explicit value given by the user
> > > > > > > to allocate the memory. This is why we used the /2 in smc_core.c.
> > > > > > > That logic was not applied to the default value.
> > > > > > 
> > > > > > Yep, let back to the patch which introduced smc_{w|r}mem knobs, it's a
> > > > > > trade-off to follow original logic of SMC, or follow the socket manual.
> > > > > > We decides to follow the instruction of manuals in the end.
> > > > > 
> > > > > I understand the point. I spend a lot of time trying to decide what to do.
> > > > > 
> > > > > Since it was an intentional decision to not follow the general socket
> > > > > option, and we do not have anyone complaining we do not really have a reason
> > > > > to change it.
> > > > > Changing it means that users with existing configurations would have to
> > > > > change their configs on an update or suddenly expect double the memory
> > > > > consumption.
> > > > > That's why we in the end preffered to stay with the current logic.
> > > > 
> > > > I can't agree with you more with the points to follow the historic logic
> > > > and not break the user-space applications.
> > > > 
> > > > > I'm thinking that maybe - if we stay with the historic logic - we should
> > > > > document that desicion somewhere. So that in the future, if a user that
> > > > > expects the man page behavior, has a way to understand what SMC is doing.
> > > > > What do oyu think?
> > > > 
> > > > Yep, we _really_ need to document it if we change the convention.
> > > > Actually, I spent a lot of time to find the history about the logic of
> > > > buffer (/2 and *2) in SMC. So I'm really in favor of adding
> > > > documentation, at least code comments to help others to understand them.
> > > > 
> > > > Cheers,
> > > > Tony Lu
> > > Iiuc you are changing the default values in this a patch and your other patch:
> > > Default values for real_buf for send and receive:
> > > 
> > > before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> > >      real_buf=net.ipv4.tcp_{w|r}mem[1]/2   send: 8k  recv: 64k
> >        see above: 			    send: 16k recv: 64k
> > > after 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> > > real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
> > > 
> > > after net/smc: Fix expected buffersizes and sync logic
> > > real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
> > > 
> > > after net/smc: Unbind smc control from tcp control
> > > real_buf=SMC_*BUF_INIT_SIZE   send: 16k (16384) recv: 64k (65536)
> > > 
> > > If my understanding is correct, then I nack this.
> > > Defaults should be restored to the values before 0227f058aa29.
> > > Otherwise users will notice a change in memory usage that needs to
> > > be avoided or announced more explicitely. (and don't change them twice)
> > See above, I stand corrected. However this patch fixes/restores the buffersize
> > for setsockopt, but not for the default recieve path.
> > Could you please clarify that in the title and description?
> > 
> 
> I am trying to keep the commit title as crisp as possible while providing
> enough information and set the context in the commit message:
> 
> "The fixed commit changed the expected behavior of buffersizes set by the
> user using the setsockopt mechanism."
> 
>  + There is now a whole e-mail thread to consult in case of any further
> questions.
> 
> Thank you for your comments
> - Jan
> 
> > Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
> > > > > - Jan
> > > > > 
> > > > > > 
> > > > > > Cheers,
> > > > > > Tony Lu
> > > > > > 
> > > > > > > Since we now have our own sysctl, which is also exposed to the user,
> > > > > > > we should sync the logic in a way that both values are the real value
> > > > > > > used by our code and shown by smc_stats. To achieve this this patch
> > > > > > > changes the behavior to:
> > > > > > > 
> > > > > > > default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
> > > > > > > sockopt  = the setsockopt mechanism
> > > > > > > val      = the value assigned in default or via setsockopt
> > > > > > > sk_buf   = short for sk_{snd|rcv}buf
> > > > > > > real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> > > > > > > 
> > > > > > >     exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
> > > > > > >               |                  |                 |
> > > > > > > +---------+ |                  | +-------------+ | +-----------------+
> > > > > > > | default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
> > > > > > > +---------+ |                  | +-------------+ | +-----------------+
> > > > > > >               |                  |                 |    ^
> > > > > > >               |                  |                 |    |
> > > > > > > +---------+ | +--------------+ |                 |    |
> > > > > > > | sockopt |---| sk_buf=val*2 |------------------------|
> > > > > > > +---------+ | +--------------+ |                 |
> > > > > > >               |                  |                 |
> > > > > > > 
> > > > > > > This way both paths follow the same pattern and the expected behavior
> > > > > > > is re-established.
> > > > > > > 
> > > > > > > Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> > > > > > > Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
> > > > > > > Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
> > > > > > > ---
> > > > > > >    net/smc/af_smc.c   | 9 +++++++--
> > > > > > >    net/smc/smc_core.c | 8 ++++----
> > > > > > >    2 files changed, 11 insertions(+), 6 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> > > > > > > index 036532cf39aa..a8c84e7bac99 100644
> > > > > > > --- a/net/smc/af_smc.c
> > > > > > > +++ b/net/smc/af_smc.c
> > > > > > > @@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
> > > > > > >    static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
> > > > > > >    				   int protocol)
> > > > > > >    {
> > > > > > > +	int buffersize_without_overhead;
> > > > > > >    	struct smc_sock *smc;
> > > > > > >    	struct proto *prot;
> > > > > > >    	struct sock *sk;
> > > > > > > @@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
> > > > > > >    	sk->sk_state = SMC_INIT;
> > > > > > >    	sk->sk_destruct = smc_destruct;
> > > > > > >    	sk->sk_protocol = protocol;
> > > > > > > -	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
> > > > > > > -	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
> > > > > > > +	buffersize_without_overhead =
> > > > > > > +		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
> > > > > > > +	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
> > > > > > > +	buffersize_without_overhead =
> > > > > > > +		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
> > > > > > > +	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
> > > > > > >    	smc = smc_sk(sk);
> > > > > > >    	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
> > > > > > >    	INIT_WORK(&smc->connect_work, smc_connect_work);
> > > > > > > diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> > > > > > > index 00fb352c2765..36850a2ae167 100644
> > > > > > > --- a/net/smc/smc_core.c
> > > > > > > +++ b/net/smc/smc_core.c
> > > > > > > @@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
> > > > > > >    	if (is_rmb)
> > > > > > >    		/* use socket recv buffer size (w/o overhead) as start value */
> > > > > > > -		sk_buf_size = smc->sk.sk_rcvbuf;
> > > > > > > +		sk_buf_size = smc->sk.sk_rcvbuf / 2;
> > > > > > >    	else
> > > > > > >    		/* use socket send buffer size (w/o overhead) as start value */
> > > > > > > -		sk_buf_size = smc->sk.sk_sndbuf;
> > > > > > > +		sk_buf_size = smc->sk.sk_sndbuf / 2;
> > > > > > >    	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
> > > > > > >    	     bufsize_short >= 0; bufsize_short--) {
> > > > > > > @@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
> > > > > > >    	if (is_rmb) {
> > > > > > >    		conn->rmb_desc = buf_desc;
> > > > > > >    		conn->rmbe_size_short = bufsize_short;
> > > > > > > -		smc->sk.sk_rcvbuf = bufsize;
> > > > > > > +		smc->sk.sk_rcvbuf = bufsize * 2;
> > > > > > >    		atomic_set(&conn->bytes_to_rcv, 0);
> > > > > > >    		conn->rmbe_update_limit =
> > > > > > >    			smc_rmb_wnd_update_limit(buf_desc->len);
> > > > > > > @@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
> > > > > > >    			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
> > > > > > >    	} else {
> > > > > > >    		conn->sndbuf_desc = buf_desc;
> > > > > > > -		smc->sk.sk_sndbuf = bufsize;
> > > > > > > +		smc->sk.sk_sndbuf = bufsize * 2;
> > > > > > >    		atomic_set(&conn->sndbuf_space, bufsize);
> > > > > > >    	}
> > > > > > >    	return 0;
> > > > > > > -- 
> > > > > > > 2.34.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net] net/smc: Fix expected buffersizes and sync logic
  2022-11-25  7:05             ` Tony Lu
@ 2022-11-25 10:59               ` Alexandra Winter
  2022-11-28  4:33                 ` Tony Lu
  2022-12-02 13:10                 ` Jan Karcher
  0 siblings, 2 replies; 13+ messages in thread
From: Alexandra Winter @ 2022-11-25 10:59 UTC (permalink / raw)
  To: Tony Lu, Jan Karcher
  Cc: David Miller, Jakub Kicinski, netdev, linux-s390, Heiko Carstens,
	Wenjia Zhang, Thorsten Winkler, Stefan Raspl, Karsten Graul



On 25.11.22 08:05, Tony Lu wrote:
> On Fri, Nov 25, 2022 at 07:15:33AM +0100, Jan Karcher wrote:
>>
>>
>> On 24/11/2022 15:07, Alexandra Winter wrote:
>>>
>>>
>>> On 24.11.22 14:00, Alexandra Winter wrote:
>>>>
>>>>
>> [ ... ]>>>>> On Wed, Nov 23, 2022 at 11:49:07AM +0100, Jan Karcher wrote:
>>>>>>>> The fixed commit changed the expected behavior of buffersizes
>>>>>>>> set by the user using the setsockopt mechanism.
>>>>>>>> Before the fixed patch the logic for determining the buffersizes used
>>>>>>>> was the following:
>>>>>>>>
>>>>>>>> default  = net.ipv4.tcp_{w|r}mem[1]
>>> Jan, you explained to me: "the minima is 16Kib. This is enforced in smc_compress_bufsize
>>> which would move any value <= 16Kib into bucket 0 - which is 16KiB "
>>> net.ipv4.tcp_wmem[1] defaults to 8Kib. So in the default case (unchanged net.ipv4.tcp_wmem[1])
>>> the default for the send path is not net.ipv4.tcp_wmem[1]. Should be clarified here.
>>
>> The default value is still set to the net.ipv4.tcp_{w|r}mem[1]. This is a
>> *very* top level overview about what is happening and *not* a documentation.
>> I don't really want to explain the full code flow here.
>>
>> What we still should do - as Tony aggreed on - is documenting the SMC
>> behavior. This is a follow up on my list.
> 
> Hello Jan and Alexandra,
> 
> It looks like the misalignment of information is causing some trouble,
> which is introduced by my patch. Maybe we could have an off-maillist and
> online meeting to discussion?
> 
> We have some progress updates of scalability, and we are really like the
> extension of SMC-D. Also we have some ideas for SMC, in case of
> misalignment of information, we'd like to put them on the table and
> discuss them earlier. Maybe an online meeting is an efficient way. What
> do you think?
> 
> If possible, I would prepared the meetings and topics and send them to
> everyone first.
> 
> Cheers,
> Tony Lu
> 

Thanks a lot for your constructive proposals Tony. Yes, we should have a discussion off-mailinglist
about future topics.

My remaining concern for this fix is the default values (user does not use setsockopt, nor 
changes the new sysfs parameters, nor changes tcp defaults):
>>>> before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
	    send: 16k recv: 64k
>>>> after net/smc: Fix expected buffersizes and sync logic   (this patch)
>>>>       send: 16k recv: 128k

@Jan, as this is the only patch you want to send to net, please change the default size of
the receive buffers back to 64k (I don't care how).


>>
>>>>>>>> sockopt  = the setsockopt mechanism
>>>>>>>> val      = the value assigned in default or via setsockopt
>>>>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>>>>
>>>>>>>>     exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>>>>>>>               |                  |                |
>>>>>>>> +---------+ |                  | +------------+ | +-------------------+
>>>>>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
>>>>>>>> +---------+ |                  | +------------+ | +-------------------+
>>>>>>>>               |                  |                |    ^
>>>>>>>>               |                  |                |    |
>>>>>>>> +---------+ | +--------------+ |                |    |
>>>>>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
>>>>>>>> +---------+ | +--------------+ |                |
>>>>>>>>               |                  |                |
>>>>>>>>
>>>>>>>> The fixed patch introduced a dedicated sysctl for smc
>>>>>>>> and removed the /2 in smc_core.c resulting in the following flow:
>>>>>>>>
>>>>>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>>>>>>>> sockopt  = the setsockopt mechanism
>>>>>>>> val      = the value assigned in default or via setsockopt
>>>>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>>>>
>>>>>>>>     exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>>>>>>>               |                  |                |
>>>>>>>> +---------+ |                  | +------------+ | +-----------------+
>>>>>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf |
>>>>>>>> +---------+ |                  | +------------+ | +-----------------+
>>>>>>>>               |                  |                |    ^
>>>>>>>>               |                  |                |    |
>>>>>>>> +---------+ | +--------------+ |                |    |
>>>>>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
>>>>>>>> +---------+ | +--------------+ |                |
>>>>>>>>               |                  |                |
>>>>>>>>
>>>>>>>> This would result in double of memory used for existing configurations
>>>>>>>> that are using setsockopt.
>>>>>>>
>>>>>>> Firstly, thanks for your detailed diagrams :-)
>>>>>>>
>>>>>>> And the original decision to use user-provided values rather than
>>>>>>> value/2 to follow the instructions of the socket manual [1].
>>>>>>>
>>>>>>>     SO_RCVBUF
>>>>>>>            Sets or gets the maximum socket receive buffer in bytes.
>>>>>>>            The kernel doubles this value (to allow space for
>>>>>>>            bookkeeping overhead) when it is set using setsockopt(2),
>>>>>>>            and this doubled value is returned by getsockopt(2).  The
>>>>>>>            default value is set by the
>>>>>>>            /proc/sys/net/core/rmem_default file, and the maximum
>>>>>>>            allowed value is set by the /proc/sys/net/core/rmem_max
>>>>>>>            file.  The minimum (doubled) value for this option is 256.
>>>>>>>
>>>>>>> [1] https://man7.org/linux/man-pages/man7/socket.7.html
>>>>>>>
>>>>>>> The user of SMC should know that setsockopt() with SO_{RCV|SND}BUF will
>>>>>>
>>>>>> I totally agree that an educated user of SMC should know about that behavior
>>>>>> if they decide to use it.
>>>>>> We do provide our users preload libraries where they can pass preferred
>>>>>> buffersizes via arguments and we handle the Sockopts for them.
>>>>>>
>>>>>>> double the values in kernel, and getsockopt() will return the doubled
>>>>>>> values. So that they should use half of the values which are passed to
>>>>>>> setsockopt(). The original patch tries to make things easier in SMC and
>>>>>>> let user-space to handle them following the socket manual.
>>>>>>>
>>>>>>>> SMC historically decided to use the explicit value given by the user
>>>>>>>> to allocate the memory. This is why we used the /2 in smc_core.c.
>>>>>>>> That logic was not applied to the default value.
>>>>>>>
>>>>>>> Yep, let back to the patch which introduced smc_{w|r}mem knobs, it's a
>>>>>>> trade-off to follow original logic of SMC, or follow the socket manual.
>>>>>>> We decides to follow the instruction of manuals in the end.
>>>>>>
>>>>>> I understand the point. I spend a lot of time trying to decide what to do.
>>>>>>
>>>>>> Since it was an intentional decision to not follow the general socket
>>>>>> option, and we do not have anyone complaining we do not really have a reason
>>>>>> to change it.
>>>>>> Changing it means that users with existing configurations would have to
>>>>>> change their configs on an update or suddenly expect double the memory
>>>>>> consumption.
>>>>>> That's why we in the end preffered to stay with the current logic.
>>>>>
>>>>> I can't agree with you more with the points to follow the historic logic
>>>>> and not break the user-space applications.
>>>>>
>>>>>> I'm thinking that maybe - if we stay with the historic logic - we should
>>>>>> document that desicion somewhere. So that in the future, if a user that
>>>>>> expects the man page behavior, has a way to understand what SMC is doing.
>>>>>> What do oyu think?
>>>>>
>>>>> Yep, we _really_ need to document it if we change the convention.
>>>>> Actually, I spent a lot of time to find the history about the logic of
>>>>> buffer (/2 and *2) in SMC. So I'm really in favor of adding
>>>>> documentation, at least code comments to help others to understand them.
>>>>>
>>>>> Cheers,
>>>>> Tony Lu
>>>> Iiuc you are changing the default values in this a patch and your other patch:
>>>> Default values for real_buf for send and receive:
>>>>
>>>> before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>>>      real_buf=net.ipv4.tcp_{w|r}mem[1]/2   send: 8k  recv: 64k
>>>        see above: 			    send: 16k recv: 64k
>>>> after 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>>> real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
>>>>
>>>> after net/smc: Fix expected buffersizes and sync logic
>>>> real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
>>>>
>>>> after net/smc: Unbind smc control from tcp control
>>>> real_buf=SMC_*BUF_INIT_SIZE   send: 16k (16384) recv: 64k (65536)
>>>>
>>>> If my understanding is correct, then I nack this.
>>>> Defaults should be restored to the values before 0227f058aa29.
>>>> Otherwise users will notice a change in memory usage that needs to
>>>> be avoided or announced more explicitely. (and don't change them twice)
>>> See above, I stand corrected. However this patch fixes/restores the buffersize
>>> for setsockopt, but not for the default recieve path.
>>> Could you please clarify that in the title and description?
>>>
>>
>> I am trying to keep the commit title as crisp as possible while providing
>> enough information and set the context in the commit message:
>>
>> "The fixed commit changed the expected behavior of buffersizes set by the
>> user using the setsockopt mechanism."
>>
>>  + There is now a whole e-mail thread to consult in case of any further
>> questions.
>>
>> Thank you for your comments
>> - Jan
>>
>>> Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
>>>>>> - Jan
>>>>>>
>>>>>>>
>>>>>>> Cheers,
>>>>>>> Tony Lu
>>>>>>>
>>>>>>>> Since we now have our own sysctl, which is also exposed to the user,
>>>>>>>> we should sync the logic in a way that both values are the real value
>>>>>>>> used by our code and shown by smc_stats. To achieve this this patch
>>>>>>>> changes the behavior to:
>>>>>>>>
>>>>>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>>>>>>>> sockopt  = the setsockopt mechanism
>>>>>>>> val      = the value assigned in default or via setsockopt
>>>>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>>>>
>>>>>>>>     exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
>>>>>>>>               |                  |                 |
>>>>>>>> +---------+ |                  | +-------------+ | +-----------------+
>>>>>>>> | default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
>>>>>>>> +---------+ |                  | +-------------+ | +-----------------+
>>>>>>>>               |                  |                 |    ^
>>>>>>>>               |                  |                 |    |
>>>>>>>> +---------+ | +--------------+ |                 |    |
>>>>>>>> | sockopt |---| sk_buf=val*2 |------------------------|
>>>>>>>> +---------+ | +--------------+ |                 |
>>>>>>>>               |                  |                 |
>>>>>>>>
>>>>>>>> This way both paths follow the same pattern and the expected behavior
>>>>>>>> is re-established.
>>>>>>>>
>>>>>>>> Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>>>>>>> Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
>>>>>>>> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
>>>>>>>> ---
>>>>>>>>    net/smc/af_smc.c   | 9 +++++++--
>>>>>>>>    net/smc/smc_core.c | 8 ++++----
>>>>>>>>    2 files changed, 11 insertions(+), 6 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
>>>>>>>> index 036532cf39aa..a8c84e7bac99 100644
>>>>>>>> --- a/net/smc/af_smc.c
>>>>>>>> +++ b/net/smc/af_smc.c
>>>>>>>> @@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
>>>>>>>>    static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>>>>>>>    				   int protocol)
>>>>>>>>    {
>>>>>>>> +	int buffersize_without_overhead;
>>>>>>>>    	struct smc_sock *smc;
>>>>>>>>    	struct proto *prot;
>>>>>>>>    	struct sock *sk;
>>>>>>>> @@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>>>>>>>    	sk->sk_state = SMC_INIT;
>>>>>>>>    	sk->sk_destruct = smc_destruct;
>>>>>>>>    	sk->sk_protocol = protocol;
>>>>>>>> -	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
>>>>>>>> -	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
>>>>>>>> +	buffersize_without_overhead =
>>>>>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
>>>>>>>> +	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
>>>>>>>> +	buffersize_without_overhead =
>>>>>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
>>>>>>>> +	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
>>>>>>>>    	smc = smc_sk(sk);
>>>>>>>>    	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
>>>>>>>>    	INIT_WORK(&smc->connect_work, smc_connect_work);
>>>>>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>>>>>>> index 00fb352c2765..36850a2ae167 100644
>>>>>>>> --- a/net/smc/smc_core.c
>>>>>>>> +++ b/net/smc/smc_core.c
>>>>>>>> @@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>>>>    	if (is_rmb)
>>>>>>>>    		/* use socket recv buffer size (w/o overhead) as start value */
>>>>>>>> -		sk_buf_size = smc->sk.sk_rcvbuf;
>>>>>>>> +		sk_buf_size = smc->sk.sk_rcvbuf / 2;
>>>>>>>>    	else
>>>>>>>>    		/* use socket send buffer size (w/o overhead) as start value */
>>>>>>>> -		sk_buf_size = smc->sk.sk_sndbuf;
>>>>>>>> +		sk_buf_size = smc->sk.sk_sndbuf / 2;
>>>>>>>>    	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
>>>>>>>>    	     bufsize_short >= 0; bufsize_short--) {
>>>>>>>> @@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>>>>    	if (is_rmb) {
>>>>>>>>    		conn->rmb_desc = buf_desc;
>>>>>>>>    		conn->rmbe_size_short = bufsize_short;
>>>>>>>> -		smc->sk.sk_rcvbuf = bufsize;
>>>>>>>> +		smc->sk.sk_rcvbuf = bufsize * 2;
>>>>>>>>    		atomic_set(&conn->bytes_to_rcv, 0);
>>>>>>>>    		conn->rmbe_update_limit =
>>>>>>>>    			smc_rmb_wnd_update_limit(buf_desc->len);
>>>>>>>> @@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>>>>    			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
>>>>>>>>    	} else {
>>>>>>>>    		conn->sndbuf_desc = buf_desc;
>>>>>>>> -		smc->sk.sk_sndbuf = bufsize;
>>>>>>>> +		smc->sk.sk_sndbuf = bufsize * 2;
>>>>>>>>    		atomic_set(&conn->sndbuf_space, bufsize);
>>>>>>>>    	}
>>>>>>>>    	return 0;
>>>>>>>> -- 
>>>>>>>> 2.34.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net] net/smc: Fix expected buffersizes and sync logic
  2022-11-25 10:59               ` Alexandra Winter
@ 2022-11-28  4:33                 ` Tony Lu
  2022-11-28  8:32                   ` Wenjia Zhang
  2022-12-02 13:10                 ` Jan Karcher
  1 sibling, 1 reply; 13+ messages in thread
From: Tony Lu @ 2022-11-28  4:33 UTC (permalink / raw)
  To: Alexandra Winter, Jan Karcher
  Cc: David Miller, Jakub Kicinski, netdev, linux-s390, Heiko Carstens,
	Wenjia Zhang, Thorsten Winkler, Stefan Raspl, Karsten Graul

On Fri, Nov 25, 2022 at 11:59:46AM +0100, Alexandra Winter wrote:
> 
> 
> On 25.11.22 08:05, Tony Lu wrote:
> > On Fri, Nov 25, 2022 at 07:15:33AM +0100, Jan Karcher wrote:
> >>
> >>
> >> On 24/11/2022 15:07, Alexandra Winter wrote:
> >>>
> >>>
> >>> On 24.11.22 14:00, Alexandra Winter wrote:
> >>>>
> >>>>
> >> [ ... ]>>>>> On Wed, Nov 23, 2022 at 11:49:07AM +0100, Jan Karcher wrote:
> >>>>>>>> The fixed commit changed the expected behavior of buffersizes
> >>>>>>>> set by the user using the setsockopt mechanism.
> >>>>>>>> Before the fixed patch the logic for determining the buffersizes used
> >>>>>>>> was the following:
> >>>>>>>>
> >>>>>>>> default  = net.ipv4.tcp_{w|r}mem[1]
> >>> Jan, you explained to me: "the minima is 16Kib. This is enforced in smc_compress_bufsize
> >>> which would move any value <= 16Kib into bucket 0 - which is 16KiB "
> >>> net.ipv4.tcp_wmem[1] defaults to 8Kib. So in the default case (unchanged net.ipv4.tcp_wmem[1])
> >>> the default for the send path is not net.ipv4.tcp_wmem[1]. Should be clarified here.
> >>
> >> The default value is still set to the net.ipv4.tcp_{w|r}mem[1]. This is a
> >> *very* top level overview about what is happening and *not* a documentation.
> >> I don't really want to explain the full code flow here.
> >>
> >> What we still should do - as Tony aggreed on - is documenting the SMC
> >> behavior. This is a follow up on my list.
> > 
> > Hello Jan and Alexandra,
> > 
> > It looks like the misalignment of information is causing some trouble,
> > which is introduced by my patch. Maybe we could have an off-maillist and
> > online meeting to discussion?
> > 
> > We have some progress updates of scalability, and we are really like the
> > extension of SMC-D. Also we have some ideas for SMC, in case of
> > misalignment of information, we'd like to put them on the table and
> > discuss them earlier. Maybe an online meeting is an efficient way. What
> > do you think?
> > 
> > If possible, I would prepared the meetings and topics and send them to
> > everyone first.
> > 
> > Cheers,
> > Tony Lu
> > 
> 
> Thanks a lot for your constructive proposals Tony. Yes, we should have a discussion off-mailinglist
> about future topics.

I will prepare the discussion off-maillinglist ASAP. The email will be
sent out when it's ready. And Jan, What about your opinion?

Cheers,
Tony Lu

> 
> My remaining concern for this fix is the default values (user does not use setsockopt, nor 
> changes the new sysfs parameters, nor changes tcp defaults):
> >>>> before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> 	    send: 16k recv: 64k
> >>>> after net/smc: Fix expected buffersizes and sync logic   (this patch)
> >>>>       send: 16k recv: 128k
> 
> @Jan, as this is the only patch you want to send to net, please change the default size of
> the receive buffers back to 64k (I don't care how).
> 
> 
> >>
> >>>>>>>> sockopt  = the setsockopt mechanism
> >>>>>>>> val      = the value assigned in default or via setsockopt
> >>>>>>>> sk_buf   = short for sk_{snd|rcv}buf
> >>>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> >>>>>>>>
> >>>>>>>>     exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
> >>>>>>>>               |                  |                |
> >>>>>>>> +---------+ |                  | +------------+ | +-------------------+
> >>>>>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
> >>>>>>>> +---------+ |                  | +------------+ | +-------------------+
> >>>>>>>>               |                  |                |    ^
> >>>>>>>>               |                  |                |    |
> >>>>>>>> +---------+ | +--------------+ |                |    |
> >>>>>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
> >>>>>>>> +---------+ | +--------------+ |                |
> >>>>>>>>               |                  |                |
> >>>>>>>>
> >>>>>>>> The fixed patch introduced a dedicated sysctl for smc
> >>>>>>>> and removed the /2 in smc_core.c resulting in the following flow:
> >>>>>>>>
> >>>>>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
> >>>>>>>> sockopt  = the setsockopt mechanism
> >>>>>>>> val      = the value assigned in default or via setsockopt
> >>>>>>>> sk_buf   = short for sk_{snd|rcv}buf
> >>>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> >>>>>>>>
> >>>>>>>>     exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
> >>>>>>>>               |                  |                |
> >>>>>>>> +---------+ |                  | +------------+ | +-----------------+
> >>>>>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf |
> >>>>>>>> +---------+ |                  | +------------+ | +-----------------+
> >>>>>>>>               |                  |                |    ^
> >>>>>>>>               |                  |                |    |
> >>>>>>>> +---------+ | +--------------+ |                |    |
> >>>>>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
> >>>>>>>> +---------+ | +--------------+ |                |
> >>>>>>>>               |                  |                |
> >>>>>>>>
> >>>>>>>> This would result in double of memory used for existing configurations
> >>>>>>>> that are using setsockopt.
> >>>>>>>
> >>>>>>> Firstly, thanks for your detailed diagrams :-)
> >>>>>>>
> >>>>>>> And the original decision to use user-provided values rather than
> >>>>>>> value/2 to follow the instructions of the socket manual [1].
> >>>>>>>
> >>>>>>>     SO_RCVBUF
> >>>>>>>            Sets or gets the maximum socket receive buffer in bytes.
> >>>>>>>            The kernel doubles this value (to allow space for
> >>>>>>>            bookkeeping overhead) when it is set using setsockopt(2),
> >>>>>>>            and this doubled value is returned by getsockopt(2).  The
> >>>>>>>            default value is set by the
> >>>>>>>            /proc/sys/net/core/rmem_default file, and the maximum
> >>>>>>>            allowed value is set by the /proc/sys/net/core/rmem_max
> >>>>>>>            file.  The minimum (doubled) value for this option is 256.
> >>>>>>>
> >>>>>>> [1] https://man7.org/linux/man-pages/man7/socket.7.html
> >>>>>>>
> >>>>>>> The user of SMC should know that setsockopt() with SO_{RCV|SND}BUF will
> >>>>>>
> >>>>>> I totally agree that an educated user of SMC should know about that behavior
> >>>>>> if they decide to use it.
> >>>>>> We do provide our users preload libraries where they can pass preferred
> >>>>>> buffersizes via arguments and we handle the Sockopts for them.
> >>>>>>
> >>>>>>> double the values in kernel, and getsockopt() will return the doubled
> >>>>>>> values. So that they should use half of the values which are passed to
> >>>>>>> setsockopt(). The original patch tries to make things easier in SMC and
> >>>>>>> let user-space to handle them following the socket manual.
> >>>>>>>
> >>>>>>>> SMC historically decided to use the explicit value given by the user
> >>>>>>>> to allocate the memory. This is why we used the /2 in smc_core.c.
> >>>>>>>> That logic was not applied to the default value.
> >>>>>>>
> >>>>>>> Yep, let back to the patch which introduced smc_{w|r}mem knobs, it's a
> >>>>>>> trade-off to follow original logic of SMC, or follow the socket manual.
> >>>>>>> We decides to follow the instruction of manuals in the end.
> >>>>>>
> >>>>>> I understand the point. I spend a lot of time trying to decide what to do.
> >>>>>>
> >>>>>> Since it was an intentional decision to not follow the general socket
> >>>>>> option, and we do not have anyone complaining we do not really have a reason
> >>>>>> to change it.
> >>>>>> Changing it means that users with existing configurations would have to
> >>>>>> change their configs on an update or suddenly expect double the memory
> >>>>>> consumption.
> >>>>>> That's why we in the end preffered to stay with the current logic.
> >>>>>
> >>>>> I can't agree with you more with the points to follow the historic logic
> >>>>> and not break the user-space applications.
> >>>>>
> >>>>>> I'm thinking that maybe - if we stay with the historic logic - we should
> >>>>>> document that desicion somewhere. So that in the future, if a user that
> >>>>>> expects the man page behavior, has a way to understand what SMC is doing.
> >>>>>> What do oyu think?
> >>>>>
> >>>>> Yep, we _really_ need to document it if we change the convention.
> >>>>> Actually, I spent a lot of time to find the history about the logic of
> >>>>> buffer (/2 and *2) in SMC. So I'm really in favor of adding
> >>>>> documentation, at least code comments to help others to understand them.
> >>>>>
> >>>>> Cheers,
> >>>>> Tony Lu
> >>>> Iiuc you are changing the default values in this a patch and your other patch:
> >>>> Default values for real_buf for send and receive:
> >>>>
> >>>> before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> >>>>      real_buf=net.ipv4.tcp_{w|r}mem[1]/2   send: 8k  recv: 64k
> >>>        see above: 			    send: 16k recv: 64k
> >>>> after 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> >>>> real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
> >>>>
> >>>> after net/smc: Fix expected buffersizes and sync logic
> >>>> real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
> >>>>
> >>>> after net/smc: Unbind smc control from tcp control
> >>>> real_buf=SMC_*BUF_INIT_SIZE   send: 16k (16384) recv: 64k (65536)
> >>>>
> >>>> If my understanding is correct, then I nack this.
> >>>> Defaults should be restored to the values before 0227f058aa29.
> >>>> Otherwise users will notice a change in memory usage that needs to
> >>>> be avoided or announced more explicitely. (and don't change them twice)
> >>> See above, I stand corrected. However this patch fixes/restores the buffersize
> >>> for setsockopt, but not for the default recieve path.
> >>> Could you please clarify that in the title and description?
> >>>
> >>
> >> I am trying to keep the commit title as crisp as possible while providing
> >> enough information and set the context in the commit message:
> >>
> >> "The fixed commit changed the expected behavior of buffersizes set by the
> >> user using the setsockopt mechanism."
> >>
> >>  + There is now a whole e-mail thread to consult in case of any further
> >> questions.
> >>
> >> Thank you for your comments
> >> - Jan
> >>
> >>> Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
> >>>>>> - Jan
> >>>>>>
> >>>>>>>
> >>>>>>> Cheers,
> >>>>>>> Tony Lu
> >>>>>>>
> >>>>>>>> Since we now have our own sysctl, which is also exposed to the user,
> >>>>>>>> we should sync the logic in a way that both values are the real value
> >>>>>>>> used by our code and shown by smc_stats. To achieve this this patch
> >>>>>>>> changes the behavior to:
> >>>>>>>>
> >>>>>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
> >>>>>>>> sockopt  = the setsockopt mechanism
> >>>>>>>> val      = the value assigned in default or via setsockopt
> >>>>>>>> sk_buf   = short for sk_{snd|rcv}buf
> >>>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> >>>>>>>>
> >>>>>>>>     exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
> >>>>>>>>               |                  |                 |
> >>>>>>>> +---------+ |                  | +-------------+ | +-----------------+
> >>>>>>>> | default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
> >>>>>>>> +---------+ |                  | +-------------+ | +-----------------+
> >>>>>>>>               |                  |                 |    ^
> >>>>>>>>               |                  |                 |    |
> >>>>>>>> +---------+ | +--------------+ |                 |    |
> >>>>>>>> | sockopt |---| sk_buf=val*2 |------------------------|
> >>>>>>>> +---------+ | +--------------+ |                 |
> >>>>>>>>               |                  |                 |
> >>>>>>>>
> >>>>>>>> This way both paths follow the same pattern and the expected behavior
> >>>>>>>> is re-established.
> >>>>>>>>
> >>>>>>>> Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> >>>>>>>> Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
> >>>>>>>> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
> >>>>>>>> ---
> >>>>>>>>    net/smc/af_smc.c   | 9 +++++++--
> >>>>>>>>    net/smc/smc_core.c | 8 ++++----
> >>>>>>>>    2 files changed, 11 insertions(+), 6 deletions(-)
> >>>>>>>>
> >>>>>>>> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> >>>>>>>> index 036532cf39aa..a8c84e7bac99 100644
> >>>>>>>> --- a/net/smc/af_smc.c
> >>>>>>>> +++ b/net/smc/af_smc.c
> >>>>>>>> @@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
> >>>>>>>>    static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
> >>>>>>>>    				   int protocol)
> >>>>>>>>    {
> >>>>>>>> +	int buffersize_without_overhead;
> >>>>>>>>    	struct smc_sock *smc;
> >>>>>>>>    	struct proto *prot;
> >>>>>>>>    	struct sock *sk;
> >>>>>>>> @@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
> >>>>>>>>    	sk->sk_state = SMC_INIT;
> >>>>>>>>    	sk->sk_destruct = smc_destruct;
> >>>>>>>>    	sk->sk_protocol = protocol;
> >>>>>>>> -	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
> >>>>>>>> -	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
> >>>>>>>> +	buffersize_without_overhead =
> >>>>>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
> >>>>>>>> +	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
> >>>>>>>> +	buffersize_without_overhead =
> >>>>>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
> >>>>>>>> +	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
> >>>>>>>>    	smc = smc_sk(sk);
> >>>>>>>>    	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
> >>>>>>>>    	INIT_WORK(&smc->connect_work, smc_connect_work);
> >>>>>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> >>>>>>>> index 00fb352c2765..36850a2ae167 100644
> >>>>>>>> --- a/net/smc/smc_core.c
> >>>>>>>> +++ b/net/smc/smc_core.c
> >>>>>>>> @@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
> >>>>>>>>    	if (is_rmb)
> >>>>>>>>    		/* use socket recv buffer size (w/o overhead) as start value */
> >>>>>>>> -		sk_buf_size = smc->sk.sk_rcvbuf;
> >>>>>>>> +		sk_buf_size = smc->sk.sk_rcvbuf / 2;
> >>>>>>>>    	else
> >>>>>>>>    		/* use socket send buffer size (w/o overhead) as start value */
> >>>>>>>> -		sk_buf_size = smc->sk.sk_sndbuf;
> >>>>>>>> +		sk_buf_size = smc->sk.sk_sndbuf / 2;
> >>>>>>>>    	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
> >>>>>>>>    	     bufsize_short >= 0; bufsize_short--) {
> >>>>>>>> @@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
> >>>>>>>>    	if (is_rmb) {
> >>>>>>>>    		conn->rmb_desc = buf_desc;
> >>>>>>>>    		conn->rmbe_size_short = bufsize_short;
> >>>>>>>> -		smc->sk.sk_rcvbuf = bufsize;
> >>>>>>>> +		smc->sk.sk_rcvbuf = bufsize * 2;
> >>>>>>>>    		atomic_set(&conn->bytes_to_rcv, 0);
> >>>>>>>>    		conn->rmbe_update_limit =
> >>>>>>>>    			smc_rmb_wnd_update_limit(buf_desc->len);
> >>>>>>>> @@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
> >>>>>>>>    			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
> >>>>>>>>    	} else {
> >>>>>>>>    		conn->sndbuf_desc = buf_desc;
> >>>>>>>> -		smc->sk.sk_sndbuf = bufsize;
> >>>>>>>> +		smc->sk.sk_sndbuf = bufsize * 2;
> >>>>>>>>    		atomic_set(&conn->sndbuf_space, bufsize);
> >>>>>>>>    	}
> >>>>>>>>    	return 0;
> >>>>>>>> -- 
> >>>>>>>> 2.34.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net] net/smc: Fix expected buffersizes and sync logic
  2022-11-28  4:33                 ` Tony Lu
@ 2022-11-28  8:32                   ` Wenjia Zhang
  2022-11-28 12:24                     ` Tony Lu
  0 siblings, 1 reply; 13+ messages in thread
From: Wenjia Zhang @ 2022-11-28  8:32 UTC (permalink / raw)
  To: Tony Lu
  Cc: Alexandra Winter, Jan Karcher, David Miller, Jakub Kicinski,
	netdev, linux-s390, Heiko Carstens, Thorsten Winkler,
	Stefan Raspl, Karsten Graul



On 28.11.22 05:33, Tony Lu wrote:
> On Fri, Nov 25, 2022 at 11:59:46AM +0100, Alexandra Winter wrote:
>>
>>
>> On 25.11.22 08:05, Tony Lu wrote:
>>> On Fri, Nov 25, 2022 at 07:15:33AM +0100, Jan Karcher wrote:
>>>>
>>>>
>>>> On 24/11/2022 15:07, Alexandra Winter wrote:
>>>>>
>>>>>
>>>>> On 24.11.22 14:00, Alexandra Winter wrote:
>>>>>>
>>>>>>
>>>> [ ... ]>>>>> On Wed, Nov 23, 2022 at 11:49:07AM +0100, Jan Karcher wrote:
>>>>>>>>>> The fixed commit changed the expected behavior of buffersizes
>>>>>>>>>> set by the user using the setsockopt mechanism.
>>>>>>>>>> Before the fixed patch the logic for determining the buffersizes used
>>>>>>>>>> was the following:
>>>>>>>>>>
>>>>>>>>>> default  = net.ipv4.tcp_{w|r}mem[1]
>>>>> Jan, you explained to me: "the minima is 16Kib. This is enforced in smc_compress_bufsize
>>>>> which would move any value <= 16Kib into bucket 0 - which is 16KiB "
>>>>> net.ipv4.tcp_wmem[1] defaults to 8Kib. So in the default case (unchanged net.ipv4.tcp_wmem[1])
>>>>> the default for the send path is not net.ipv4.tcp_wmem[1]. Should be clarified here.
>>>>
>>>> The default value is still set to the net.ipv4.tcp_{w|r}mem[1]. This is a
>>>> *very* top level overview about what is happening and *not* a documentation.
>>>> I don't really want to explain the full code flow here.
>>>>
>>>> What we still should do - as Tony aggreed on - is documenting the SMC
>>>> behavior. This is a follow up on my list.
>>>
>>> Hello Jan and Alexandra,
>>>
>>> It looks like the misalignment of information is causing some trouble,
>>> which is introduced by my patch. Maybe we could have an off-maillist and
>>> online meeting to discussion?
>>>
>>> We have some progress updates of scalability, and we are really like the
>>> extension of SMC-D. Also we have some ideas for SMC, in case of
>>> misalignment of information, we'd like to put them on the table and
>>> discuss them earlier. Maybe an online meeting is an efficient way. What
>>> do you think?
>>>
>>> If possible, I would prepared the meetings and topics and send them to
>>> everyone first.
>>>
>>> Cheers,
>>> Tony Lu
>>>
>>
>> Thanks a lot for your constructive proposals Tony. Yes, we should have a discussion off-mailinglist
>> about future topics.
> 
> I will prepare the discussion off-maillinglist ASAP. The email will be
> sent out when it's ready. And Jan, What about your opinion?
> 
> Cheers,
> Tony Lu
> 
Hi Tony,

Sorry for the flurry we brought!

It's very nice to know that you got progress on the scalability.

Firstly the off-millinglist is a good idea! Let's know if yor're ready.

About the meetings I would ask for your understanding that I still can 
not give any guarantee. But I would let you know ASAP after I talk to 
our team.

Best,
Wenjia
>>
>> My remaining concern for this fix is the default values (user does not use setsockopt, nor
>> changes the new sysfs parameters, nor changes tcp defaults):
>>>>>> before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>> 	    send: 16k recv: 64k
>>>>>> after net/smc: Fix expected buffersizes and sync logic   (this patch)
>>>>>>        send: 16k recv: 128k
>>
>> @Jan, as this is the only patch you want to send to net, please change the default size of
>> the receive buffers back to 64k (I don't care how).
>>
>>
>>>>
>>>>>>>>>> sockopt  = the setsockopt mechanism
>>>>>>>>>> val      = the value assigned in default or via setsockopt
>>>>>>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>>>>>>
>>>>>>>>>>      exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>>>>>>>>>                |                  |                |
>>>>>>>>>> +---------+ |                  | +------------+ | +-------------------+
>>>>>>>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
>>>>>>>>>> +---------+ |                  | +------------+ | +-------------------+
>>>>>>>>>>                |                  |                |    ^
>>>>>>>>>>                |                  |                |    |
>>>>>>>>>> +---------+ | +--------------+ |                |    |
>>>>>>>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
>>>>>>>>>> +---------+ | +--------------+ |                |
>>>>>>>>>>                |                  |                |
>>>>>>>>>>
>>>>>>>>>> The fixed patch introduced a dedicated sysctl for smc
>>>>>>>>>> and removed the /2 in smc_core.c resulting in the following flow:
>>>>>>>>>>
>>>>>>>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>>>>>>>>>> sockopt  = the setsockopt mechanism
>>>>>>>>>> val      = the value assigned in default or via setsockopt
>>>>>>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>>>>>>
>>>>>>>>>>      exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>>>>>>>>>                |                  |                |
>>>>>>>>>> +---------+ |                  | +------------+ | +-----------------+
>>>>>>>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf |
>>>>>>>>>> +---------+ |                  | +------------+ | +-----------------+
>>>>>>>>>>                |                  |                |    ^
>>>>>>>>>>                |                  |                |    |
>>>>>>>>>> +---------+ | +--------------+ |                |    |
>>>>>>>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
>>>>>>>>>> +---------+ | +--------------+ |                |
>>>>>>>>>>                |                  |                |
>>>>>>>>>>
>>>>>>>>>> This would result in double of memory used for existing configurations
>>>>>>>>>> that are using setsockopt.
>>>>>>>>>
>>>>>>>>> Firstly, thanks for your detailed diagrams :-)
>>>>>>>>>
>>>>>>>>> And the original decision to use user-provided values rather than
>>>>>>>>> value/2 to follow the instructions of the socket manual [1].
>>>>>>>>>
>>>>>>>>>      SO_RCVBUF
>>>>>>>>>             Sets or gets the maximum socket receive buffer in bytes.
>>>>>>>>>             The kernel doubles this value (to allow space for
>>>>>>>>>             bookkeeping overhead) when it is set using setsockopt(2),
>>>>>>>>>             and this doubled value is returned by getsockopt(2).  The
>>>>>>>>>             default value is set by the
>>>>>>>>>             /proc/sys/net/core/rmem_default file, and the maximum
>>>>>>>>>             allowed value is set by the /proc/sys/net/core/rmem_max
>>>>>>>>>             file.  The minimum (doubled) value for this option is 256.
>>>>>>>>>
>>>>>>>>> [1] https://man7.org/linux/man-pages/man7/socket.7.html
>>>>>>>>>
>>>>>>>>> The user of SMC should know that setsockopt() with SO_{RCV|SND}BUF will
>>>>>>>>
>>>>>>>> I totally agree that an educated user of SMC should know about that behavior
>>>>>>>> if they decide to use it.
>>>>>>>> We do provide our users preload libraries where they can pass preferred
>>>>>>>> buffersizes via arguments and we handle the Sockopts for them.
>>>>>>>>
>>>>>>>>> double the values in kernel, and getsockopt() will return the doubled
>>>>>>>>> values. So that they should use half of the values which are passed to
>>>>>>>>> setsockopt(). The original patch tries to make things easier in SMC and
>>>>>>>>> let user-space to handle them following the socket manual.
>>>>>>>>>
>>>>>>>>>> SMC historically decided to use the explicit value given by the user
>>>>>>>>>> to allocate the memory. This is why we used the /2 in smc_core.c.
>>>>>>>>>> That logic was not applied to the default value.
>>>>>>>>>
>>>>>>>>> Yep, let back to the patch which introduced smc_{w|r}mem knobs, it's a
>>>>>>>>> trade-off to follow original logic of SMC, or follow the socket manual.
>>>>>>>>> We decides to follow the instruction of manuals in the end.
>>>>>>>>
>>>>>>>> I understand the point. I spend a lot of time trying to decide what to do.
>>>>>>>>
>>>>>>>> Since it was an intentional decision to not follow the general socket
>>>>>>>> option, and we do not have anyone complaining we do not really have a reason
>>>>>>>> to change it.
>>>>>>>> Changing it means that users with existing configurations would have to
>>>>>>>> change their configs on an update or suddenly expect double the memory
>>>>>>>> consumption.
>>>>>>>> That's why we in the end preffered to stay with the current logic.
>>>>>>>
>>>>>>> I can't agree with you more with the points to follow the historic logic
>>>>>>> and not break the user-space applications.
>>>>>>>
>>>>>>>> I'm thinking that maybe - if we stay with the historic logic - we should
>>>>>>>> document that desicion somewhere. So that in the future, if a user that
>>>>>>>> expects the man page behavior, has a way to understand what SMC is doing.
>>>>>>>> What do oyu think?
>>>>>>>
>>>>>>> Yep, we _really_ need to document it if we change the convention.
>>>>>>> Actually, I spent a lot of time to find the history about the logic of
>>>>>>> buffer (/2 and *2) in SMC. So I'm really in favor of adding
>>>>>>> documentation, at least code comments to help others to understand them.
>>>>>>>
>>>>>>> Cheers,
>>>>>>> Tony Lu
>>>>>> Iiuc you are changing the default values in this a patch and your other patch:
>>>>>> Default values for real_buf for send and receive:
>>>>>>
>>>>>> before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>>>>>       real_buf=net.ipv4.tcp_{w|r}mem[1]/2   send: 8k  recv: 64k
>>>>>         see above: 			    send: 16k recv: 64k
>>>>>> after 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>>>>> real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
>>>>>>
>>>>>> after net/smc: Fix expected buffersizes and sync logic
>>>>>> real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
>>>>>>
>>>>>> after net/smc: Unbind smc control from tcp control
>>>>>> real_buf=SMC_*BUF_INIT_SIZE   send: 16k (16384) recv: 64k (65536)
>>>>>>
>>>>>> If my understanding is correct, then I nack this.
>>>>>> Defaults should be restored to the values before 0227f058aa29.
>>>>>> Otherwise users will notice a change in memory usage that needs to
>>>>>> be avoided or announced more explicitely. (and don't change them twice)
>>>>> See above, I stand corrected. However this patch fixes/restores the buffersize
>>>>> for setsockopt, but not for the default recieve path.
>>>>> Could you please clarify that in the title and description?
>>>>>
>>>>
>>>> I am trying to keep the commit title as crisp as possible while providing
>>>> enough information and set the context in the commit message:
>>>>
>>>> "The fixed commit changed the expected behavior of buffersizes set by the
>>>> user using the setsockopt mechanism."
>>>>
>>>>   + There is now a whole e-mail thread to consult in case of any further
>>>> questions.
>>>>
>>>> Thank you for your comments
>>>> - Jan
>>>>
>>>>> Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
>>>>>>>> - Jan
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Cheers,
>>>>>>>>> Tony Lu
>>>>>>>>>
>>>>>>>>>> Since we now have our own sysctl, which is also exposed to the user,
>>>>>>>>>> we should sync the logic in a way that both values are the real value
>>>>>>>>>> used by our code and shown by smc_stats. To achieve this this patch
>>>>>>>>>> changes the behavior to:
>>>>>>>>>>
>>>>>>>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>>>>>>>>>> sockopt  = the setsockopt mechanism
>>>>>>>>>> val      = the value assigned in default or via setsockopt
>>>>>>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>>>>>>
>>>>>>>>>>      exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
>>>>>>>>>>                |                  |                 |
>>>>>>>>>> +---------+ |                  | +-------------+ | +-----------------+
>>>>>>>>>> | default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
>>>>>>>>>> +---------+ |                  | +-------------+ | +-----------------+
>>>>>>>>>>                |                  |                 |    ^
>>>>>>>>>>                |                  |                 |    |
>>>>>>>>>> +---------+ | +--------------+ |                 |    |
>>>>>>>>>> | sockopt |---| sk_buf=val*2 |------------------------|
>>>>>>>>>> +---------+ | +--------------+ |                 |
>>>>>>>>>>                |                  |                 |
>>>>>>>>>>
>>>>>>>>>> This way both paths follow the same pattern and the expected behavior
>>>>>>>>>> is re-established.
>>>>>>>>>>
>>>>>>>>>> Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>>>>>>>>> Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
>>>>>>>>>> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
>>>>>>>>>> ---
>>>>>>>>>>     net/smc/af_smc.c   | 9 +++++++--
>>>>>>>>>>     net/smc/smc_core.c | 8 ++++----
>>>>>>>>>>     2 files changed, 11 insertions(+), 6 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
>>>>>>>>>> index 036532cf39aa..a8c84e7bac99 100644
>>>>>>>>>> --- a/net/smc/af_smc.c
>>>>>>>>>> +++ b/net/smc/af_smc.c
>>>>>>>>>> @@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
>>>>>>>>>>     static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>>>>>>>>>     				   int protocol)
>>>>>>>>>>     {
>>>>>>>>>> +	int buffersize_without_overhead;
>>>>>>>>>>     	struct smc_sock *smc;
>>>>>>>>>>     	struct proto *prot;
>>>>>>>>>>     	struct sock *sk;
>>>>>>>>>> @@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>>>>>>>>>     	sk->sk_state = SMC_INIT;
>>>>>>>>>>     	sk->sk_destruct = smc_destruct;
>>>>>>>>>>     	sk->sk_protocol = protocol;
>>>>>>>>>> -	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
>>>>>>>>>> -	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
>>>>>>>>>> +	buffersize_without_overhead =
>>>>>>>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
>>>>>>>>>> +	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
>>>>>>>>>> +	buffersize_without_overhead =
>>>>>>>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
>>>>>>>>>> +	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
>>>>>>>>>>     	smc = smc_sk(sk);
>>>>>>>>>>     	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
>>>>>>>>>>     	INIT_WORK(&smc->connect_work, smc_connect_work);
>>>>>>>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>>>>>>>>> index 00fb352c2765..36850a2ae167 100644
>>>>>>>>>> --- a/net/smc/smc_core.c
>>>>>>>>>> +++ b/net/smc/smc_core.c
>>>>>>>>>> @@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>>>>>>     	if (is_rmb)
>>>>>>>>>>     		/* use socket recv buffer size (w/o overhead) as start value */
>>>>>>>>>> -		sk_buf_size = smc->sk.sk_rcvbuf;
>>>>>>>>>> +		sk_buf_size = smc->sk.sk_rcvbuf / 2;
>>>>>>>>>>     	else
>>>>>>>>>>     		/* use socket send buffer size (w/o overhead) as start value */
>>>>>>>>>> -		sk_buf_size = smc->sk.sk_sndbuf;
>>>>>>>>>> +		sk_buf_size = smc->sk.sk_sndbuf / 2;
>>>>>>>>>>     	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
>>>>>>>>>>     	     bufsize_short >= 0; bufsize_short--) {
>>>>>>>>>> @@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>>>>>>     	if (is_rmb) {
>>>>>>>>>>     		conn->rmb_desc = buf_desc;
>>>>>>>>>>     		conn->rmbe_size_short = bufsize_short;
>>>>>>>>>> -		smc->sk.sk_rcvbuf = bufsize;
>>>>>>>>>> +		smc->sk.sk_rcvbuf = bufsize * 2;
>>>>>>>>>>     		atomic_set(&conn->bytes_to_rcv, 0);
>>>>>>>>>>     		conn->rmbe_update_limit =
>>>>>>>>>>     			smc_rmb_wnd_update_limit(buf_desc->len);
>>>>>>>>>> @@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>>>>>>     			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
>>>>>>>>>>     	} else {
>>>>>>>>>>     		conn->sndbuf_desc = buf_desc;
>>>>>>>>>> -		smc->sk.sk_sndbuf = bufsize;
>>>>>>>>>> +		smc->sk.sk_sndbuf = bufsize * 2;
>>>>>>>>>>     		atomic_set(&conn->sndbuf_space, bufsize);
>>>>>>>>>>     	}
>>>>>>>>>>     	return 0;
>>>>>>>>>> -- 
>>>>>>>>>> 2.34.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net] net/smc: Fix expected buffersizes and sync logic
  2022-11-28  8:32                   ` Wenjia Zhang
@ 2022-11-28 12:24                     ` Tony Lu
  0 siblings, 0 replies; 13+ messages in thread
From: Tony Lu @ 2022-11-28 12:24 UTC (permalink / raw)
  To: Wenjia Zhang
  Cc: Alexandra Winter, Jan Karcher, David Miller, Jakub Kicinski,
	netdev, linux-s390, Heiko Carstens, Thorsten Winkler,
	Stefan Raspl, Karsten Graul

On Mon, Nov 28, 2022 at 09:32:45AM +0100, Wenjia Zhang wrote:
> 
> 
> On 28.11.22 05:33, Tony Lu wrote:
> > On Fri, Nov 25, 2022 at 11:59:46AM +0100, Alexandra Winter wrote:
> > > 
> > > 
> > > On 25.11.22 08:05, Tony Lu wrote:
> > > > On Fri, Nov 25, 2022 at 07:15:33AM +0100, Jan Karcher wrote:
> > > > > 
> > > > > 
> > > > > On 24/11/2022 15:07, Alexandra Winter wrote:
> > > > > > 
> > > > > > 
> > > > > > On 24.11.22 14:00, Alexandra Winter wrote:
> > > > > > > 
> > > > > > > 
> > > > > [ ... ]>>>>> On Wed, Nov 23, 2022 at 11:49:07AM +0100, Jan Karcher wrote:
> > > > > > > > > > > The fixed commit changed the expected behavior of buffersizes
> > > > > > > > > > > set by the user using the setsockopt mechanism.
> > > > > > > > > > > Before the fixed patch the logic for determining the buffersizes used
> > > > > > > > > > > was the following:
> > > > > > > > > > > 
> > > > > > > > > > > default  = net.ipv4.tcp_{w|r}mem[1]
> > > > > > Jan, you explained to me: "the minima is 16Kib. This is enforced in smc_compress_bufsize
> > > > > > which would move any value <= 16Kib into bucket 0 - which is 16KiB "
> > > > > > net.ipv4.tcp_wmem[1] defaults to 8Kib. So in the default case (unchanged net.ipv4.tcp_wmem[1])
> > > > > > the default for the send path is not net.ipv4.tcp_wmem[1]. Should be clarified here.
> > > > > 
> > > > > The default value is still set to the net.ipv4.tcp_{w|r}mem[1]. This is a
> > > > > *very* top level overview about what is happening and *not* a documentation.
> > > > > I don't really want to explain the full code flow here.
> > > > > 
> > > > > What we still should do - as Tony aggreed on - is documenting the SMC
> > > > > behavior. This is a follow up on my list.
> > > > 
> > > > Hello Jan and Alexandra,
> > > > 
> > > > It looks like the misalignment of information is causing some trouble,
> > > > which is introduced by my patch. Maybe we could have an off-maillist and
> > > > online meeting to discussion?
> > > > 
> > > > We have some progress updates of scalability, and we are really like the
> > > > extension of SMC-D. Also we have some ideas for SMC, in case of
> > > > misalignment of information, we'd like to put them on the table and
> > > > discuss them earlier. Maybe an online meeting is an efficient way. What
> > > > do you think?
> > > > 
> > > > If possible, I would prepared the meetings and topics and send them to
> > > > everyone first.
> > > > 
> > > > Cheers,
> > > > Tony Lu
> > > > 
> > > 
> > > Thanks a lot for your constructive proposals Tony. Yes, we should have a discussion off-mailinglist
> > > about future topics.
> > 
> > I will prepare the discussion off-maillinglist ASAP. The email will be
> > sent out when it's ready. And Jan, What about your opinion?
> > 
> > Cheers,
> > Tony Lu
> > 
> Hi Tony,
> 
> Sorry for the flurry we brought!
> 
> It's very nice to know that you got progress on the scalability.
> 
> Firstly the off-millinglist is a good idea! Let's know if yor're ready.
> 
> About the meetings I would ask for your understanding that I still can not
> give any guarantee. But I would let you know ASAP after I talk to our team.

Sure, looking forward to your reply.

Cheers,
Tony Lu

> 
> Best,
> Wenjia
> > > 
> > > My remaining concern for this fix is the default values (user does not use setsockopt, nor
> > > changes the new sysfs parameters, nor changes tcp defaults):
> > > > > > > before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> > > 	    send: 16k recv: 64k
> > > > > > > after net/smc: Fix expected buffersizes and sync logic   (this patch)
> > > > > > >        send: 16k recv: 128k
> > > 
> > > @Jan, as this is the only patch you want to send to net, please change the default size of
> > > the receive buffers back to 64k (I don't care how).
> > > 
> > > 
> > > > > 
> > > > > > > > > > > sockopt  = the setsockopt mechanism
> > > > > > > > > > > val      = the value assigned in default or via setsockopt
> > > > > > > > > > > sk_buf   = short for sk_{snd|rcv}buf
> > > > > > > > > > > real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> > > > > > > > > > > 
> > > > > > > > > > >      exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
> > > > > > > > > > >                |                  |                |
> > > > > > > > > > > +---------+ |                  | +------------+ | +-------------------+
> > > > > > > > > > > | default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
> > > > > > > > > > > +---------+ |                  | +------------+ | +-------------------+
> > > > > > > > > > >                |                  |                |    ^
> > > > > > > > > > >                |                  |                |    |
> > > > > > > > > > > +---------+ | +--------------+ |                |    |
> > > > > > > > > > > | sockopt |---| sk_buf=val*2 |-----------------------|
> > > > > > > > > > > +---------+ | +--------------+ |                |
> > > > > > > > > > >                |                  |                |
> > > > > > > > > > > 
> > > > > > > > > > > The fixed patch introduced a dedicated sysctl for smc
> > > > > > > > > > > and removed the /2 in smc_core.c resulting in the following flow:
> > > > > > > > > > > 
> > > > > > > > > > > default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
> > > > > > > > > > > sockopt  = the setsockopt mechanism
> > > > > > > > > > > val      = the value assigned in default or via setsockopt
> > > > > > > > > > > sk_buf   = short for sk_{snd|rcv}buf
> > > > > > > > > > > real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> > > > > > > > > > > 
> > > > > > > > > > >      exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
> > > > > > > > > > >                |                  |                |
> > > > > > > > > > > +---------+ |                  | +------------+ | +-----------------+
> > > > > > > > > > > | default |----------------------| sk_buf=val |---| real_buf=sk_buf |
> > > > > > > > > > > +---------+ |                  | +------------+ | +-----------------+
> > > > > > > > > > >                |                  |                |    ^
> > > > > > > > > > >                |                  |                |    |
> > > > > > > > > > > +---------+ | +--------------+ |                |    |
> > > > > > > > > > > | sockopt |---| sk_buf=val*2 |-----------------------|
> > > > > > > > > > > +---------+ | +--------------+ |                |
> > > > > > > > > > >                |                  |                |
> > > > > > > > > > > 
> > > > > > > > > > > This would result in double of memory used for existing configurations
> > > > > > > > > > > that are using setsockopt.
> > > > > > > > > > 
> > > > > > > > > > Firstly, thanks for your detailed diagrams :-)
> > > > > > > > > > 
> > > > > > > > > > And the original decision to use user-provided values rather than
> > > > > > > > > > value/2 to follow the instructions of the socket manual [1].
> > > > > > > > > > 
> > > > > > > > > >      SO_RCVBUF
> > > > > > > > > >             Sets or gets the maximum socket receive buffer in bytes.
> > > > > > > > > >             The kernel doubles this value (to allow space for
> > > > > > > > > >             bookkeeping overhead) when it is set using setsockopt(2),
> > > > > > > > > >             and this doubled value is returned by getsockopt(2).  The
> > > > > > > > > >             default value is set by the
> > > > > > > > > >             /proc/sys/net/core/rmem_default file, and the maximum
> > > > > > > > > >             allowed value is set by the /proc/sys/net/core/rmem_max
> > > > > > > > > >             file.  The minimum (doubled) value for this option is 256.
> > > > > > > > > > 
> > > > > > > > > > [1] https://man7.org/linux/man-pages/man7/socket.7.html
> > > > > > > > > > 
> > > > > > > > > > The user of SMC should know that setsockopt() with SO_{RCV|SND}BUF will
> > > > > > > > > 
> > > > > > > > > I totally agree that an educated user of SMC should know about that behavior
> > > > > > > > > if they decide to use it.
> > > > > > > > > We do provide our users preload libraries where they can pass preferred
> > > > > > > > > buffersizes via arguments and we handle the Sockopts for them.
> > > > > > > > > 
> > > > > > > > > > double the values in kernel, and getsockopt() will return the doubled
> > > > > > > > > > values. So that they should use half of the values which are passed to
> > > > > > > > > > setsockopt(). The original patch tries to make things easier in SMC and
> > > > > > > > > > let user-space to handle them following the socket manual.
> > > > > > > > > > 
> > > > > > > > > > > SMC historically decided to use the explicit value given by the user
> > > > > > > > > > > to allocate the memory. This is why we used the /2 in smc_core.c.
> > > > > > > > > > > That logic was not applied to the default value.
> > > > > > > > > > 
> > > > > > > > > > Yep, let back to the patch which introduced smc_{w|r}mem knobs, it's a
> > > > > > > > > > trade-off to follow original logic of SMC, or follow the socket manual.
> > > > > > > > > > We decides to follow the instruction of manuals in the end.
> > > > > > > > > 
> > > > > > > > > I understand the point. I spend a lot of time trying to decide what to do.
> > > > > > > > > 
> > > > > > > > > Since it was an intentional decision to not follow the general socket
> > > > > > > > > option, and we do not have anyone complaining we do not really have a reason
> > > > > > > > > to change it.
> > > > > > > > > Changing it means that users with existing configurations would have to
> > > > > > > > > change their configs on an update or suddenly expect double the memory
> > > > > > > > > consumption.
> > > > > > > > > That's why we in the end preffered to stay with the current logic.
> > > > > > > > 
> > > > > > > > I can't agree with you more with the points to follow the historic logic
> > > > > > > > and not break the user-space applications.
> > > > > > > > 
> > > > > > > > > I'm thinking that maybe - if we stay with the historic logic - we should
> > > > > > > > > document that desicion somewhere. So that in the future, if a user that
> > > > > > > > > expects the man page behavior, has a way to understand what SMC is doing.
> > > > > > > > > What do oyu think?
> > > > > > > > 
> > > > > > > > Yep, we _really_ need to document it if we change the convention.
> > > > > > > > Actually, I spent a lot of time to find the history about the logic of
> > > > > > > > buffer (/2 and *2) in SMC. So I'm really in favor of adding
> > > > > > > > documentation, at least code comments to help others to understand them.
> > > > > > > > 
> > > > > > > > Cheers,
> > > > > > > > Tony Lu
> > > > > > > Iiuc you are changing the default values in this a patch and your other patch:
> > > > > > > Default values for real_buf for send and receive:
> > > > > > > 
> > > > > > > before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> > > > > > >       real_buf=net.ipv4.tcp_{w|r}mem[1]/2   send: 8k  recv: 64k
> > > > > >         see above: 			    send: 16k recv: 64k
> > > > > > > after 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> > > > > > > real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
> > > > > > > 
> > > > > > > after net/smc: Fix expected buffersizes and sync logic
> > > > > > > real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
> > > > > > > 
> > > > > > > after net/smc: Unbind smc control from tcp control
> > > > > > > real_buf=SMC_*BUF_INIT_SIZE   send: 16k (16384) recv: 64k (65536)
> > > > > > > 
> > > > > > > If my understanding is correct, then I nack this.
> > > > > > > Defaults should be restored to the values before 0227f058aa29.
> > > > > > > Otherwise users will notice a change in memory usage that needs to
> > > > > > > be avoided or announced more explicitely. (and don't change them twice)
> > > > > > See above, I stand corrected. However this patch fixes/restores the buffersize
> > > > > > for setsockopt, but not for the default recieve path.
> > > > > > Could you please clarify that in the title and description?
> > > > > > 
> > > > > 
> > > > > I am trying to keep the commit title as crisp as possible while providing
> > > > > enough information and set the context in the commit message:
> > > > > 
> > > > > "The fixed commit changed the expected behavior of buffersizes set by the
> > > > > user using the setsockopt mechanism."
> > > > > 
> > > > >   + There is now a whole e-mail thread to consult in case of any further
> > > > > questions.
> > > > > 
> > > > > Thank you for your comments
> > > > > - Jan
> > > > > 
> > > > > > Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
> > > > > > > > > - Jan
> > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Cheers,
> > > > > > > > > > Tony Lu
> > > > > > > > > > 
> > > > > > > > > > > Since we now have our own sysctl, which is also exposed to the user,
> > > > > > > > > > > we should sync the logic in a way that both values are the real value
> > > > > > > > > > > used by our code and shown by smc_stats. To achieve this this patch
> > > > > > > > > > > changes the behavior to:
> > > > > > > > > > > 
> > > > > > > > > > > default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
> > > > > > > > > > > sockopt  = the setsockopt mechanism
> > > > > > > > > > > val      = the value assigned in default or via setsockopt
> > > > > > > > > > > sk_buf   = short for sk_{snd|rcv}buf
> > > > > > > > > > > real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
> > > > > > > > > > > 
> > > > > > > > > > >      exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
> > > > > > > > > > >                |                  |                 |
> > > > > > > > > > > +---------+ |                  | +-------------+ | +-----------------+
> > > > > > > > > > > | default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
> > > > > > > > > > > +---------+ |                  | +-------------+ | +-----------------+
> > > > > > > > > > >                |                  |                 |    ^
> > > > > > > > > > >                |                  |                 |    |
> > > > > > > > > > > +---------+ | +--------------+ |                 |    |
> > > > > > > > > > > | sockopt |---| sk_buf=val*2 |------------------------|
> > > > > > > > > > > +---------+ | +--------------+ |                 |
> > > > > > > > > > >                |                  |                 |
> > > > > > > > > > > 
> > > > > > > > > > > This way both paths follow the same pattern and the expected behavior
> > > > > > > > > > > is re-established.
> > > > > > > > > > > 
> > > > > > > > > > > Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> > > > > > > > > > > Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
> > > > > > > > > > > Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
> > > > > > > > > > > ---
> > > > > > > > > > >     net/smc/af_smc.c   | 9 +++++++--
> > > > > > > > > > >     net/smc/smc_core.c | 8 ++++----
> > > > > > > > > > >     2 files changed, 11 insertions(+), 6 deletions(-)
> > > > > > > > > > > 
> > > > > > > > > > > diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> > > > > > > > > > > index 036532cf39aa..a8c84e7bac99 100644
> > > > > > > > > > > --- a/net/smc/af_smc.c
> > > > > > > > > > > +++ b/net/smc/af_smc.c
> > > > > > > > > > > @@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
> > > > > > > > > > >     static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
> > > > > > > > > > >     				   int protocol)
> > > > > > > > > > >     {
> > > > > > > > > > > +	int buffersize_without_overhead;
> > > > > > > > > > >     	struct smc_sock *smc;
> > > > > > > > > > >     	struct proto *prot;
> > > > > > > > > > >     	struct sock *sk;
> > > > > > > > > > > @@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
> > > > > > > > > > >     	sk->sk_state = SMC_INIT;
> > > > > > > > > > >     	sk->sk_destruct = smc_destruct;
> > > > > > > > > > >     	sk->sk_protocol = protocol;
> > > > > > > > > > > -	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
> > > > > > > > > > > -	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
> > > > > > > > > > > +	buffersize_without_overhead =
> > > > > > > > > > > +		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
> > > > > > > > > > > +	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
> > > > > > > > > > > +	buffersize_without_overhead =
> > > > > > > > > > > +		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
> > > > > > > > > > > +	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
> > > > > > > > > > >     	smc = smc_sk(sk);
> > > > > > > > > > >     	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
> > > > > > > > > > >     	INIT_WORK(&smc->connect_work, smc_connect_work);
> > > > > > > > > > > diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> > > > > > > > > > > index 00fb352c2765..36850a2ae167 100644
> > > > > > > > > > > --- a/net/smc/smc_core.c
> > > > > > > > > > > +++ b/net/smc/smc_core.c
> > > > > > > > > > > @@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
> > > > > > > > > > >     	if (is_rmb)
> > > > > > > > > > >     		/* use socket recv buffer size (w/o overhead) as start value */
> > > > > > > > > > > -		sk_buf_size = smc->sk.sk_rcvbuf;
> > > > > > > > > > > +		sk_buf_size = smc->sk.sk_rcvbuf / 2;
> > > > > > > > > > >     	else
> > > > > > > > > > >     		/* use socket send buffer size (w/o overhead) as start value */
> > > > > > > > > > > -		sk_buf_size = smc->sk.sk_sndbuf;
> > > > > > > > > > > +		sk_buf_size = smc->sk.sk_sndbuf / 2;
> > > > > > > > > > >     	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
> > > > > > > > > > >     	     bufsize_short >= 0; bufsize_short--) {
> > > > > > > > > > > @@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
> > > > > > > > > > >     	if (is_rmb) {
> > > > > > > > > > >     		conn->rmb_desc = buf_desc;
> > > > > > > > > > >     		conn->rmbe_size_short = bufsize_short;
> > > > > > > > > > > -		smc->sk.sk_rcvbuf = bufsize;
> > > > > > > > > > > +		smc->sk.sk_rcvbuf = bufsize * 2;
> > > > > > > > > > >     		atomic_set(&conn->bytes_to_rcv, 0);
> > > > > > > > > > >     		conn->rmbe_update_limit =
> > > > > > > > > > >     			smc_rmb_wnd_update_limit(buf_desc->len);
> > > > > > > > > > > @@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
> > > > > > > > > > >     			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
> > > > > > > > > > >     	} else {
> > > > > > > > > > >     		conn->sndbuf_desc = buf_desc;
> > > > > > > > > > > -		smc->sk.sk_sndbuf = bufsize;
> > > > > > > > > > > +		smc->sk.sk_sndbuf = bufsize * 2;
> > > > > > > > > > >     		atomic_set(&conn->sndbuf_space, bufsize);
> > > > > > > > > > >     	}
> > > > > > > > > > >     	return 0;
> > > > > > > > > > > -- 
> > > > > > > > > > > 2.34.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net] net/smc: Fix expected buffersizes and sync logic
  2022-11-25 10:59               ` Alexandra Winter
  2022-11-28  4:33                 ` Tony Lu
@ 2022-12-02 13:10                 ` Jan Karcher
  1 sibling, 0 replies; 13+ messages in thread
From: Jan Karcher @ 2022-12-02 13:10 UTC (permalink / raw)
  To: Alexandra Winter, Tony Lu
  Cc: David Miller, Jakub Kicinski, netdev, linux-s390, Heiko Carstens,
	Wenjia Zhang, Thorsten Winkler, Stefan Raspl, Karsten Graul



On 25/11/2022 11:59, Alexandra Winter wrote:
> 
> 
> On 25.11.22 08:05, Tony Lu wrote:
>> On Fri, Nov 25, 2022 at 07:15:33AM +0100, Jan Karcher wrote:
>>>
>>>
>>> On 24/11/2022 15:07, Alexandra Winter wrote:
>>>>
>>>>
>>>> On 24.11.22 14:00, Alexandra Winter wrote:
>>>>>
>>>>>
>>> [ ... ]>>>>> On Wed, Nov 23, 2022 at 11:49:07AM +0100, Jan Karcher wrote:
>>>>>>>>> The fixed commit changed the expected behavior of buffersizes
>>>>>>>>> set by the user using the setsockopt mechanism.
>>>>>>>>> Before the fixed patch the logic for determining the buffersizes used
>>>>>>>>> was the following:
>>>>>>>>>
>>>>>>>>> default  = net.ipv4.tcp_{w|r}mem[1]
>>>> Jan, you explained to me: "the minima is 16Kib. This is enforced in smc_compress_bufsize
>>>> which would move any value <= 16Kib into bucket 0 - which is 16KiB "
>>>> net.ipv4.tcp_wmem[1] defaults to 8Kib. So in the default case (unchanged net.ipv4.tcp_wmem[1])
>>>> the default for the send path is not net.ipv4.tcp_wmem[1]. Should be clarified here.
>>>
>>> The default value is still set to the net.ipv4.tcp_{w|r}mem[1]. This is a
>>> *very* top level overview about what is happening and *not* a documentation.
>>> I don't really want to explain the full code flow here.
>>>
>>> What we still should do - as Tony aggreed on - is documenting the SMC
>>> behavior. This is a follow up on my list.
>>
>> Hello Jan and Alexandra,
>>
>> It looks like the misalignment of information is causing some trouble,
>> which is introduced by my patch. Maybe we could have an off-maillist and
>> online meeting to discussion?
>>
>> We have some progress updates of scalability, and we are really like the
>> extension of SMC-D. Also we have some ideas for SMC, in case of
>> misalignment of information, we'd like to put them on the table and
>> discuss them earlier. Maybe an online meeting is an efficient way. What
>> do you think?
>>
>> If possible, I would prepared the meetings and topics and send them to
>> everyone first.
>>
>> Cheers,
>> Tony Lu
>>

Hello everyone,

Tony +1. I think working closer together is something both parties can 
benefit of.

> 
> Thanks a lot for your constructive proposals Tony. Yes, we should have a discussion off-mailinglist
> about future topics.
> 
> My remaining concern for this fix is the default values (user does not use setsockopt, nor
> changes the new sysfs parameters, nor changes tcp defaults):
>>>>> before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
> 	    send: 16k recv: 64k
>>>>> after net/smc: Fix expected buffersizes and sync logic   (this patch)
>>>>>        send: 16k recv: 128k
> 
> @Jan, as this is the only patch you want to send to net, please change the default size of
> the receive buffers back to 64k (I don't care how).
> 

Thanks for your comments, I've spent some time reading and re-evaluating.

The concern that Alexandra mentioned (recv 64KiB vs recv 128KiB) is a 
valid point and leaves the question if I want to, and if yes, how I want 
to address the concern.
I sure want to address it, so I did revisit how to resolve the concern.
My way to address this concern is rather easy since I based my actions 
on one wrong assumption.
The reason I split the logic can be found here [1]:

      I decided to split those two separate in favor of the user. While
      the fix corrects specific behavior this patch does provides a new
      functionality and could be used without the fixed patch.

Which is simply not true since the patch [1] still requires the fixing 
patch. This was the misassumption I made and caused all the confusion.
There is going to follow a v2 with some changes (the wrong formatting) 
and the v2 is going to be the sum of both (this and [1]) patches. Since 
the second [1] patch sets the initial value of the smc sysctl to 16KiB 
send and 64KiB recv buffer size the fix is complete and all concerns in 
this regard should be addressed.

As soon as the v2 is out I'm going to update the other patch[1] one last 
time pointing onto the v2.

If you have any objections please let me know.

Thank you
  - Jan

[1] 
https://lore.kernel.org/netdev/0d22034d-28cd-7721-e210-651750549a1c@linux.ibm.com/

> 
>>>
>>>>>>>>> sockopt  = the setsockopt mechanism
>>>>>>>>> val      = the value assigned in default or via setsockopt
>>>>>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>>>>>
>>>>>>>>>      exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>>>>>>>>                |                  |                |
>>>>>>>>> +---------+ |                  | +------------+ | +-------------------+
>>>>>>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf/2 |
>>>>>>>>> +---------+ |                  | +------------+ | +-------------------+
>>>>>>>>>                |                  |                |    ^
>>>>>>>>>                |                  |                |    |
>>>>>>>>> +---------+ | +--------------+ |                |    |
>>>>>>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
>>>>>>>>> +---------+ | +--------------+ |                |
>>>>>>>>>                |                  |                |
>>>>>>>>>
>>>>>>>>> The fixed patch introduced a dedicated sysctl for smc
>>>>>>>>> and removed the /2 in smc_core.c resulting in the following flow:
>>>>>>>>>
>>>>>>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>>>>>>>>> sockopt  = the setsockopt mechanism
>>>>>>>>> val      = the value assigned in default or via setsockopt
>>>>>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>>>>>
>>>>>>>>>      exposed   | net/core/sock.c  |    af_smc.c    |  smc_core.c
>>>>>>>>>                |                  |                |
>>>>>>>>> +---------+ |                  | +------------+ | +-----------------+
>>>>>>>>> | default |----------------------| sk_buf=val |---| real_buf=sk_buf |
>>>>>>>>> +---------+ |                  | +------------+ | +-----------------+
>>>>>>>>>                |                  |                |    ^
>>>>>>>>>                |                  |                |    |
>>>>>>>>> +---------+ | +--------------+ |                |    |
>>>>>>>>> | sockopt |---| sk_buf=val*2 |-----------------------|
>>>>>>>>> +---------+ | +--------------+ |                |
>>>>>>>>>                |                  |                |
>>>>>>>>>
>>>>>>>>> This would result in double of memory used for existing configurations
>>>>>>>>> that are using setsockopt.
>>>>>>>>
>>>>>>>> Firstly, thanks for your detailed diagrams :-)
>>>>>>>>
>>>>>>>> And the original decision to use user-provided values rather than
>>>>>>>> value/2 to follow the instructions of the socket manual [1].
>>>>>>>>
>>>>>>>>      SO_RCVBUF
>>>>>>>>             Sets or gets the maximum socket receive buffer in bytes.
>>>>>>>>             The kernel doubles this value (to allow space for
>>>>>>>>             bookkeeping overhead) when it is set using setsockopt(2),
>>>>>>>>             and this doubled value is returned by getsockopt(2).  The
>>>>>>>>             default value is set by the
>>>>>>>>             /proc/sys/net/core/rmem_default file, and the maximum
>>>>>>>>             allowed value is set by the /proc/sys/net/core/rmem_max
>>>>>>>>             file.  The minimum (doubled) value for this option is 256.
>>>>>>>>
>>>>>>>> [1] https://man7.org/linux/man-pages/man7/socket.7.html
>>>>>>>>
>>>>>>>> The user of SMC should know that setsockopt() with SO_{RCV|SND}BUF will
>>>>>>>
>>>>>>> I totally agree that an educated user of SMC should know about that behavior
>>>>>>> if they decide to use it.
>>>>>>> We do provide our users preload libraries where they can pass preferred
>>>>>>> buffersizes via arguments and we handle the Sockopts for them.
>>>>>>>
>>>>>>>> double the values in kernel, and getsockopt() will return the doubled
>>>>>>>> values. So that they should use half of the values which are passed to
>>>>>>>> setsockopt(). The original patch tries to make things easier in SMC and
>>>>>>>> let user-space to handle them following the socket manual.
>>>>>>>>
>>>>>>>>> SMC historically decided to use the explicit value given by the user
>>>>>>>>> to allocate the memory. This is why we used the /2 in smc_core.c.
>>>>>>>>> That logic was not applied to the default value.
>>>>>>>>
>>>>>>>> Yep, let back to the patch which introduced smc_{w|r}mem knobs, it's a
>>>>>>>> trade-off to follow original logic of SMC, or follow the socket manual.
>>>>>>>> We decides to follow the instruction of manuals in the end.
>>>>>>>
>>>>>>> I understand the point. I spend a lot of time trying to decide what to do.
>>>>>>>
>>>>>>> Since it was an intentional decision to not follow the general socket
>>>>>>> option, and we do not have anyone complaining we do not really have a reason
>>>>>>> to change it.
>>>>>>> Changing it means that users with existing configurations would have to
>>>>>>> change their configs on an update or suddenly expect double the memory
>>>>>>> consumption.
>>>>>>> That's why we in the end preffered to stay with the current logic.
>>>>>>
>>>>>> I can't agree with you more with the points to follow the historic logic
>>>>>> and not break the user-space applications.
>>>>>>
>>>>>>> I'm thinking that maybe - if we stay with the historic logic - we should
>>>>>>> document that desicion somewhere. So that in the future, if a user that
>>>>>>> expects the man page behavior, has a way to understand what SMC is doing.
>>>>>>> What do oyu think?
>>>>>>
>>>>>> Yep, we _really_ need to document it if we change the convention.
>>>>>> Actually, I spent a lot of time to find the history about the logic of
>>>>>> buffer (/2 and *2) in SMC. So I'm really in favor of adding
>>>>>> documentation, at least code comments to help others to understand them.
>>>>>>
>>>>>> Cheers,
>>>>>> Tony Lu
>>>>> Iiuc you are changing the default values in this a patch and your other patch:
>>>>> Default values for real_buf for send and receive:
>>>>>
>>>>> before 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>>>>       real_buf=net.ipv4.tcp_{w|r}mem[1]/2   send: 8k  recv: 64k
>>>>         see above: 			    send: 16k recv: 64k
>>>>> after 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>>>> real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
>>>>>
>>>>> after net/smc: Fix expected buffersizes and sync logic
>>>>> real_buf=net.ipv4.tcp_{w|r}mem[1]   send: 16k (16*1024) recv: 128k (131072)
>>>>>
>>>>> after net/smc: Unbind smc control from tcp control
>>>>> real_buf=SMC_*BUF_INIT_SIZE   send: 16k (16384) recv: 64k (65536)
>>>>>
>>>>> If my understanding is correct, then I nack this.
>>>>> Defaults should be restored to the values before 0227f058aa29.
>>>>> Otherwise users will notice a change in memory usage that needs to
>>>>> be avoided or announced more explicitely. (and don't change them twice)
>>>> See above, I stand corrected. However this patch fixes/restores the buffersize
>>>> for setsockopt, but not for the default recieve path.
>>>> Could you please clarify that in the title and description?
>>>>
>>>
>>> I am trying to keep the commit title as crisp as possible while providing
>>> enough information and set the context in the commit message:
>>>
>>> "The fixed commit changed the expected behavior of buffersizes set by the
>>> user using the setsockopt mechanism."
>>>
>>>   + There is now a whole e-mail thread to consult in case of any further
>>> questions.
>>>
>>> Thank you for your comments
>>> - Jan
>>>
>>>> Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
>>>>>>> - Jan
>>>>>>>
>>>>>>>>
>>>>>>>> Cheers,
>>>>>>>> Tony Lu
>>>>>>>>
>>>>>>>>> Since we now have our own sysctl, which is also exposed to the user,
>>>>>>>>> we should sync the logic in a way that both values are the real value
>>>>>>>>> used by our code and shown by smc_stats. To achieve this this patch
>>>>>>>>> changes the behavior to:
>>>>>>>>>
>>>>>>>>> default  = net.smc.{w|r}mem (which defaults to net.ipv4.tcp_{w|r}mem[1])
>>>>>>>>> sockopt  = the setsockopt mechanism
>>>>>>>>> val      = the value assigned in default or via setsockopt
>>>>>>>>> sk_buf   = short for sk_{snd|rcv}buf
>>>>>>>>> real_buf = the real size of the buffer (sk_buf_size in __smc_buf_create)
>>>>>>>>>
>>>>>>>>>      exposed   | net/core/sock.c  |    af_smc.c     |  smc_core.c
>>>>>>>>>                |                  |                 |
>>>>>>>>> +---------+ |                  | +-------------+ | +-----------------+
>>>>>>>>> | default |----------------------| sk_buf=val*2|---|real_buf=sk_buf/2|
>>>>>>>>> +---------+ |                  | +-------------+ | +-----------------+
>>>>>>>>>                |                  |                 |    ^
>>>>>>>>>                |                  |                 |    |
>>>>>>>>> +---------+ | +--------------+ |                 |    |
>>>>>>>>> | sockopt |---| sk_buf=val*2 |------------------------|
>>>>>>>>> +---------+ | +--------------+ |                 |
>>>>>>>>>                |                  |                 |
>>>>>>>>>
>>>>>>>>> This way both paths follow the same pattern and the expected behavior
>>>>>>>>> is re-established.
>>>>>>>>>
>>>>>>>>> Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
>>>>>>>>> Signed-off-by: Jan Karcher <jaka@linux.ibm.com>
>>>>>>>>> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
>>>>>>>>> ---
>>>>>>>>>     net/smc/af_smc.c   | 9 +++++++--
>>>>>>>>>     net/smc/smc_core.c | 8 ++++----
>>>>>>>>>     2 files changed, 11 insertions(+), 6 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
>>>>>>>>> index 036532cf39aa..a8c84e7bac99 100644
>>>>>>>>> --- a/net/smc/af_smc.c
>>>>>>>>> +++ b/net/smc/af_smc.c
>>>>>>>>> @@ -366,6 +366,7 @@ static void smc_destruct(struct sock *sk)
>>>>>>>>>     static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>>>>>>>>     				   int protocol)
>>>>>>>>>     {
>>>>>>>>> +	int buffersize_without_overhead;
>>>>>>>>>     	struct smc_sock *smc;
>>>>>>>>>     	struct proto *prot;
>>>>>>>>>     	struct sock *sk;
>>>>>>>>> @@ -379,8 +380,12 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
>>>>>>>>>     	sk->sk_state = SMC_INIT;
>>>>>>>>>     	sk->sk_destruct = smc_destruct;
>>>>>>>>>     	sk->sk_protocol = protocol;
>>>>>>>>> -	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
>>>>>>>>> -	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
>>>>>>>>> +	buffersize_without_overhead =
>>>>>>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_wmem), INT_MAX / 2);
>>>>>>>>> +	WRITE_ONCE(sk->sk_sndbuf, buffersize_without_overhead * 2);
>>>>>>>>> +	buffersize_without_overhead =
>>>>>>>>> +		min_t(int, READ_ONCE(net->smc.sysctl_rmem), INT_MAX / 2);
>>>>>>>>> +	WRITE_ONCE(sk->sk_rcvbuf, buffersize_without_overhead * 2);
>>>>>>>>>     	smc = smc_sk(sk);
>>>>>>>>>     	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
>>>>>>>>>     	INIT_WORK(&smc->connect_work, smc_connect_work);
>>>>>>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>>>>>>>> index 00fb352c2765..36850a2ae167 100644
>>>>>>>>> --- a/net/smc/smc_core.c
>>>>>>>>> +++ b/net/smc/smc_core.c
>>>>>>>>> @@ -2314,10 +2314,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>>>>>     	if (is_rmb)
>>>>>>>>>     		/* use socket recv buffer size (w/o overhead) as start value */
>>>>>>>>> -		sk_buf_size = smc->sk.sk_rcvbuf;
>>>>>>>>> +		sk_buf_size = smc->sk.sk_rcvbuf / 2;
>>>>>>>>>     	else
>>>>>>>>>     		/* use socket send buffer size (w/o overhead) as start value */
>>>>>>>>> -		sk_buf_size = smc->sk.sk_sndbuf;
>>>>>>>>> +		sk_buf_size = smc->sk.sk_sndbuf / 2;
>>>>>>>>>     	for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
>>>>>>>>>     	     bufsize_short >= 0; bufsize_short--) {
>>>>>>>>> @@ -2376,7 +2376,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>>>>>     	if (is_rmb) {
>>>>>>>>>     		conn->rmb_desc = buf_desc;
>>>>>>>>>     		conn->rmbe_size_short = bufsize_short;
>>>>>>>>> -		smc->sk.sk_rcvbuf = bufsize;
>>>>>>>>> +		smc->sk.sk_rcvbuf = bufsize * 2;
>>>>>>>>>     		atomic_set(&conn->bytes_to_rcv, 0);
>>>>>>>>>     		conn->rmbe_update_limit =
>>>>>>>>>     			smc_rmb_wnd_update_limit(buf_desc->len);
>>>>>>>>> @@ -2384,7 +2384,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
>>>>>>>>>     			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
>>>>>>>>>     	} else {
>>>>>>>>>     		conn->sndbuf_desc = buf_desc;
>>>>>>>>> -		smc->sk.sk_sndbuf = bufsize;
>>>>>>>>> +		smc->sk.sk_sndbuf = bufsize * 2;
>>>>>>>>>     		atomic_set(&conn->sndbuf_space, bufsize);
>>>>>>>>>     	}
>>>>>>>>>     	return 0;
>>>>>>>>> -- 
>>>>>>>>> 2.34.1

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-12-02 13:10 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-23 10:49 [PATCH net] net/smc: Fix expected buffersizes and sync logic Jan Karcher
2022-11-23 11:53 ` Tony Lu
2022-11-23 13:13   ` Jan Karcher
2022-11-23 13:41     ` Tony Lu
2022-11-24 13:00       ` Alexandra Winter
2022-11-24 14:07         ` Alexandra Winter
2022-11-25  6:15           ` Jan Karcher
2022-11-25  7:05             ` Tony Lu
2022-11-25 10:59               ` Alexandra Winter
2022-11-28  4:33                 ` Tony Lu
2022-11-28  8:32                   ` Wenjia Zhang
2022-11-28 12:24                     ` Tony Lu
2022-12-02 13:10                 ` Jan Karcher

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.