Linux-Block Archive on lore.kernel.org
 help / color / Atom feed
* remove kernel_setsockopt and kernel_getsockopt
@ 2020-05-13  6:26 Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 01/33] net: add sock_set_reuseaddr Christoph Hellwig
                   ` (40 more replies)
  0 siblings, 41 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Hi Dave,

this series removes the kernel_setsockopt and kernel_getsockopt
functions, and instead switches their users to small functions that
implement setting (or in one case getting) a sockopt directly using
a normal kernel function call with type safety and all the other
benefits of not going through a fake uaccess.

In some cases these functions seem pretty heavy-handed, as they do
a lock_sock even for just setting a single variable, but this mirrors
the real setsockopt implementation.  On the other hand, a few kernel
drivers already just set the fields directly.

Nevertheless the diffstat looks quite promising:

 42 files changed, 721 insertions(+), 799 deletions(-)

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 01/33] net: add sock_set_reuseaddr
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 02/33] net: add sock_set_linger Christoph Hellwig
                   ` (39 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the SO_REUSEADDR sockopt from kernel space
without going through a fake uaccess.

For this the iscsi target now has to formally depend on inet to avoid
a mostly theoretical compile failure.  For actual operation it already
did depend on having ipv4 or ipv6 support.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/infiniband/sw/siw/siw_cm.c        | 18 +++++-------------
 drivers/nvme/target/tcp.c                 |  8 +-------
 drivers/target/iscsi/Kconfig              |  2 +-
 drivers/target/iscsi/iscsi_target_login.c |  9 +--------
 fs/dlm/lowcomms.c                         |  6 +-----
 include/net/sock.h                        |  1 +
 net/core/sock.c                           |  8 ++++++++
 7 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index 559e5fd3bad8b..6d7c8c933736c 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -1312,17 +1312,14 @@ static void siw_cm_llp_state_change(struct sock *sk)
 static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
 			      struct sockaddr *raddr)
 {
-	int rv, flags = 0, s_val = 1;
+	int rv, flags = 0;
 	size_t size = laddr->sa_family == AF_INET ?
 		sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
 
 	/*
 	 * Make address available again asap.
 	 */
-	rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val,
-			       sizeof(s_val));
-	if (rv < 0)
-		return rv;
+	sock_set_reuseaddr(s->sk, SK_CAN_REUSE);
 
 	rv = s->ops->bind(s, laddr, size);
 	if (rv < 0)
@@ -1781,7 +1778,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog)
 	struct siw_cep *cep = NULL;
 	struct siw_device *sdev = to_siw_dev(id->device);
 	int addr_family = id->local_addr.ss_family;
-	int rv = 0, s_val;
+	int rv = 0;
 
 	if (addr_family != AF_INET && addr_family != AF_INET6)
 		return -EAFNOSUPPORT;
@@ -1793,13 +1790,8 @@ int siw_create_listen(struct iw_cm_id *id, int backlog)
 	/*
 	 * Allow binding local port when still in TIME_WAIT from last close.
 	 */
-	s_val = 1;
-	rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val,
-			       sizeof(s_val));
-	if (rv) {
-		siw_dbg(id->device, "setsockopt error: %d\n", rv);
-		goto error;
-	}
+	sock_set_reuseaddr(s->sk, SK_CAN_REUSE);
+
 	if (addr_family == AF_INET) {
 		struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);
 
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index f0da04e960f40..791aa32beeb98 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1632,6 +1632,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport)
 	port->sock->sk->sk_user_data = port;
 	port->data_ready = port->sock->sk->sk_data_ready;
 	port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
+	sock_set_reuseaddr(port->sock->sk, SK_CAN_REUSE);
 
 	opt = 1;
 	ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
@@ -1641,13 +1642,6 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport)
 		goto err_sock;
 	}
 
-	ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
-			(char *)&opt, sizeof(opt));
-	if (ret) {
-		pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
-		goto err_sock;
-	}
-
 	if (so_priority > 0) {
 		ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY,
 				(char *)&so_priority, sizeof(so_priority));
diff --git a/drivers/target/iscsi/Kconfig b/drivers/target/iscsi/Kconfig
index 1f93ea3813536..922484ea4e304 100644
--- a/drivers/target/iscsi/Kconfig
+++ b/drivers/target/iscsi/Kconfig
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config ISCSI_TARGET
 	tristate "Linux-iSCSI.org iSCSI Target Mode Stack"
-	depends on NET
+	depends on INET
 	select CRYPTO
 	select CRYPTO_CRC32C
 	select CRYPTO_CRC32C_INTEL if X86
diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c
index 731ee67fe914b..7da59ece3eb99 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -909,14 +909,7 @@ int iscsit_setup_np(
 		}
 	}
 
-	/* FIXME: Someone please explain why this is endian-safe */
-	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-			(char *)&opt, sizeof(opt));
-	if (ret < 0) {
-		pr_err("kernel_setsockopt() for SO_REUSEADDR"
-			" failed\n");
-		goto fail;
-	}
+	sock_set_reuseaddr(sock->sk, SK_CAN_REUSE);
 
 	ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND,
 			(char *)&opt, sizeof(opt));
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index cdfaf4f0e11a0..48e7ba796c6fb 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1244,12 +1244,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
 	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
 			  sizeof(one));
 
-	result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-				   (char *)&one, sizeof(one));
+	sock_set_reuseaddr(sock->sk, SK_CAN_REUSE);
 
-	if (result < 0) {
-		log_print("Failed to set SO_REUSEADDR on socket: %d", result);
-	}
 	write_lock_bh(&sock->sk->sk_callback_lock);
 	sock->sk->sk_user_data = con;
 	save_listen_callbacks(sock);
diff --git a/include/net/sock.h b/include/net/sock.h
index 3e8c6d4b4b59f..e801a147ad746 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2687,5 +2687,6 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
 }
 
 void sock_def_readable(struct sock *sk);
+void sock_set_reuseaddr(struct sock *sk, unsigned char reuse);
 
 #endif	/* _SOCK_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index fd85e651ce284..ff4faa3e68ac4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -712,6 +712,14 @@ bool sk_mc_loop(struct sock *sk)
 }
 EXPORT_SYMBOL(sk_mc_loop);
 
+void sock_set_reuseaddr(struct sock *sk, unsigned char reuse)
+{
+	lock_sock(sk);
+	sk->sk_reuse = reuse;
+	release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseaddr);
+
 /*
  *	This is meant for all protocols to use and covers goings on
  *	at the socket level. Everything here is generic.
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 02/33] net: add sock_set_linger
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 01/33] net: add sock_set_reuseaddr Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 03/33] net: add sock_set_priority Christoph Hellwig
                   ` (38 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the SO_LINGER sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c   |  9 +--------
 drivers/nvme/target/tcp.c |  6 +-----
 include/net/sock.h        |  1 +
 net/core/sock.c           | 36 +++++++++++++++++++++++++-----------
 net/rds/tcp_listen.c      |  8 +-------
 net/sunrpc/svcsock.c      | 12 ++----------
 6 files changed, 31 insertions(+), 41 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index c15a92163c1f7..5cacb61c73229 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1313,7 +1313,6 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 {
 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
 	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
-	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
 	int ret, opt, rcv_pdu_size;
 
 	queue->ctrl = ctrl;
@@ -1361,13 +1360,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	 * close. This is done to prevent stale data from being sent should
 	 * the network connection be restored before TCP times out.
 	 */
-	ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
-			(char *)&sol, sizeof(sol));
-	if (ret) {
-		dev_err(nctrl->device,
-			"failed to set SO_LINGER sock opt %d\n", ret);
-		goto err_sock;
-	}
+	sock_set_linger(queue->sock->sk, true, 0);
 
 	if (so_priority > 0) {
 		ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY,
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 791aa32beeb98..87aba417189d2 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1429,7 +1429,6 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 {
 	struct socket *sock = queue->sock;
 	struct inet_sock *inet = inet_sk(sock->sk);
-	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
 	int ret;
 
 	ret = kernel_getsockname(sock,
@@ -1447,10 +1446,7 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 	 * close. This is done to prevent stale data from being sent should
 	 * the network connection be restored before TCP times out.
 	 */
-	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
-			(char *)&sol, sizeof(sol));
-	if (ret)
-		return ret;
+	sock_set_linger(sock->sk, true, 0);
 
 	if (so_priority > 0) {
 		ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY,
diff --git a/include/net/sock.h b/include/net/sock.h
index e801a147ad746..60890fb47fbc0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2688,5 +2688,6 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
 
 void sock_def_readable(struct sock *sk);
 void sock_set_reuseaddr(struct sock *sk, unsigned char reuse);
+void sock_set_linger(struct sock *sk, bool onoff, unsigned int linger);
 
 #endif	/* _SOCK_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index ff4faa3e68ac4..cbc5104ca3515 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -720,6 +720,30 @@ void sock_set_reuseaddr(struct sock *sk, unsigned char reuse)
 }
 EXPORT_SYMBOL(sock_set_reuseaddr);
 
+static void __sock_set_linger(struct sock *sk, bool onoff, unsigned int linger)
+{
+	if (!onoff) {
+		sock_reset_flag(sk, SOCK_LINGER);
+		return;
+	}
+
+#if (BITS_PER_LONG == 32)
+	if (linger >= MAX_SCHEDULE_TIMEOUT / HZ)
+		sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+	else
+#endif
+		sk->sk_lingertime = linger * HZ;
+	sock_set_flag(sk, SOCK_LINGER);
+}
+
+void sock_set_linger(struct sock *sk, bool onoff, unsigned int linger)
+{
+	lock_sock(sk);
+	__sock_set_linger(sk, onoff, linger);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_linger);
+
 /*
  *	This is meant for all protocols to use and covers goings on
  *	at the socket level. Everything here is generic.
@@ -886,17 +910,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			ret = -EFAULT;
 			break;
 		}
-		if (!ling.l_onoff)
-			sock_reset_flag(sk, SOCK_LINGER);
-		else {
-#if (BITS_PER_LONG == 32)
-			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
-				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
-			else
-#endif
-				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
-			sock_set_flag(sk, SOCK_LINGER);
-		}
+		__sock_set_linger(sk, ling.l_onoff, ling.l_linger);
 		break;
 
 	case SO_BSDCOMPAT:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 810a3a49e9474..96f7538e5fa8d 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -113,13 +113,7 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
 
 void rds_tcp_set_linger(struct socket *sock)
 {
-	struct linger no_linger = {
-		.l_onoff = 1,
-		.l_linger = 0,
-	};
-
-	kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
-			  (char *)&no_linger, sizeof(no_linger));
+	sock_set_linger(sock->sk, true, 0);
 }
 
 int rds_tcp_accept_one(struct socket *sock)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 023514e392b31..0f6b78d0e6170 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -323,17 +323,9 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt)
 
 static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt)
 {
-	struct svc_sock *svsk;
-	struct socket *sock;
-	struct linger no_linger = {
-		.l_onoff = 1,
-		.l_linger = 0,
-	};
+	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
 
-	svsk = container_of(xprt, struct svc_sock, sk_xprt);
-	sock = svsk->sk_sock;
-	kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
-			  (char *)&no_linger, sizeof(no_linger));
+	sock_set_linger(svsk->sk_sock->sk, true, 0);
 }
 
 /*
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 03/33] net: add sock_set_priority
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 01/33] net: add sock_set_reuseaddr Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 02/33] net: add sock_set_linger Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 04/33] net: add sock_set_sndtimeo Christoph Hellwig
                   ` (37 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the SO_PRIORITY sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c   | 12 ++----------
 drivers/nvme/target/tcp.c | 18 ++++--------------
 include/net/sock.h        |  1 +
 net/core/sock.c           |  8 ++++++++
 4 files changed, 15 insertions(+), 24 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 5cacb61c73229..cd6a8fc14a139 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1362,16 +1362,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	 */
 	sock_set_linger(queue->sock->sk, true, 0);
 
-	if (so_priority > 0) {
-		ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY,
-				(char *)&so_priority, sizeof(so_priority));
-		if (ret) {
-			dev_err(ctrl->ctrl.device,
-				"failed to set SO_PRIORITY sock opt, ret %d\n",
-				ret);
-			goto err_sock;
-		}
-	}
+	if (so_priority > 0)
+		sock_set_priority(queue->sock->sk, so_priority);
 
 	/* Set socket type of service */
 	if (nctrl->opts->tos >= 0) {
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 87aba417189d2..778c1ce3137b7 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1448,12 +1448,8 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 	 */
 	sock_set_linger(sock->sk, true, 0);
 
-	if (so_priority > 0) {
-		ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY,
-				(char *)&so_priority, sizeof(so_priority));
-		if (ret)
-			return ret;
-	}
+	if (so_priority > 0)
+		sock_set_priority(sock->sk, so_priority);
 
 	/* Set socket type of service */
 	if (inet->rcv_tos > 0) {
@@ -1638,14 +1634,8 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport)
 		goto err_sock;
 	}
 
-	if (so_priority > 0) {
-		ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY,
-				(char *)&so_priority, sizeof(so_priority));
-		if (ret) {
-			pr_err("failed to set SO_PRIORITY sock opt %d\n", ret);
-			goto err_sock;
-		}
-	}
+	if (so_priority > 0)
+		sock_set_priority(port->sock->sk, so_priority);
 
 	ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
 			sizeof(port->addr));
diff --git a/include/net/sock.h b/include/net/sock.h
index 60890fb47fbc0..cce11782dc295 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2689,5 +2689,6 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
 void sock_def_readable(struct sock *sk);
 void sock_set_reuseaddr(struct sock *sk, unsigned char reuse);
 void sock_set_linger(struct sock *sk, bool onoff, unsigned int linger);
+void sock_set_priority(struct sock *sk, u32 priority);
 
 #endif	/* _SOCK_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index cbc5104ca3515..e9f1e2247b004 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -744,6 +744,14 @@ void sock_set_linger(struct sock *sk, bool onoff, unsigned int linger)
 }
 EXPORT_SYMBOL(sock_set_linger);
 
+void sock_set_priority(struct sock *sk, u32 priority)
+{
+	lock_sock(sk);
+	sk->sk_priority = priority;
+	release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_priority);
+
 /*
  *	This is meant for all protocols to use and covers goings on
  *	at the socket level. Everything here is generic.
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 04/33] net: add sock_set_sndtimeo
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (2 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 03/33] net: add sock_set_priority Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 05/33] net: add sock_bindtoindex Christoph Hellwig
                   ` (36 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the SO_SNDTIMEO_NEW sockopt from kernel
space without going through a fake uaccess.  The interface is
simplified to only pass the seconds value, as that is the only
thing needed at the moment.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/dlm/lowcomms.c  |  8 ++------
 include/net/sock.h |  1 +
 net/core/sock.c    | 11 +++++++++++
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 48e7ba796c6fb..0c0a6413fdfcc 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1035,7 +1035,6 @@ static void sctp_connect_to_sock(struct connection *con)
 	int result;
 	int addr_len;
 	struct socket *sock;
-	struct __kernel_sock_timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 
 	if (con->nodeid == 0) {
 		log_print("attempt to connect sock 0 foiled");
@@ -1087,13 +1086,10 @@ static void sctp_connect_to_sock(struct connection *con)
 	 * since O_NONBLOCK argument in connect() function does not work here,
 	 * then, we should restore the default value of this attribute.
 	 */
-	kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv,
-			  sizeof(tv));
+	sock_set_sndtimeo(sock->sk, 5);
 	result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
 				   0);
-	memset(&tv, 0, sizeof(tv));
-	kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv,
-			  sizeof(tv));
+	sock_set_sndtimeo(sock->sk, 0);
 
 	if (result == -EINPROGRESS)
 		result = 0;
diff --git a/include/net/sock.h b/include/net/sock.h
index cce11782dc295..809596ffd32d2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2690,5 +2690,6 @@ void sock_def_readable(struct sock *sk);
 void sock_set_reuseaddr(struct sock *sk, unsigned char reuse);
 void sock_set_linger(struct sock *sk, bool onoff, unsigned int linger);
 void sock_set_priority(struct sock *sk, u32 priority);
+void sock_set_sndtimeo(struct sock *sk, unsigned int secs);
 
 #endif	/* _SOCK_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index e9f1e2247b004..76527681e50b9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -752,6 +752,17 @@ void sock_set_priority(struct sock *sk, u32 priority)
 }
 EXPORT_SYMBOL(sock_set_priority);
 
+void sock_set_sndtimeo(struct sock *sk, unsigned int secs)
+{
+	lock_sock(sk);
+	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
+		sk->sk_sndtimeo = secs * HZ;
+	else
+		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+	release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_sndtimeo);
+
 /*
  *	This is meant for all protocols to use and covers goings on
  *	at the socket level. Everything here is generic.
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 05/33] net: add sock_bindtoindex
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (3 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 04/33] net: add sock_set_sndtimeo Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 06/33] net: add sock_set_timestamps Christoph Hellwig
                   ` (35 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the SO_BINDTOIFINDEX sockopt from kernel
space without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/net/sock.h        |  1 +
 net/core/sock.c           | 21 +++++++++++++++------
 net/ipv4/udp_tunnel.c     |  4 +---
 net/ipv6/ip6_udp_tunnel.c |  4 +---
 4 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 809596ffd32d2..b63ea15362065 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2691,5 +2691,6 @@ void sock_set_reuseaddr(struct sock *sk, unsigned char reuse);
 void sock_set_linger(struct sock *sk, bool onoff, unsigned int linger);
 void sock_set_priority(struct sock *sk, u32 priority);
 void sock_set_sndtimeo(struct sock *sk, unsigned int secs);
+int sock_bindtoindex(struct sock *sk, int ifindex);
 
 #endif	/* _SOCK_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 76527681e50b9..4b7439308caec 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -566,7 +566,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 }
 EXPORT_SYMBOL(sk_dst_check);
 
-static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
+static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 {
 	int ret = -ENOPROTOOPT;
 #ifdef CONFIG_NETDEVICES
@@ -594,6 +594,18 @@ static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
 	return ret;
 }
 
+int sock_bindtoindex(struct sock *sk, int ifindex)
+{
+	int ret;
+
+	lock_sock(sk);
+	ret = sock_bindtoindex_locked(sk, ifindex);
+	release_sock(sk);
+
+	return ret;
+}
+EXPORT_SYMBOL(sock_bindtoindex);
+
 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 				int optlen)
 {
@@ -634,10 +646,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 			goto out;
 	}
 
-	lock_sock(sk);
-	ret = sock_setbindtodevice_locked(sk, index);
-	release_sock(sk);
-
+	return sock_bindtoindex(sk, index);
 out:
 #endif
 
@@ -1221,7 +1230,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		break;
 
 	case SO_BINDTOIFINDEX:
-		ret = sock_setbindtodevice_locked(sk, val);
+		ret = sock_bindtoindex_locked(sk, val);
 		break;
 
 	default:
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 150e6f0fdbf59..2158e8bddf41c 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -22,9 +22,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
 		goto error;
 
 	if (cfg->bind_ifindex) {
-		err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX,
-					(void *)&cfg->bind_ifindex,
-					sizeof(cfg->bind_ifindex));
+		err = sock_bindtoindex(sock->sk, cfg->bind_ifindex);
 		if (err < 0)
 			goto error;
 	}
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 58956a6b66a21..6523609516d25 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -33,9 +33,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
 			goto error;
 	}
 	if (cfg->bind_ifindex) {
-		err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX,
-					(void *)&cfg->bind_ifindex,
-					sizeof(cfg->bind_ifindex));
+		err = sock_bindtoindex(sock->sk, cfg->bind_ifindex);
 		if (err < 0)
 			goto error;
 	}
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 06/33] net: add sock_set_timestamps
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (4 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 05/33] net: add sock_bindtoindex Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 07/33] net: add sock_set_keepalive Christoph Hellwig
                   ` (34 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the SO_TIMESTAMP* sockopts from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/net/sock.h       |  1 +
 net/core/sock.c          | 47 +++++++++++++++++++++++++---------------
 net/rxrpc/local_object.c |  8 +------
 3 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index b63ea15362065..cf8a30e0168de 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2692,5 +2692,6 @@ void sock_set_linger(struct sock *sk, bool onoff, unsigned int linger);
 void sock_set_priority(struct sock *sk, u32 priority);
 void sock_set_sndtimeo(struct sock *sk, unsigned int secs);
 int sock_bindtoindex(struct sock *sk, int ifindex);
+void sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns);
 
 #endif	/* _SOCK_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 4b7439308caec..1589f242ecc7e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -772,6 +772,28 @@ void sock_set_sndtimeo(struct sock *sk, unsigned int secs)
 }
 EXPORT_SYMBOL(sock_set_sndtimeo);
 
+static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
+{
+	if (val)  {
+		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
+		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
+		sock_set_flag(sk, SOCK_RCVTSTAMP);
+		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+	} else {
+		sock_reset_flag(sk, SOCK_RCVTSTAMP);
+		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+		sock_reset_flag(sk, SOCK_TSTAMP_NEW);
+	}
+}
+
+void sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
+{
+	lock_sock(sk);
+	__sock_set_timestamps(sk, val, new, ns);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_timestamps);
+
 /*
  *	This is meant for all protocols to use and covers goings on
  *	at the socket level. Everything here is generic.
@@ -953,28 +975,17 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		break;
 
 	case SO_TIMESTAMP_OLD:
+		__sock_set_timestamps(sk, valbool, false, false);
+		break;
 	case SO_TIMESTAMP_NEW:
+		__sock_set_timestamps(sk, valbool, true, false);
+		break;
 	case SO_TIMESTAMPNS_OLD:
+		__sock_set_timestamps(sk, valbool, false, true);
+		break;
 	case SO_TIMESTAMPNS_NEW:
-		if (valbool)  {
-			if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
-				sock_set_flag(sk, SOCK_TSTAMP_NEW);
-			else
-				sock_reset_flag(sk, SOCK_TSTAMP_NEW);
-
-			if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
-				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
-			else
-				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
-			sock_set_flag(sk, SOCK_RCVTSTAMP);
-			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
-		} else {
-			sock_reset_flag(sk, SOCK_RCVTSTAMP);
-			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
-			sock_reset_flag(sk, SOCK_TSTAMP_NEW);
-		}
+		__sock_set_timestamps(sk, valbool, true, true);
 		break;
-
 	case SO_TIMESTAMPING_NEW:
 		sock_set_flag(sk, SOCK_TSTAMP_NEW);
 		/* fall through */
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 01135e54d95d2..562ea36c96b0f 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -189,13 +189,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
 		}
 
 		/* We want receive timestamps. */
-		opt = 1;
-		ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
-					(char *)&opt, sizeof(opt));
-		if (ret < 0) {
-			_debug("setsockopt failed");
-			goto error;
-		}
+		sock_set_timestamps(local->socket->sk, true, false, true);
 		break;
 
 	default:
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 07/33] net: add sock_set_keepalive
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (5 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 06/33] net: add sock_set_timestamps Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 08/33] net: add sock_set_rcvbuf Christoph Hellwig
                   ` (33 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the SO_KEEPALIVE sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/dlm/lowcomms.c     |  6 +-----
 include/net/sock.h    |  1 +
 net/core/sock.c       | 10 ++++++++++
 net/rds/tcp_listen.c  |  6 +-----
 net/sunrpc/xprtsock.c |  4 +---
 5 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 0c0a6413fdfcc..16d616c180613 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1259,11 +1259,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
 		con->sock = NULL;
 		goto create_out;
 	}
-	result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
-				 (char *)&one, sizeof(one));
-	if (result < 0) {
-		log_print("Set keepalive failed: %d", result);
-	}
+	sock_set_keepalive(sock->sk, true);
 
 	result = sock->ops->listen(sock, 5);
 	if (result < 0) {
diff --git a/include/net/sock.h b/include/net/sock.h
index cf8a30e0168de..4cedde585424f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2691,6 +2691,7 @@ void sock_set_reuseaddr(struct sock *sk, unsigned char reuse);
 void sock_set_linger(struct sock *sk, bool onoff, unsigned int linger);
 void sock_set_priority(struct sock *sk, u32 priority);
 void sock_set_sndtimeo(struct sock *sk, unsigned int secs);
+void sock_set_keepalive(struct sock *sk, bool keepalive);
 int sock_bindtoindex(struct sock *sk, int ifindex);
 void sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 1589f242ecc7e..dfd2b839f88bb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -794,6 +794,16 @@ void sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 }
 EXPORT_SYMBOL(sock_set_timestamps);
 
+void sock_set_keepalive(struct sock *sk, bool keepalive)
+{
+	lock_sock(sk);
+	if (sk->sk_prot->keepalive)
+		sk->sk_prot->keepalive(sk, keepalive);
+	sock_valbool_flag(sk, SOCK_KEEPOPEN, keepalive);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_keepalive);
+
 /*
  *	This is meant for all protocols to use and covers goings on
  *	at the socket level. Everything here is generic.
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 96f7538e5fa8d..a55b39cd45a6c 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -43,13 +43,9 @@ int rds_tcp_keepalive(struct socket *sock)
 	/* values below based on xs_udp_default_timeout */
 	int keepidle = 5; /* send a probe 'keepidle' secs after last data */
 	int keepcnt = 5; /* number of unack'ed probes before declaring dead */
-	int keepalive = 1;
 	int ret = 0;
 
-	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
-				(char *)&keepalive, sizeof(keepalive));
-	if (ret < 0)
-		goto bail;
+	sock_set_keepalive(sock->sk, true);
 
 	ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
 				(char *)&keepcnt, sizeof(keepcnt));
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 845d0be805ece..bb61d3758be2b 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2110,7 +2110,6 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 	unsigned int keepidle;
 	unsigned int keepcnt;
-	unsigned int opt_on = 1;
 	unsigned int timeo;
 
 	spin_lock(&xprt->transport_lock);
@@ -2122,8 +2121,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
 	spin_unlock(&xprt->transport_lock);
 
 	/* TCP Keepalive options */
-	kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
-			(char *)&opt_on, sizeof(opt_on));
+	sock_set_keepalive(sock->sk, 1);
 	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
 			(char *)&keepidle, sizeof(keepidle));
 	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 08/33] net: add sock_set_rcvbuf
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (6 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 07/33] net: add sock_set_keepalive Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 09/33] net: add sock_set_reuseport Christoph Hellwig
                   ` (32 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the SO_RCVBUFFORCE sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/dlm/lowcomms.c  |  7 +-----
 include/net/sock.h |  1 +
 net/core/sock.c    | 59 +++++++++++++++++++++++++---------------------
 3 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 16d616c180613..223c185ecd0c7 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1297,7 +1297,6 @@ static int sctp_listen_for_all(void)
 	struct socket *sock = NULL;
 	int result = -EINVAL;
 	struct connection *con = nodeid2con(0, GFP_NOFS);
-	int bufsize = NEEDED_RMEM;
 	int one = 1;
 
 	if (!con)
@@ -1312,11 +1311,7 @@ static int sctp_listen_for_all(void)
 		goto out;
 	}
 
-	result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE,
-				 (char *)&bufsize, sizeof(bufsize));
-	if (result)
-		log_print("Error increasing buffer space on socket %d", result);
-
+	sock_set_rcvbuf(sock->sk, NEEDED_RMEM);
 	result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
 				   sizeof(one));
 	if (result < 0)
diff --git a/include/net/sock.h b/include/net/sock.h
index 4cedde585424f..e1ed40ff01312 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2692,6 +2692,7 @@ void sock_set_linger(struct sock *sk, bool onoff, unsigned int linger);
 void sock_set_priority(struct sock *sk, u32 priority);
 void sock_set_sndtimeo(struct sock *sk, unsigned int secs);
 void sock_set_keepalive(struct sock *sk, bool keepalive);
+void sock_set_rcvbuf(struct sock *sk, int val);
 int sock_bindtoindex(struct sock *sk, int ifindex);
 void sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index dfd2b839f88bb..6af01b757cf24 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -804,6 +804,35 @@ void sock_set_keepalive(struct sock *sk, bool keepalive)
 }
 EXPORT_SYMBOL(sock_set_keepalive);
 
+void __sock_set_rcvbuf(struct sock *sk, int val)
+{
+	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
+	 * as a negative value.
+	 */
+	val = min_t(int, val, INT_MAX / 2);
+	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+
+	/* We double it on the way in to account for "struct sk_buff" etc.
+	 * overhead.   Applications assume that the SO_RCVBUF setting they make
+	 * will allow that much actual data to be received on that socket.
+	 *
+	 * Applications are unaware that "struct sk_buff" and other overheads
+	 * allocate from the receive buffer during socket buffer allocation.
+	 *
+	 * And after considering the possible alternatives, returning the value
+	 * we actually used in getsockopt is the most desirable behavior.
+	 */
+	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
+}
+
+void sock_set_rcvbuf(struct sock *sk, int val)
+{
+	lock_sock(sk);
+	__sock_set_rcvbuf(sk, val);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_rcvbuf);
+
 /*
  *	This is meant for all protocols to use and covers goings on
  *	at the socket level. Everything here is generic.
@@ -900,30 +929,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 		 * are treated in BSD as hints
 		 */
-		val = min_t(u32, val, sysctl_rmem_max);
-set_rcvbuf:
-		/* Ensure val * 2 fits into an int, to prevent max_t()
-		 * from treating it as a negative value.
-		 */
-		val = min_t(int, val, INT_MAX / 2);
-		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
-		/*
-		 * We double it on the way in to account for
-		 * "struct sk_buff" etc. overhead.   Applications
-		 * assume that the SO_RCVBUF setting they make will
-		 * allow that much actual data to be received on that
-		 * socket.
-		 *
-		 * Applications are unaware that "struct sk_buff" and
-		 * other overheads allocate from the receive buffer
-		 * during socket buffer allocation.
-		 *
-		 * And after considering the possible alternatives,
-		 * returning the value we actually used in getsockopt
-		 * is the most desirable behavior.
-		 */
-		WRITE_ONCE(sk->sk_rcvbuf,
-			   max_t(int, val * 2, SOCK_MIN_RCVBUF));
+		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
 		break;
 
 	case SO_RCVBUFFORCE:
@@ -935,9 +941,8 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		/* No negative values (to prevent underflow, as val will be
 		 * multiplied by 2).
 		 */
-		if (val < 0)
-			val = 0;
-		goto set_rcvbuf;
+		__sock_set_rcvbuf(sk, max(val, 0));
+		break;
 
 	case SO_KEEPALIVE:
 		if (sk->sk_prot->keepalive)
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 09/33] net: add sock_set_reuseport
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (7 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 08/33] net: add sock_set_rcvbuf Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 10/33] tcp: add tcp_sock_set_cork Christoph Hellwig
                   ` (31 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the SO_REUSEPORT sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/net/sock.h    |  1 +
 net/core/sock.c       |  8 ++++++++
 net/sunrpc/xprtsock.c | 17 +----------------
 3 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index e1ed40ff01312..6b8e06947b243 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2688,6 +2688,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
 
 void sock_def_readable(struct sock *sk);
 void sock_set_reuseaddr(struct sock *sk, unsigned char reuse);
+void sock_set_reuseport(struct sock *sk, bool reuseport);
 void sock_set_linger(struct sock *sk, bool onoff, unsigned int linger);
 void sock_set_priority(struct sock *sk, u32 priority);
 void sock_set_sndtimeo(struct sock *sk, unsigned int secs);
diff --git a/net/core/sock.c b/net/core/sock.c
index 6af01b757cf24..7f0baf1ccde17 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -729,6 +729,14 @@ void sock_set_reuseaddr(struct sock *sk, unsigned char reuse)
 }
 EXPORT_SYMBOL(sock_set_reuseaddr);
 
+void sock_set_reuseport(struct sock *sk, bool reuseport)
+{
+	lock_sock(sk);
+	sk->sk_reuseport = reuseport;
+	release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseport);
+
 static void __sock_set_linger(struct sock *sk, bool onoff, unsigned int linger)
 {
 	if (!onoff) {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bb61d3758be2b..3dc2d52371a0e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1594,21 +1594,6 @@ static int xs_get_random_port(void)
 	return rand + min;
 }
 
-/**
- * xs_set_reuseaddr_port - set the socket's port and address reuse options
- * @sock: socket
- *
- * Note that this function has to be called on all sockets that share the
- * same port, and it must be called before binding.
- */
-static void xs_sock_set_reuseport(struct socket *sock)
-{
-	int opt = 1;
-
-	kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT,
-			(char *)&opt, sizeof(opt));
-}
-
 static unsigned short xs_sock_getport(struct socket *sock)
 {
 	struct sockaddr_storage buf;
@@ -1801,7 +1786,7 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt,
 	xs_reclassify_socket(family, sock);
 
 	if (reuseport)
-		xs_sock_set_reuseport(sock);
+		sock_set_reuseport(sock->sk, true);
 
 	err = xs_bind(transport, sock);
 	if (err) {
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 10/33] tcp: add tcp_sock_set_cork
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (8 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 09/33] net: add sock_set_reuseport Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 11/33] tcp: tcp_sock_set_nodelay Christoph Hellwig
                   ` (30 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the TCP_CORK sockopt from kernel space
without going through a fake uaccess.  Clean up the callers to avoid
pointless wrappers now that this is a simple function call.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/drbd/drbd_int.h      | 14 --------
 drivers/block/drbd/drbd_receiver.c |  4 +--
 drivers/block/drbd/drbd_worker.c   |  6 ++--
 fs/cifs/transport.c                |  8 ++---
 include/linux/tcp.h                |  2 ++
 net/ipv4/tcp.c                     | 51 +++++++++++++++++++-----------
 net/rds/tcp_send.c                 |  9 ++----
 7 files changed, 43 insertions(+), 51 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index aae99a2d7bd40..3550adc93c68b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1570,20 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled);
 extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
 extern int drbd_connected(struct drbd_peer_device *);
 
-static inline void drbd_tcp_cork(struct socket *sock)
-{
-	int val = 1;
-	(void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK,
-			(char*)&val, sizeof(val));
-}
-
-static inline void drbd_tcp_uncork(struct socket *sock)
-{
-	int val = 0;
-	(void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK,
-			(char*)&val, sizeof(val));
-}
-
 static inline void drbd_tcp_nodelay(struct socket *sock)
 {
 	int val = 1;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index c15e7083b13a6..55ea907ad33cb 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -6162,7 +6162,7 @@ void drbd_send_acks_wf(struct work_struct *ws)
 	rcu_read_unlock();
 
 	if (tcp_cork)
-		drbd_tcp_cork(connection->meta.socket);
+		tcp_sock_set_cork(connection->meta.socket->sk, true);
 
 	err = drbd_finish_peer_reqs(device);
 	kref_put(&device->kref, drbd_destroy_device);
@@ -6175,7 +6175,7 @@ void drbd_send_acks_wf(struct work_struct *ws)
 	}
 
 	if (tcp_cork)
-		drbd_tcp_uncork(connection->meta.socket);
+		tcp_sock_set_cork(connection->meta.socket->sk, false);
 
 	return;
 }
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 0dc019da1f8d0..2b89c9f2ca707 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -2098,7 +2098,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
 	if (uncork) {
 		mutex_lock(&connection->data.mutex);
 		if (connection->data.socket)
-			drbd_tcp_uncork(connection->data.socket);
+			tcp_sock_set_cork(connection->data.socket->sk, false);
 		mutex_unlock(&connection->data.mutex);
 	}
 
@@ -2153,9 +2153,9 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
 	mutex_lock(&connection->data.mutex);
 	if (connection->data.socket) {
 		if (cork)
-			drbd_tcp_cork(connection->data.socket);
+			tcp_sock_set_cork(connection->data.socket->sk, true);
 		else if (!uncork)
-			drbd_tcp_uncork(connection->data.socket);
+			tcp_sock_set_cork(connection->data.socket->sk, false);
 	}
 	mutex_unlock(&connection->data.mutex);
 }
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c97570eb2c180..99760063e0006 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -325,7 +325,6 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
 	size_t total_len = 0, sent, size;
 	struct socket *ssocket = server->ssocket;
 	struct msghdr smb_msg;
-	int val = 1;
 	__be32 rfc1002_marker;
 
 	if (cifs_rdma_enabled(server)) {
@@ -345,8 +344,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
 	}
 
 	/* cork the socket */
-	kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
-				(char *)&val, sizeof(val));
+	tcp_sock_set_cork(ssocket->sk, true);
 
 	for (j = 0; j < num_rqst; j++)
 		send_length += smb_rqst_len(server, &rqst[j]);
@@ -435,9 +433,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
 	}
 
 	/* uncork it */
-	val = 0;
-	kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
-				(char *)&val, sizeof(val));
+	tcp_sock_set_cork(ssocket->sk, false);
 
 	if ((total_len > 0) && (total_len != send_length)) {
 		cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n",
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index e60db06ec28d7..7ef0f975a7658 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -494,4 +494,6 @@ static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss)
 int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
 		  int shiftlen);
 
+void tcp_sock_set_cork(struct sock *sk, bool on);
+
 #endif	/* _LINUX_TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8c1250103959a..e11ba10b90d4c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2790,6 +2790,37 @@ static void tcp_enable_tx_delay(void)
 	}
 }
 
+/* When set indicates to always queue non-full frames.  Later the user clears
+ * this option and we transmit any pending partial frames in the queue.  This is
+ * meant to be used alongside sendfile() to get properly filled frames when the
+ * user (for example) must write out headers with a write() call first and then
+ * use sendfile to send out the data parts.
+ *
+ * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
+ * TCP_NODELAY.
+ */
+static void __tcp_sock_set_cork(struct sock *sk, bool on)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (on) {
+		tp->nonagle |= TCP_NAGLE_CORK;
+	} else {
+		tp->nonagle &= ~TCP_NAGLE_CORK;
+		if (tp->nonagle & TCP_NAGLE_OFF)
+			tp->nonagle |= TCP_NAGLE_PUSH;
+		tcp_push_pending_frames(sk);
+	}
+}
+
+void tcp_sock_set_cork(struct sock *sk, bool on)
+{
+	lock_sock(sk);
+	__tcp_sock_set_cork(sk, on);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_cork);
+
 /*
  *	Socket option code for TCP.
  */
@@ -2968,25 +2999,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		break;
 
 	case TCP_CORK:
-		/* When set indicates to always queue non-full frames.
-		 * Later the user clears this option and we transmit
-		 * any pending partial frames in the queue.  This is
-		 * meant to be used alongside sendfile() to get properly
-		 * filled frames when the user (for example) must write
-		 * out headers with a write() call first and then use
-		 * sendfile to send out the data parts.
-		 *
-		 * TCP_CORK can be set together with TCP_NODELAY and it is
-		 * stronger than TCP_NODELAY.
-		 */
-		if (val) {
-			tp->nonagle |= TCP_NAGLE_CORK;
-		} else {
-			tp->nonagle &= ~TCP_NAGLE_CORK;
-			if (tp->nonagle&TCP_NAGLE_OFF)
-				tp->nonagle |= TCP_NAGLE_PUSH;
-			tcp_push_pending_frames(sk);
-		}
+		__tcp_sock_set_cork(sk, val);
 		break;
 
 	case TCP_KEEPIDLE:
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 78a2554a44979..8c4d1d6e9249d 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -38,23 +38,18 @@
 #include "rds.h"
 #include "tcp.h"
 
-static void rds_tcp_cork(struct socket *sock, int val)
-{
-	kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (void *)&val, sizeof(val));
-}
-
 void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp)
 {
 	struct rds_tcp_connection *tc = cp->cp_transport_data;
 
-	rds_tcp_cork(tc->t_sock, 1);
+	tcp_sock_set_cork(tc->t_sock->sk, true);
 }
 
 void rds_tcp_xmit_path_complete(struct rds_conn_path *cp)
 {
 	struct rds_tcp_connection *tc = cp->cp_transport_data;
 
-	rds_tcp_cork(tc->t_sock, 0);
+	tcp_sock_set_cork(tc->t_sock->sk, false);
 }
 
 /* the core send_sem serializes this with other xmit and shutdown */
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 11/33] tcp: tcp_sock_set_nodelay
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (9 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 10/33] tcp: add tcp_sock_set_cork Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13 12:51   ` Jason Gunthorpe
  2020-05-13  6:26 ` [PATCH 12/33] tcp: add tcp_sock_set_quickack Christoph Hellwig
                   ` (29 subsequent siblings)
  40 siblings, 1 reply; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the TCP_NODELAY sockopt from kernel space
without going through a fake uaccess.  Clean up the callers to avoid
pointless wrappers now that this is a simple function call.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/drbd/drbd_int.h             |  7 ----
 drivers/block/drbd/drbd_main.c            |  2 +-
 drivers/block/drbd/drbd_receiver.c        |  4 +--
 drivers/infiniband/sw/siw/siw_cm.c        | 24 +++-----------
 drivers/nvme/host/tcp.c                   |  9 +-----
 drivers/nvme/target/tcp.c                 | 12 ++-----
 drivers/target/iscsi/iscsi_target_login.c | 15 ++-------
 fs/cifs/connect.c                         | 10 ++----
 fs/dlm/lowcomms.c                         |  8 ++---
 fs/ocfs2/cluster/tcp.c                    | 20 ++----------
 include/linux/tcp.h                       |  1 +
 net/ceph/messenger.c                      | 11 ++-----
 net/ipv4/tcp.c                            | 39 +++++++++++++++--------
 net/rds/tcp.c                             | 11 +------
 net/rds/tcp.h                             |  1 -
 net/rds/tcp_listen.c                      |  2 +-
 16 files changed, 49 insertions(+), 127 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 3550adc93c68b..e24bba87c8e02 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1570,13 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled);
 extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
 extern int drbd_connected(struct drbd_peer_device *);
 
-static inline void drbd_tcp_nodelay(struct socket *sock)
-{
-	int val = 1;
-	(void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
-			(char*)&val, sizeof(val));
-}
-
 static inline void drbd_tcp_quickack(struct socket *sock)
 {
 	int val = 2;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index c094c3c2c5d4d..4c876c7d7067f 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -660,7 +660,7 @@ static int __send_command(struct drbd_connection *connection, int vnr,
 	/* DRBD protocol "pings" are latency critical.
 	 * This is supposed to trigger tcp_push_pending_frames() */
 	if (!err && (cmd == P_PING || cmd == P_PING_ACK))
-		drbd_tcp_nodelay(sock->socket);
+		tcp_sock_set_nodelay(sock->socket->sk, true);
 
 	return err;
 }
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 55ea907ad33cb..da5a9ee896a43 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1051,8 +1051,8 @@ static int conn_connect(struct drbd_connection *connection)
 
 	/* we don't want delays.
 	 * we use TCP_CORK where appropriate, though */
-	drbd_tcp_nodelay(sock.socket);
-	drbd_tcp_nodelay(msock.socket);
+	tcp_sock_set_nodelay(sock.socket->sk, true);
+	tcp_sock_set_nodelay(msock.socket->sk, true);
 
 	connection->data.socket = sock.socket;
 	connection->meta.socket = msock.socket;
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index 6d7c8c933736c..7781bcddf7e23 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -947,16 +947,8 @@ static void siw_accept_newconn(struct siw_cep *cep)
 	siw_cep_get(new_cep);
 	new_s->sk->sk_user_data = new_cep;
 
-	if (siw_tcp_nagle == false) {
-		int val = 1;
-
-		rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY,
-				       (char *)&val, sizeof(val));
-		if (rv) {
-			siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv);
-			goto error;
-		}
-	}
+	if (siw_tcp_nagle == false)
+		tcp_sock_set_nodelay(new_s->sk, true);
 	new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;
 
 	rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT);
@@ -1386,16 +1378,8 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
 		siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
 		goto error;
 	}
-	if (siw_tcp_nagle == false) {
-		int val = 1;
-
-		rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val,
-				       sizeof(val));
-		if (rv) {
-			siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv);
-			goto error;
-		}
-	}
+	if (siw_tcp_nagle == false)
+		tcp_sock_set_nodelay(s->sk, true);
 	cep = siw_cep_alloc(sdev);
 	if (!cep) {
 		rv = -ENOMEM;
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index cd6a8fc14a139..a8070f93fd0a0 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1346,14 +1346,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	}
 
 	/* Set TCP no delay */
-	opt = 1;
-	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
-			TCP_NODELAY, (char *)&opt, sizeof(opt));
-	if (ret) {
-		dev_err(nctrl->device,
-			"failed to set TCP_NODELAY sock opt %d\n", ret);
-		goto err_sock;
-	}
+	tcp_sock_set_nodelay(queue->sock->sk, true);
 
 	/*
 	 * Cleanup whatever is sitting in the TCP transmit queue on socket
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 778c1ce3137b7..b2bfa791c5cb2 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1580,7 +1580,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport)
 {
 	struct nvmet_tcp_port *port;
 	__kernel_sa_family_t af;
-	int opt, ret;
+	int ret;
 
 	port = kzalloc(sizeof(*port), GFP_KERNEL);
 	if (!port)
@@ -1625,15 +1625,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport)
 	port->data_ready = port->sock->sk->sk_data_ready;
 	port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
 	sock_set_reuseaddr(port->sock->sk, SK_CAN_REUSE);
-
-	opt = 1;
-	ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
-			TCP_NODELAY, (char *)&opt, sizeof(opt));
-	if (ret) {
-		pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
-		goto err_sock;
-	}
-
+	tcp_sock_set_nodelay(port->sock->sk, true);
 	if (so_priority > 0)
 		sock_set_priority(port->sock->sk, so_priority);
 
diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c
index 7da59ece3eb99..165fa573bcb29 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -897,20 +897,11 @@ int iscsit_setup_np(
 	/*
 	 * Set SO_REUSEADDR, and disable Nagel Algorithm with TCP_NODELAY.
 	 */
-	/* FIXME: Someone please explain why this is endian-safe */
-	opt = 1;
-	if (np->np_network_transport == ISCSI_TCP) {
-		ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
-				(char *)&opt, sizeof(opt));
-		if (ret < 0) {
-			pr_err("kernel_setsockopt() for TCP_NODELAY"
-				" failed: %d\n", ret);
-			goto fail;
-		}
-	}
-
+	if (np->np_network_transport == ISCSI_TCP)
+		tcp_sock_set_nodelay(sock->sk, true);
 	sock_set_reuseaddr(sock->sk, SK_CAN_REUSE);
 
+	opt = 1;
 	ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND,
 			(char *)&opt, sizeof(opt));
 	if (ret < 0) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 28268ed461b82..b0422ed617832 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3929,14 +3929,8 @@ generic_ip_connect(struct TCP_Server_Info *server)
 			socket->sk->sk_rcvbuf = 140 * 1024;
 	}
 
-	if (server->tcp_nodelay) {
-		int val = 1;
-		rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
-				(char *)&val, sizeof(val));
-		if (rc)
-			cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n",
-				 rc);
-	}
+	if (server->tcp_nodelay)
+		tcp_sock_set_nodelay(socket->sk, true);
 
 	cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n",
 		 socket->sk->sk_sndbuf,
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 223c185ecd0c7..b722a09a7ca05 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1128,7 +1128,6 @@ static void tcp_connect_to_sock(struct connection *con)
 	struct sockaddr_storage saddr, src_addr;
 	int addr_len;
 	struct socket *sock = NULL;
-	int one = 1;
 	int result;
 
 	if (con->nodeid == 0) {
@@ -1177,8 +1176,7 @@ static void tcp_connect_to_sock(struct connection *con)
 	log_print("connecting to %d", con->nodeid);
 
 	/* Turn off Nagle's algorithm */
-	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
-			  sizeof(one));
+	tcp_sock_set_nodelay(sock->sk, true);
 
 	result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
 				   O_NONBLOCK);
@@ -1220,7 +1218,6 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
 {
 	struct socket *sock = NULL;
 	int result = 0;
-	int one = 1;
 	int addr_len;
 
 	if (dlm_local_addr[0]->ss_family == AF_INET)
@@ -1237,8 +1234,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
 	}
 
 	/* Turn off Nagle's algorithm */
-	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
-			  sizeof(one));
+	tcp_sock_set_nodelay(sock->sk, true);
 
 	sock_set_reuseaddr(sock->sk, SK_CAN_REUSE);
 
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2c512b40a940e..7936e22e39f34 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1441,14 +1441,6 @@ static void o2net_rx_until_empty(struct work_struct *work)
 	sc_put(sc);
 }
 
-static int o2net_set_nodelay(struct socket *sock)
-{
-	int val = 1;
-
-	return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
-				    (void *)&val, sizeof(val));
-}
-
 static int o2net_set_usertimeout(struct socket *sock)
 {
 	int user_timeout = O2NET_TCP_USER_TIMEOUT;
@@ -1636,11 +1628,7 @@ static void o2net_start_connect(struct work_struct *work)
 		goto out;
 	}
 
-	ret = o2net_set_nodelay(sc->sc_sock);
-	if (ret) {
-		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
-		goto out;
-	}
+	tcp_sock_set_nodelay(sc->sc_sock->sk, true);
 
 	ret = o2net_set_usertimeout(sock);
 	if (ret) {
@@ -1832,11 +1820,7 @@ static int o2net_accept_one(struct socket *sock, int *more)
 	*more = 1;
 	new_sock->sk->sk_allocation = GFP_ATOMIC;
 
-	ret = o2net_set_nodelay(new_sock);
-	if (ret) {
-		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
-		goto out;
-	}
+	tcp_sock_set_nodelay(new_sock->sk, true);
 
 	ret = o2net_set_usertimeout(new_sock);
 	if (ret) {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7ef0f975a7658..533610b6ae420 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -495,5 +495,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
 		  int shiftlen);
 
 void tcp_sock_set_cork(struct sock *sk, bool on);
+void tcp_sock_set_nodelay(struct sock *sk, bool on);
 
 #endif	/* _LINUX_TCP_H */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index f8ca5edc5f2c9..67d5cb2ddc9cb 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -490,15 +490,8 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 		return ret;
 	}
 
-	if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) {
-		int optval = 1;
-
-		ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
-					(char *)&optval, sizeof(optval));
-		if (ret)
-			pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d",
-			       ret);
-	}
+	if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY))
+		tcp_sock_set_nodelay(sock->sk, true);
 
 	con->sock = sock;
 	return 0;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e11ba10b90d4c..300ce622607d8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2821,6 +2821,30 @@ void tcp_sock_set_cork(struct sock *sk, bool on)
 }
 EXPORT_SYMBOL(tcp_sock_set_cork);
 
+/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is
+ * remembered, but it is not activated until cork is cleared.
+ *
+ * However, when TCP_NODELAY is set we make an explicit push, which overrides
+ * even TCP_CORK for currently queued segments.
+ */
+static void __tcp_sock_set_nodelay(struct sock *sk, bool on)
+{
+	if (on) {
+		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
+		tcp_push_pending_frames(sk);
+	} else {
+		tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
+	}
+}
+
+void tcp_sock_set_nodelay(struct sock *sk, bool on)
+{
+	lock_sock(sk);
+	__tcp_sock_set_nodelay(sk, on);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_nodelay);
+
 /*
  *	Socket option code for TCP.
  */
@@ -2918,20 +2942,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		break;
 
 	case TCP_NODELAY:
-		if (val) {
-			/* TCP_NODELAY is weaker than TCP_CORK, so that
-			 * this option on corked socket is remembered, but
-			 * it is not activated until cork is cleared.
-			 *
-			 * However, when TCP_NODELAY is set we make
-			 * an explicit push, which overrides even TCP_CORK
-			 * for currently queued segments.
-			 */
-			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
-			tcp_push_pending_frames(sk);
-		} else {
-			tp->nonagle &= ~TCP_NAGLE_OFF;
-		}
+		__tcp_sock_set_nodelay(sk, val);
 		break;
 
 	case TCP_THIN_LINEAR_TIMEOUTS:
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 46782fac4c162..8721803958f6b 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -89,15 +89,6 @@ static struct ctl_table rds_tcp_sysctl_table[] = {
 	{ }
 };
 
-/* doing it this way avoids calling tcp_sk() */
-void rds_tcp_nonagle(struct socket *sock)
-{
-	int val = 1;
-
-	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (void *)&val,
-			      sizeof(val));
-}
-
 u32 rds_tcp_write_seq(struct rds_tcp_connection *tc)
 {
 	/* seq# of the last byte of data in tcp send buffer */
@@ -502,7 +493,7 @@ void rds_tcp_tune(struct socket *sock)
 	struct net *net = sock_net(sk);
 	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
 
-	rds_tcp_nonagle(sock);
+	tcp_sock_set_nodelay(sock->sk, true);
 	lock_sock(sk);
 	if (rtn->sndbuf_size > 0) {
 		sk->sk_sndbuf = rtn->sndbuf_size;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 3c69361d21c73..39ac666d09c6c 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -50,7 +50,6 @@ struct rds_tcp_statistics {
 
 /* tcp.c */
 void rds_tcp_tune(struct socket *sock);
-void rds_tcp_nonagle(struct socket *sock);
 void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp);
 void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp);
 void rds_tcp_restore_callbacks(struct socket *sock,
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index a55b39cd45a6c..e76ec64b43fe7 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -293,7 +293,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6)
 	}
 
 	sock->sk->sk_reuse = SK_CAN_REUSE;
-	rds_tcp_nonagle(sock);
+	tcp_sock_set_nodelay(sock->sk, true);
 
 	write_lock_bh(&sock->sk->sk_callback_lock);
 	sock->sk->sk_user_data = sock->sk->sk_data_ready;
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 12/33] tcp: add tcp_sock_set_quickack
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (10 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 11/33] tcp: tcp_sock_set_nodelay Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 13/33] tcp: add tcp_sock_set_syncnt Christoph Hellwig
                   ` (28 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the TCP_QUICKACK sockopt from kernel space
without going through a fake uaccess.  Clean up the callers to avoid
pointless wrappers now that this is a simple function call.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/drbd/drbd_int.h      |  7 ------
 drivers/block/drbd/drbd_receiver.c |  5 ++--
 include/linux/tcp.h                |  1 +
 net/ipv4/tcp.c                     | 39 ++++++++++++++++++++----------
 4 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e24bba87c8e02..14345a87c7cc5 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1570,13 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled);
 extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
 extern int drbd_connected(struct drbd_peer_device *);
 
-static inline void drbd_tcp_quickack(struct socket *sock)
-{
-	int val = 2;
-	(void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
-			(char*)&val, sizeof(val));
-}
-
 /* sets the number of 512 byte sectors of our virtual device */
 void drbd_set_my_capacity(struct drbd_device *device, sector_t size);
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index da5a9ee896a43..cdd317ae97021 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1223,7 +1223,7 @@ static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, str
 		 * quickly as possible, and let remote TCP know what we have
 		 * received so far. */
 		if (err == -EAGAIN) {
-			drbd_tcp_quickack(connection->data.socket);
+			tcp_sock_set_quickack(connection->data.socket->sk, 2);
 			drbd_unplug_all_devices(connection);
 		}
 		if (err > 0) {
@@ -4959,8 +4959,7 @@ static int receive_UnplugRemote(struct drbd_connection *connection, struct packe
 {
 	/* Make sure we've acked all the TCP data associated
 	 * with the data requests being unplugged */
-	drbd_tcp_quickack(connection->data.socket);
-
+	tcp_sock_set_quickack(connection->data.socket->sk, 2);
 	return 0;
 }
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 533610b6ae420..e7ab6da5111b5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -496,5 +496,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
 
 void tcp_sock_set_cork(struct sock *sk, bool on);
 void tcp_sock_set_nodelay(struct sock *sk, bool on);
+void tcp_sock_set_quickack(struct sock *sk, int val);
 
 #endif	/* _LINUX_TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 300ce622607d8..c681f43f0bb85 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2845,6 +2845,31 @@ void tcp_sock_set_nodelay(struct sock *sk, bool on)
 }
 EXPORT_SYMBOL(tcp_sock_set_nodelay);
 
+static void __tcp_sock_set_quickack(struct sock *sk, int val)
+{
+	if (!val) {
+		inet_csk_enter_pingpong_mode(sk);
+		return;
+	}
+
+	inet_csk_exit_pingpong_mode(sk);
+	if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+	    inet_csk_ack_scheduled(sk)) {
+		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
+		tcp_cleanup_rbuf(sk, 1);
+		if (!(val & 1))
+			inet_csk_enter_pingpong_mode(sk);
+	}
+}
+
+void tcp_sock_set_quickack(struct sock *sk, int val)
+{
+	lock_sock(sk);
+	__tcp_sock_set_quickack(sk, val);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_quickack);
+
 /*
  *	Socket option code for TCP.
  */
@@ -3085,19 +3110,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		break;
 
 	case TCP_QUICKACK:
-		if (!val) {
-			inet_csk_enter_pingpong_mode(sk);
-		} else {
-			inet_csk_exit_pingpong_mode(sk);
-			if ((1 << sk->sk_state) &
-			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
-			    inet_csk_ack_scheduled(sk)) {
-				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
-				tcp_cleanup_rbuf(sk, 1);
-				if (!(val & 1))
-					inet_csk_enter_pingpong_mode(sk);
-			}
-		}
+		__tcp_sock_set_quickack(sk, val);
 		break;
 
 #ifdef CONFIG_TCP_MD5SIG
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 13/33] tcp: add tcp_sock_set_syncnt
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (11 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 12/33] tcp: add tcp_sock_set_quickack Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 14/33] tcp: add tcp_sock_set_user_timeout Christoph Hellwig
                   ` (27 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the TCP_SYNCNT sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c |  9 +--------
 include/linux/tcp.h     |  1 +
 net/ipv4/tcp.c          | 12 ++++++++++++
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index a8070f93fd0a0..8417eeb83fcd2 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1336,14 +1336,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	}
 
 	/* Single syn retry */
-	opt = 1;
-	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
-			(char *)&opt, sizeof(opt));
-	if (ret) {
-		dev_err(nctrl->device,
-			"failed to set TCP_SYNCNT sock opt %d\n", ret);
-		goto err_sock;
-	}
+	tcp_sock_set_syncnt(queue->sock->sk, 1);
 
 	/* Set TCP no delay */
 	tcp_sock_set_nodelay(queue->sock->sk, true);
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index e7ab6da5111b5..77b832acf3398 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -497,5 +497,6 @@ int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
 void tcp_sock_set_cork(struct sock *sk, bool on);
 void tcp_sock_set_nodelay(struct sock *sk, bool on);
 void tcp_sock_set_quickack(struct sock *sk, int val);
+int tcp_sock_set_syncnt(struct sock *sk, int val);
 
 #endif	/* _LINUX_TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c681f43f0bb85..773b5cd366ab7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2870,6 +2870,18 @@ void tcp_sock_set_quickack(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_sock_set_quickack);
 
+int tcp_sock_set_syncnt(struct sock *sk, int val)
+{
+	if (val < 1 || val > MAX_TCP_SYNCNT)
+		return -EINVAL;
+
+	lock_sock(sk);
+	inet_csk(sk)->icsk_syn_retries = val;
+	release_sock(sk);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_syncnt);
+
 /*
  *	Socket option code for TCP.
  */
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 14/33] tcp: add tcp_sock_set_user_timeout
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (12 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 13/33] tcp: add tcp_sock_set_syncnt Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 15/33] tcp: add tcp_sock_set_keepidle Christoph Hellwig
                   ` (26 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the TCP_USER_TIMEOUT sockopt from kernel
space without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/ocfs2/cluster/tcp.c | 22 ++--------------------
 include/linux/tcp.h    |  1 +
 net/ipv4/tcp.c         |  8 ++++++++
 net/sunrpc/xprtsock.c  |  3 +--
 4 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 7936e22e39f34..5776df10d11f9 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1441,14 +1441,6 @@ static void o2net_rx_until_empty(struct work_struct *work)
 	sc_put(sc);
 }
 
-static int o2net_set_usertimeout(struct socket *sock)
-{
-	int user_timeout = O2NET_TCP_USER_TIMEOUT;
-
-	return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
-				(void *)&user_timeout, sizeof(user_timeout));
-}
-
 static void o2net_initialize_handshake(void)
 {
 	o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
@@ -1629,12 +1621,7 @@ static void o2net_start_connect(struct work_struct *work)
 	}
 
 	tcp_sock_set_nodelay(sc->sc_sock->sk, true);
-
-	ret = o2net_set_usertimeout(sock);
-	if (ret) {
-		mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
-		goto out;
-	}
+	tcp_sock_set_user_timeout(sock->sk, O2NET_TCP_USER_TIMEOUT);
 
 	o2net_register_callbacks(sc->sc_sock->sk, sc);
 
@@ -1821,12 +1808,7 @@ static int o2net_accept_one(struct socket *sock, int *more)
 	new_sock->sk->sk_allocation = GFP_ATOMIC;
 
 	tcp_sock_set_nodelay(new_sock->sk, true);
-
-	ret = o2net_set_usertimeout(new_sock);
-	if (ret) {
-		mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
-		goto out;
-	}
+	tcp_sock_set_user_timeout(new_sock->sk, O2NET_TCP_USER_TIMEOUT);
 
 	ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1);
 	if (ret < 0)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 77b832acf3398..69c988f84a184 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -498,5 +498,6 @@ void tcp_sock_set_cork(struct sock *sk, bool on);
 void tcp_sock_set_nodelay(struct sock *sk, bool on);
 void tcp_sock_set_quickack(struct sock *sk, int val);
 int tcp_sock_set_syncnt(struct sock *sk, int val);
+void tcp_sock_set_user_timeout(struct sock *sk, u32 val);
 
 #endif	/* _LINUX_TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 773b5cd366ab7..9a8d062b17a48 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2882,6 +2882,14 @@ int tcp_sock_set_syncnt(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_sock_set_syncnt);
 
+void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
+{
+	lock_sock(sk);
+	inet_csk(sk)->icsk_user_timeout = val;
+	release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_user_timeout);
+
 /*
  *	Socket option code for TCP.
  */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 3dc2d52371a0e..30d4c4fcd3e38 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2115,8 +2115,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
 			(char *)&keepcnt, sizeof(keepcnt));
 
 	/* TCP user timeout (see RFC5482) */
-	kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
-			(char *)&timeo, sizeof(timeo));
+	tcp_sock_set_user_timeout(sock->sk, timeo);
 }
 
 static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt,
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 15/33] tcp: add tcp_sock_set_keepidle
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (13 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 14/33] tcp: add tcp_sock_set_user_timeout Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 16/33] tcp: add tcp_sock_set_keepintvl Christoph Hellwig
                   ` (25 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the TCP_KEEPIDLE sockopt from kernel
space without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/tcp.h   |  1 +
 net/ipv4/tcp.c        | 49 ++++++++++++++++++++++++++++++-------------
 net/rds/tcp_listen.c  |  5 +----
 net/sunrpc/xprtsock.c |  3 +--
 4 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 69c988f84a184..4d3a3e959e45b 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -499,5 +499,6 @@ void tcp_sock_set_nodelay(struct sock *sk, bool on);
 void tcp_sock_set_quickack(struct sock *sk, int val);
 int tcp_sock_set_syncnt(struct sock *sk, int val);
 void tcp_sock_set_user_timeout(struct sock *sk, u32 val);
+int tcp_sock_set_keepidle(struct sock *sk, int val);
 
 #endif	/* _LINUX_TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9a8d062b17a48..22eb9159c7d05 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2890,6 +2890,39 @@ void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
 }
 EXPORT_SYMBOL(tcp_sock_set_user_timeout);
 
+static int __tcp_sock_set_keepidle(struct sock *sk, int val)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (val < 1 || val > MAX_TCP_KEEPIDLE)
+		return -EINVAL;
+
+	tp->keepalive_time = val * HZ;
+	if (sock_flag(sk, SOCK_KEEPOPEN) &&
+	    !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
+		u32 elapsed = keepalive_time_elapsed(tp);
+
+		if (tp->keepalive_time > elapsed)
+			elapsed = tp->keepalive_time - elapsed;
+		else
+			elapsed = 0;
+		inet_csk_reset_keepalive_timer(sk, elapsed);
+	}
+
+	return 0;
+}
+
+int tcp_sock_set_keepidle(struct sock *sk, int val)
+{
+	int err;
+
+	lock_sock(sk);
+	err = __tcp_sock_set_keepidle(sk, val);
+	release_sock(sk);
+	return err;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepidle);
+
 /*
  *	Socket option code for TCP.
  */
@@ -3059,21 +3092,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		break;
 
 	case TCP_KEEPIDLE:
-		if (val < 1 || val > MAX_TCP_KEEPIDLE)
-			err = -EINVAL;
-		else {
-			tp->keepalive_time = val * HZ;
-			if (sock_flag(sk, SOCK_KEEPOPEN) &&
-			    !((1 << sk->sk_state) &
-			      (TCPF_CLOSE | TCPF_LISTEN))) {
-				u32 elapsed = keepalive_time_elapsed(tp);
-				if (tp->keepalive_time > elapsed)
-					elapsed = tp->keepalive_time - elapsed;
-				else
-					elapsed = 0;
-				inet_csk_reset_keepalive_timer(sk, elapsed);
-			}
-		}
+		err = __tcp_sock_set_keepidle(sk, val);
 		break;
 	case TCP_KEEPINTVL:
 		if (val < 1 || val > MAX_TCP_KEEPINTVL)
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index e76ec64b43fe7..8c76969d8c878 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -52,10 +52,7 @@ int rds_tcp_keepalive(struct socket *sock)
 	if (ret < 0)
 		goto bail;
 
-	ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
-				(char *)&keepidle, sizeof(keepidle));
-	if (ret < 0)
-		goto bail;
+	tcp_sock_set_keepidle(sock->sk, keepidle);
 
 	/* KEEPINTVL is the interval between successive probes. We follow
 	 * the model in xs_tcp_finish_connecting() and re-use keepidle.
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 30d4c4fcd3e38..ea79446789c69 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2107,8 +2107,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
 
 	/* TCP Keepalive options */
 	sock_set_keepalive(sock->sk, 1);
-	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
-			(char *)&keepidle, sizeof(keepidle));
+	tcp_sock_set_keepidle(sock->sk, keepidle);
 	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
 			(char *)&keepidle, sizeof(keepidle));
 	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 16/33] tcp: add tcp_sock_set_keepintvl
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (14 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 15/33] tcp: add tcp_sock_set_keepidle Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 17/33] tcp: add tcp_sock_set_keepcnt Christoph Hellwig
                   ` (24 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the TCP_KEEPINTVL sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/tcp.h   |  1 +
 net/ipv4/tcp.c        | 12 ++++++++++++
 net/rds/tcp_listen.c  |  4 +---
 net/sunrpc/xprtsock.c |  3 +--
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4d3a3e959e45b..dad18ca361c01 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -500,5 +500,6 @@ void tcp_sock_set_quickack(struct sock *sk, int val);
 int tcp_sock_set_syncnt(struct sock *sk, int val);
 void tcp_sock_set_user_timeout(struct sock *sk, u32 val);
 int tcp_sock_set_keepidle(struct sock *sk, int val);
+int tcp_sock_set_keepintvl(struct sock *sk, int val);
 
 #endif	/* _LINUX_TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 22eb9159c7d05..b714f2b2fa54e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2923,6 +2923,18 @@ int tcp_sock_set_keepidle(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_sock_set_keepidle);
 
+int tcp_sock_set_keepintvl(struct sock *sk, int val)
+{
+	if (val < 1 || val > MAX_TCP_KEEPINTVL)
+		return -EINVAL;
+
+	lock_sock(sk);
+	tcp_sk(sk)->keepalive_intvl = val * HZ;
+	release_sock(sk);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepintvl);
+
 /*
  *	Socket option code for TCP.
  */
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 8c76969d8c878..a5db2f8bb7339 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -53,12 +53,10 @@ int rds_tcp_keepalive(struct socket *sock)
 		goto bail;
 
 	tcp_sock_set_keepidle(sock->sk, keepidle);
-
 	/* KEEPINTVL is the interval between successive probes. We follow
 	 * the model in xs_tcp_finish_connecting() and re-use keepidle.
 	 */
-	ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
-				(char *)&keepidle, sizeof(keepidle));
+	tcp_sock_set_keepintvl(sock->sk, keepidle);
 bail:
 	return ret;
 }
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index ea79446789c69..e20de4a52edb7 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2108,8 +2108,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
 	/* TCP Keepalive options */
 	sock_set_keepalive(sock->sk, 1);
 	tcp_sock_set_keepidle(sock->sk, keepidle);
-	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
-			(char *)&keepidle, sizeof(keepidle));
+	tcp_sock_set_keepintvl(sock->sk, keepidle);
 	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
 			(char *)&keepcnt, sizeof(keepcnt));
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 17/33] tcp: add tcp_sock_set_keepcnt
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (15 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 16/33] tcp: add tcp_sock_set_keepintvl Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 18/33] ipv4: add ip_sock_set_tos Christoph Hellwig
                   ` (23 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the TCP_KEEPCNT sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/tcp.h   |  1 +
 net/ipv4/tcp.c        | 12 ++++++++++++
 net/rds/tcp.h         |  2 +-
 net/rds/tcp_listen.c  | 17 +++--------------
 net/sunrpc/xprtsock.c |  3 +--
 5 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index dad18ca361c01..ff2aa165b5c02 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -501,5 +501,6 @@ int tcp_sock_set_syncnt(struct sock *sk, int val);
 void tcp_sock_set_user_timeout(struct sock *sk, u32 val);
 int tcp_sock_set_keepidle(struct sock *sk, int val);
 int tcp_sock_set_keepintvl(struct sock *sk, int val);
+int tcp_sock_set_keepcnt(struct sock *sk, int val);
 
 #endif	/* _LINUX_TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b714f2b2fa54e..a0406df42ef39 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2935,6 +2935,18 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_sock_set_keepintvl);
 
+int tcp_sock_set_keepcnt(struct sock *sk, int val)
+{
+	if (val < 1 || val > MAX_TCP_KEEPCNT)
+		return -EINVAL;
+
+	lock_sock(sk);
+	tcp_sk(sk)->keepalive_probes = val;
+	release_sock(sk);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepcnt);
+
 /*
  *	Socket option code for TCP.
  */
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 39ac666d09c6c..ae18568bce233 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -70,7 +70,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
 void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
 void rds_tcp_listen_data_ready(struct sock *sk);
 int rds_tcp_accept_one(struct socket *sock);
-int rds_tcp_keepalive(struct socket *sock);
+void rds_tcp_keepalive(struct socket *sock);
 void *rds_tcp_listen_sock_def_readable(struct net *net);
 void rds_tcp_set_linger(struct socket *sock);
 
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index a5db2f8bb7339..f6d2b4c9f445a 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -38,27 +38,19 @@
 #include "rds.h"
 #include "tcp.h"
 
-int rds_tcp_keepalive(struct socket *sock)
+void rds_tcp_keepalive(struct socket *sock)
 {
 	/* values below based on xs_udp_default_timeout */
 	int keepidle = 5; /* send a probe 'keepidle' secs after last data */
 	int keepcnt = 5; /* number of unack'ed probes before declaring dead */
-	int ret = 0;
 
 	sock_set_keepalive(sock->sk, true);
-
-	ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
-				(char *)&keepcnt, sizeof(keepcnt));
-	if (ret < 0)
-		goto bail;
-
+	tcp_sock_set_keepcnt(sock->sk, keepcnt);
 	tcp_sock_set_keepidle(sock->sk, keepidle);
 	/* KEEPINTVL is the interval between successive probes. We follow
 	 * the model in xs_tcp_finish_connecting() and re-use keepidle.
 	 */
 	tcp_sock_set_keepintvl(sock->sk, keepidle);
-bail:
-	return ret;
 }
 
 /* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the
@@ -145,10 +137,7 @@ int rds_tcp_accept_one(struct socket *sock)
 	new_sock->ops = sock->ops;
 	__module_get(new_sock->ops->owner);
 
-	ret = rds_tcp_keepalive(new_sock);
-	if (ret < 0)
-		goto out;
-
+	rds_tcp_keepalive(new_sock);
 	rds_tcp_tune(new_sock);
 
 	inet = inet_sk(new_sock->sk);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e20de4a52edb7..88aa198456858 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2109,8 +2109,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
 	sock_set_keepalive(sock->sk, 1);
 	tcp_sock_set_keepidle(sock->sk, keepidle);
 	tcp_sock_set_keepintvl(sock->sk, keepidle);
-	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
-			(char *)&keepcnt, sizeof(keepcnt));
+	tcp_sock_set_keepcnt(sock->sk, keepcnt);
 
 	/* TCP user timeout (see RFC5482) */
 	tcp_sock_set_user_timeout(sock->sk, timeo);
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 18/33] ipv4: add ip_sock_set_tos
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (16 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 17/33] tcp: add tcp_sock_set_keepcnt Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 19/33] ipv4: add ip_sock_set_freebind Christoph Hellwig
                   ` (22 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the IP_TOS sockopt from kernel space without
going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c   | 14 +++-----------
 drivers/nvme/target/tcp.c | 10 ++--------
 include/net/ip.h          |  2 ++
 net/ipv4/ip_sockglue.c    | 30 +++++++++++++++++++++---------
 4 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 8417eeb83fcd2..6c069e982989e 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1313,7 +1313,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 {
 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
 	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
-	int ret, opt, rcv_pdu_size;
+	int ret, rcv_pdu_size;
 
 	queue->ctrl = ctrl;
 	INIT_LIST_HEAD(&queue->send_list);
@@ -1352,16 +1352,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 		sock_set_priority(queue->sock->sk, so_priority);
 
 	/* Set socket type of service */
-	if (nctrl->opts->tos >= 0) {
-		opt = nctrl->opts->tos;
-		ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
-				(char *)&opt, sizeof(opt));
-		if (ret) {
-			dev_err(nctrl->device,
-				"failed to set IP_TOS sock opt %d\n", ret);
-			goto err_sock;
-		}
-	}
+	if (nctrl->opts->tos >= 0)
+		ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
 
 	queue->sock->sk->sk_allocation = GFP_ATOMIC;
 	nvme_tcp_set_queue_io_cpu(queue);
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index b2bfa791c5cb2..4296fe3c745bf 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1452,14 +1452,8 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 		sock_set_priority(sock->sk, so_priority);
 
 	/* Set socket type of service */
-	if (inet->rcv_tos > 0) {
-		int tos = inet->rcv_tos;
-
-		ret = kernel_setsockopt(sock, SOL_IP, IP_TOS,
-				(char *)&tos, sizeof(tos));
-		if (ret)
-			return ret;
-	}
+	if (inet->rcv_tos > 0)
+		ip_sock_set_tos(sock->sk, inet->rcv_tos);
 
 	write_lock_bh(&sock->sk->sk_callback_lock);
 	sock->sk->sk_user_data = queue;
diff --git a/include/net/ip.h b/include/net/ip.h
index 5b317c9f4470a..2fc52e26fa88b 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -765,4 +765,6 @@ static inline bool inetdev_valid_mtu(unsigned int mtu)
 	return likely(mtu >= IPV4_MIN_MTU);
 }
 
+void ip_sock_set_tos(struct sock *sk, int val);
+
 #endif	/* _IP_H */
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 8206047d70b6b..1733ac78c21aa 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -560,6 +560,26 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 	return err;
 }
 
+static void __ip_sock_set_tos(struct sock *sk, int val)
+{
+	if (sk->sk_type == SOCK_STREAM) {
+		val &= ~INET_ECN_MASK;
+		val |= inet_sk(sk)->tos & INET_ECN_MASK;
+	}
+	if (inet_sk(sk)->tos != val) {
+		inet_sk(sk)->tos = val;
+		sk->sk_priority = rt_tos2priority(val);
+		sk_dst_reset(sk);
+	}
+}
+
+void ip_sock_set_tos(struct sock *sk, int val)
+{
+	lock_sock(sk);
+	__ip_sock_set_tos(sk, val);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(ip_sock_set_tos);
 
 /*
  *	Socket option code for IP. This is the end of the line after any
@@ -743,15 +763,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE;
 		break;
 	case IP_TOS:	/* This sets both TOS and Precedence */
-		if (sk->sk_type == SOCK_STREAM) {
-			val &= ~INET_ECN_MASK;
-			val |= inet->tos & INET_ECN_MASK;
-		}
-		if (inet->tos != val) {
-			inet->tos = val;
-			sk->sk_priority = rt_tos2priority(val);
-			sk_dst_reset(sk);
-		}
+		__ip_sock_set_tos(sk, val);
 		break;
 	case IP_TTL:
 		if (optlen < 1)
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 19/33] ipv4: add ip_sock_set_freebind
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (17 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 18/33] ipv4: add ip_sock_set_tos Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 20/33] ipv4: add ip_sock_set_recverr Christoph Hellwig
                   ` (21 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the IP_FREEBIND sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/target/iscsi/iscsi_target_login.c | 13 +++----------
 include/net/ip.h                          |  1 +
 net/ipv4/ip_sockglue.c                    |  8 ++++++++
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c
index 165fa573bcb29..9f69e16cfef5f 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -15,6 +15,7 @@
 #include <linux/sched/signal.h>
 #include <linux/idr.h>
 #include <linux/tcp.h>        /* TCP_NODELAY */
+#include <net/ip.h>
 #include <net/ipv6.h>         /* ipv6_addr_v4mapped() */
 #include <scsi/iscsi_proto.h>
 #include <target/target_core_base.h>
@@ -855,7 +856,7 @@ int iscsit_setup_np(
 	struct sockaddr_storage *sockaddr)
 {
 	struct socket *sock = NULL;
-	int backlog = ISCSIT_TCP_BACKLOG, ret, opt = 0, len;
+	int backlog = ISCSIT_TCP_BACKLOG, ret, len;
 
 	switch (np->np_network_transport) {
 	case ISCSI_TCP:
@@ -900,15 +901,7 @@ int iscsit_setup_np(
 	if (np->np_network_transport == ISCSI_TCP)
 		tcp_sock_set_nodelay(sock->sk, true);
 	sock_set_reuseaddr(sock->sk, SK_CAN_REUSE);
-
-	opt = 1;
-	ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND,
-			(char *)&opt, sizeof(opt));
-	if (ret < 0) {
-		pr_err("kernel_setsockopt() for IP_FREEBIND"
-			" failed\n");
-		goto fail;
-	}
+	ip_sock_set_freebind(sock->sk, true);
 
 	ret = kernel_bind(sock, (struct sockaddr *)&np->np_sockaddr, len);
 	if (ret < 0) {
diff --git a/include/net/ip.h b/include/net/ip.h
index 2fc52e26fa88b..1e2feca8630d0 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -766,5 +766,6 @@ static inline bool inetdev_valid_mtu(unsigned int mtu)
 }
 
 void ip_sock_set_tos(struct sock *sk, int val);
+void ip_sock_set_freebind(struct sock *sk, bool val);
 
 #endif	/* _IP_H */
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 1733ac78c21aa..0c40887a817f8 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -581,6 +581,14 @@ void ip_sock_set_tos(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(ip_sock_set_tos);
 
+void ip_sock_set_freebind(struct sock *sk, bool val)
+{
+	lock_sock(sk);
+	inet_sk(sk)->freebind = val;
+	release_sock(sk);
+}
+EXPORT_SYMBOL(ip_sock_set_freebind);
+
 /*
  *	Socket option code for IP. This is the end of the line after any
  *	TCP,UDP etc options on an IP socket.
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 20/33] ipv4: add ip_sock_set_recverr
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (18 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 19/33] ipv4: add ip_sock_set_freebind Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13 21:00   ` Joe Perches
  2020-05-13  6:26 ` [PATCH 21/33] ipv4: add ip_sock_set_mtu_discover Christoph Hellwig
                   ` (20 subsequent siblings)
  40 siblings, 1 reply; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the IP_RECVERR sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/net/ip.h         |  1 +
 net/ipv4/ip_sockglue.c   | 10 ++++++++++
 net/rxrpc/local_object.c |  8 +-------
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 1e2feca8630d0..7ab8140b54429 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -767,5 +767,6 @@ static inline bool inetdev_valid_mtu(unsigned int mtu)
 
 void ip_sock_set_tos(struct sock *sk, int val);
 void ip_sock_set_freebind(struct sock *sk, bool val);
+void ip_sock_set_recverr(struct sock *sk, bool val);
 
 #endif	/* _IP_H */
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 0c40887a817f8..9abecc3195520 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -589,6 +589,16 @@ void ip_sock_set_freebind(struct sock *sk, bool val)
 }
 EXPORT_SYMBOL(ip_sock_set_freebind);
 
+void ip_sock_set_recverr(struct sock *sk, bool val)
+{
+	lock_sock(sk);
+	inet_sk(sk)->recverr = val;
+	if (!val)
+		skb_queue_purge(&sk->sk_error_queue);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(ip_sock_set_recverr);
+
 /*
  *	Socket option code for IP. This is the end of the line after any
  *	TCP,UDP etc options on an IP socket.
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 562ea36c96b0f..1b87b8a9ff725 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -171,13 +171,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
 		/* Fall through */
 	case AF_INET:
 		/* we want to receive ICMP errors */
-		opt = 1;
-		ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR,
-					(char *) &opt, sizeof(opt));
-		if (ret < 0) {
-			_debug("setsockopt failed");
-			goto error;
-		}
+		ip_sock_set_recverr(local->socket->sk, true);
 
 		/* we want to set the don't fragment bit */
 		opt = IP_PMTUDISC_DO;
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 21/33] ipv4: add ip_sock_set_mtu_discover
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (19 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 20/33] ipv4: add ip_sock_set_recverr Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 22/33] ipv6: add ip6_sock_set_v6only Christoph Hellwig
                   ` (19 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

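Add a helper to directly set the IP_MTU_DISCOVER sockopt from kernel
space without going through a fake uaccess.  Also add the matching
ip_sock_set_pktinfo helper for the IP_PKTINFO sockopt and use it in the
sunrpc UDP server socket setup.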

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/net/ip.h         |  2 ++
 net/ipv4/ip_sockglue.c   | 22 ++++++++++++++++++++++
 net/rxrpc/local_object.c |  8 +-------
 net/rxrpc/output.c       | 14 +++++---------
 net/sunrpc/svcsock.c     |  5 ++---
 5 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 7ab8140b54429..536eaffec59f0 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -768,5 +768,7 @@ static inline bool inetdev_valid_mtu(unsigned int mtu)
 void ip_sock_set_tos(struct sock *sk, int val);
 void ip_sock_set_freebind(struct sock *sk, bool val);
 void ip_sock_set_recverr(struct sock *sk, bool val);
+int ip_sock_set_mtu_discover(struct sock *sk, int val);
+void ip_sock_set_pktinfo(struct sock *sk, bool val);
 
 #endif	/* _IP_H */
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 9abecc3195520..df6ce0a7b0e3d 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -599,6 +599,28 @@ void ip_sock_set_recverr(struct sock *sk, bool val)
 }
 EXPORT_SYMBOL(ip_sock_set_recverr);
 
+int ip_sock_set_mtu_discover(struct sock *sk, int val)
+{
+	if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
+		return -EINVAL;
+	lock_sock(sk);
+	inet_sk(sk)->pmtudisc = val;
+	release_sock(sk);
+	return 0;
+}
+EXPORT_SYMBOL(ip_sock_set_mtu_discover);
+
+void ip_sock_set_pktinfo(struct sock *sk, bool val)
+{
+	lock_sock(sk);
+	if (val)
+		inet_sk(sk)->cmsg_flags |= IP_CMSG_PKTINFO;
+	else
+		inet_sk(sk)->cmsg_flags &= ~IP_CMSG_PKTINFO;
+	release_sock(sk);
+}
+EXPORT_SYMBOL(ip_sock_set_pktinfo);
+
 /*
  *	Socket option code for IP. This is the end of the line after any
  *	TCP,UDP etc options on an IP socket.
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 1b87b8a9ff725..20236ddecd2ef 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -174,13 +174,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
 		ip_sock_set_recverr(local->socket->sk, true);
 
 		/* we want to set the don't fragment bit */
-		opt = IP_PMTUDISC_DO;
-		ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER,
-					(char *) &opt, sizeof(opt));
-		if (ret < 0) {
-			_debug("setsockopt failed");
-			goto error;
-		}
+		ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DO);
 
 		/* We want receive timestamps. */
 		sock_set_timestamps(local->socket->sk, true, false, true);
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 90e263c6aa69e..ad0234e1e1713 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -321,7 +321,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
 	struct kvec iov[2];
 	rxrpc_serial_t serial;
 	size_t len;
-	int ret, opt;
+	int ret;
 
 	_enter(",{%d}", skb->len);
 
@@ -476,18 +476,14 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
 	switch (conn->params.local->srx.transport.family) {
 	case AF_INET6:
 	case AF_INET:
-		opt = IP_PMTUDISC_DONT;
-		kernel_setsockopt(conn->params.local->socket,
-				  SOL_IP, IP_MTU_DISCOVER,
-				  (char *)&opt, sizeof(opt));
+		ip_sock_set_mtu_discover(conn->params.local->socket->sk,
+				IP_PMTUDISC_DONT);
 		ret = kernel_sendmsg(conn->params.local->socket, &msg,
 				     iov, 2, len);
 		conn->params.peer->last_tx_at = ktime_get_seconds();
 
-		opt = IP_PMTUDISC_DO;
-		kernel_setsockopt(conn->params.local->socket,
-				  SOL_IP, IP_MTU_DISCOVER,
-				  (char *)&opt, sizeof(opt));
+		ip_sock_set_mtu_discover(conn->params.local->socket->sk,
+				IP_PMTUDISC_DO);
 		break;
 
 	default:
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 0f6b78d0e6170..7a4f01c79e0f1 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -616,9 +616,8 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
 	/* make sure we get destination address info */
 	switch (svsk->sk_sk->sk_family) {
 	case AF_INET:
-		level = SOL_IP;
-		optname = IP_PKTINFO;
-		break;
+		ip_sock_set_pktinfo(svsk->sk_sock->sk, true);
+		return;
 	case AF_INET6:
 		level = SOL_IPV6;
 		optname = IPV6_RECVPKTINFO;
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 22/33] ipv6: add ip6_sock_set_v6only
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (20 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 21/33] ipv4: add ip_sock_set_mtu_discover Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 23/33] ipv6: add ip6_sock_set_recverr Christoph Hellwig
                   ` (18 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the IPV6_V6ONLY sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/net/ipv6.h        |  3 +++
 net/ipv6/ip6_udp_tunnel.c |  5 +----
 net/ipv6/ipv6_sockglue.c  | 11 +++++++++++
 net/sunrpc/svcsock.c      |  8 ++------
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 955badd1e8ffc..e24b59201a00d 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1174,4 +1174,7 @@ int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
 			  const struct in6_addr *addr, unsigned int mode);
 int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
+
+int ip6_sock_set_v6only(struct sock *sk, bool val);
+
 #endif /* _NET_IPV6_H */
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 6523609516d25..bc4ee5cb14c8b 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -25,10 +25,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
 		goto error;
 
 	if (cfg->ipv6_v6only) {
-		int val = 1;
-
-		err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
-					(char *) &val, sizeof(val));
+		err = ip6_sock_set_v6only(sock->sk, true);
 		if (err < 0)
 			goto error;
 	}
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 18d05403d3b52..f26224bb3e098 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -136,6 +136,17 @@ static bool setsockopt_needs_rtnl(int optname)
 	return false;
 }
 
+int ip6_sock_set_v6only(struct sock *sk, bool val)
+{
+	if (inet_sk(sk)->inet_num)
+		return -EINVAL;
+	lock_sock(sk);
+	sk->sk_ipv6only = val;
+	release_sock(sk);
+	return 0;
+}
+EXPORT_SYMBOL(ip6_sock_set_v6only);
+
 static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		    char __user *optval, unsigned int optlen)
 {
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 7a4f01c79e0f1..7fa7fedec3c5a 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1328,7 +1328,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
 	struct sockaddr *newsin = (struct sockaddr *)&addr;
 	int		newlen;
 	int		family;
-	int		val;
 	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
 
 	dprintk("svc: svc_create_socket(%s, %d, %s)\n",
@@ -1364,11 +1363,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
 	 * getting requests from IPv4 remotes.  Those should
 	 * be shunted to a PF_INET listener via rpcbind.
 	 */
-	val = 1;
-	if (family == PF_INET6)
-		kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
-					(char *)&val, sizeof(val));
-
+	if (family == PF_INET6 && IS_REACHABLE(CONFIG_IPV6))
+		ip6_sock_set_v6only(sock->sk, true);
 	if (type == SOCK_STREAM)
 		sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */
 	error = kernel_bind(sock, sin, len);
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 23/33] ipv6: add ip6_sock_set_recverr
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (21 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 22/33] ipv6: add ip6_sock_set_v6only Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 24/33] ipv6: add ip6_sock_set_addr_preferences Christoph Hellwig
                   ` (17 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the IPV6_RECVERR sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/net/ipv6.h       |  1 +
 net/ipv6/ipv6_sockglue.c | 10 ++++++++++
 net/rxrpc/local_object.c | 10 ++--------
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index e24b59201a00d..69bc1651aaef8 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1176,5 +1176,6 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
 
 int ip6_sock_set_v6only(struct sock *sk, bool val);
+void ip6_sock_set_recverr(struct sock *sk, bool val);
 
 #endif /* _NET_IPV6_H */
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index f26224bb3e098..3c67626b6f5a9 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -147,6 +147,16 @@ int ip6_sock_set_v6only(struct sock *sk, bool val)
 }
 EXPORT_SYMBOL(ip6_sock_set_v6only);
 
+void ip6_sock_set_recverr(struct sock *sk, bool val)
+{
+	lock_sock(sk);
+	inet6_sk(sk)->recverr = val;
+	if (!val)
+		skb_queue_purge(&sk->sk_error_queue);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(ip6_sock_set_recverr);
+
 static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		    char __user *optval, unsigned int optlen)
 {
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 20236ddecd2ef..5e356a63aa791 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -107,7 +107,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet,
 static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
 {
 	struct sock *usk;
-	int ret, opt;
+	int ret;
 
 	_enter("%p{%d,%d}",
 	       local, local->srx.transport_type, local->srx.transport.family);
@@ -157,13 +157,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
 	switch (local->srx.transport.family) {
 	case AF_INET6:
 		/* we want to receive ICMPv6 errors */
-		opt = 1;
-		ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR,
-					(char *) &opt, sizeof(opt));
-		if (ret < 0) {
-			_debug("setsockopt failed");
-			goto error;
-		}
+		ip6_sock_set_recverr(local->socket->sk, true);
 
 		/* Fall through and set IPv4 options too otherwise we don't get
 		 * errors from IPv4 packets sent through the IPv6 socket.
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 24/33] ipv6: add ip6_sock_set_addr_preferences
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (22 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 23/33] ipv6: add ip6_sock_set_recverr Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 25/33] ipv6: add ip6_sock_set_recvpktinfo Christoph Hellwig
                   ` (16 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the IPV6_ADDR_PREFERENCES sockopt from kernel
space without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/net/ipv6.h       |   1 +
 net/ipv6/ipv6_sockglue.c | 127 +++++++++++++++++++++------------------
 net/sunrpc/xprtsock.c    |   8 ++-
 3 files changed, 75 insertions(+), 61 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 69bc1651aaef8..04b2bc1935054 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1177,5 +1177,6 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
 
 int ip6_sock_set_v6only(struct sock *sk, bool val);
 void ip6_sock_set_recverr(struct sock *sk, bool val);
+int ip6_sock_set_addr_preferences(struct sock *sk, bool val);
 
 #endif /* _NET_IPV6_H */
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 3c67626b6f5a9..c23d42e809d7e 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -157,6 +157,74 @@ void ip6_sock_set_recverr(struct sock *sk, bool val)
 }
 EXPORT_SYMBOL(ip6_sock_set_recverr);
 
+static int __ip6_sock_set_addr_preferences(struct sock *sk, int val)
+{
+	unsigned int pref = 0;
+	unsigned int prefmask = ~0;
+
+	/* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */
+	switch (val & (IPV6_PREFER_SRC_PUBLIC |
+		       IPV6_PREFER_SRC_TMP |
+		       IPV6_PREFER_SRC_PUBTMP_DEFAULT)) {
+	case IPV6_PREFER_SRC_PUBLIC:
+		pref |= IPV6_PREFER_SRC_PUBLIC;
+		prefmask &= ~(IPV6_PREFER_SRC_PUBLIC |
+			      IPV6_PREFER_SRC_TMP);
+		break;
+	case IPV6_PREFER_SRC_TMP:
+		pref |= IPV6_PREFER_SRC_TMP;
+		prefmask &= ~(IPV6_PREFER_SRC_PUBLIC |
+			      IPV6_PREFER_SRC_TMP);
+		break;
+	case IPV6_PREFER_SRC_PUBTMP_DEFAULT:
+		prefmask &= ~(IPV6_PREFER_SRC_PUBLIC |
+			      IPV6_PREFER_SRC_TMP);
+		break;
+	case 0:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* check HOME/COA conflicts */
+	switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) {
+	case IPV6_PREFER_SRC_HOME:
+		prefmask &= ~IPV6_PREFER_SRC_COA;
+		break;
+	case IPV6_PREFER_SRC_COA:
+		pref |= IPV6_PREFER_SRC_COA;
+		break;
+	case 0:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* check CGA/NONCGA conflicts */
+	switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) {
+	case IPV6_PREFER_SRC_CGA:
+	case IPV6_PREFER_SRC_NONCGA:
+	case 0:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref;
+	return 0;
+}
+
+int ip6_sock_set_addr_preferences(struct sock *sk, bool val)
+{
+	int ret;
+
+	lock_sock(sk);
+	ret = __ip6_sock_set_addr_preferences(sk, val);
+	release_sock(sk);
+	return ret;
+}
+EXPORT_SYMBOL(ip6_sock_set_addr_preferences);
+
 static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		    char __user *optval, unsigned int optlen)
 {
@@ -859,67 +927,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_ADDR_PREFERENCES:
-	    {
-		unsigned int pref = 0;
-		unsigned int prefmask = ~0;
-
 		if (optlen < sizeof(int))
 			goto e_inval;
-
-		retv = -EINVAL;
-
-		/* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */
-		switch (val & (IPV6_PREFER_SRC_PUBLIC|
-			       IPV6_PREFER_SRC_TMP|
-			       IPV6_PREFER_SRC_PUBTMP_DEFAULT)) {
-		case IPV6_PREFER_SRC_PUBLIC:
-			pref |= IPV6_PREFER_SRC_PUBLIC;
-			break;
-		case IPV6_PREFER_SRC_TMP:
-			pref |= IPV6_PREFER_SRC_TMP;
-			break;
-		case IPV6_PREFER_SRC_PUBTMP_DEFAULT:
-			break;
-		case 0:
-			goto pref_skip_pubtmp;
-		default:
-			goto e_inval;
-		}
-
-		prefmask &= ~(IPV6_PREFER_SRC_PUBLIC|
-			      IPV6_PREFER_SRC_TMP);
-pref_skip_pubtmp:
-
-		/* check HOME/COA conflicts */
-		switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) {
-		case IPV6_PREFER_SRC_HOME:
-			break;
-		case IPV6_PREFER_SRC_COA:
-			pref |= IPV6_PREFER_SRC_COA;
-		case 0:
-			goto pref_skip_coa;
-		default:
-			goto e_inval;
-		}
-
-		prefmask &= ~IPV6_PREFER_SRC_COA;
-pref_skip_coa:
-
-		/* check CGA/NONCGA conflicts */
-		switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) {
-		case IPV6_PREFER_SRC_CGA:
-		case IPV6_PREFER_SRC_NONCGA:
-		case 0:
-			break;
-		default:
-			goto e_inval;
-		}
-
-		np->srcprefs = (np->srcprefs & prefmask) | pref;
-		retv = 0;
-
+		retv = __ip6_sock_set_addr_preferences(sk, val);
 		break;
-	    }
 	case IPV6_MINHOPCOUNT:
 		if (optlen < sizeof(int))
 			goto e_inval;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 88aa198456858..7aaf2baf0c393 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2150,7 +2150,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 
 	if (!transport->inet) {
 		struct sock *sk = sock->sk;
-		unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC;
 
 		/* Avoid temporary address, they are bad for long-lived
 		 * connections such as NFS mounts.
@@ -2159,8 +2158,11 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		 *    knowledge about the normal duration of connections,
 		 *    MAY override this as appropriate.
 		 */
-		kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES,
-				(char *)&addr_pref, sizeof(addr_pref));
+		if (xs_addr(xprt)->sa_family == PF_INET6 &&
+		    IS_REACHABLE(CONFIG_IPV6)) {
+			ip6_sock_set_addr_preferences(sk,
+				IPV6_PREFER_SRC_PUBLIC);
+		}
 
 		xs_tcp_set_socket_timeouts(xprt, sock);
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 25/33] ipv6: add ip6_sock_set_recvpktinfo
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (23 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 24/33] ipv6: add ip6_sock_set_addr_preferences Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 26/33] sctp: lift copying in addrs into sctp_setsockopt Christoph Hellwig
                   ` (15 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the IPV6_RECVPKTINFO sockopt from kernel
space without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/net/ipv6.h       |  1 +
 net/ipv6/ipv6_sockglue.c |  8 ++++++++
 net/sunrpc/svcsock.c     | 11 +++--------
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 04b2bc1935054..170872bc4e960 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1178,5 +1178,6 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
 int ip6_sock_set_v6only(struct sock *sk, bool val);
 void ip6_sock_set_recverr(struct sock *sk, bool val);
 int ip6_sock_set_addr_preferences(struct sock *sk, bool val);
+void ip6_sock_set_recvpktinfo(struct sock *sk, bool val);
 
 #endif /* _NET_IPV6_H */
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index c23d42e809d7e..d60adb018d71c 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -225,6 +225,14 @@ int ip6_sock_set_addr_preferences(struct sock *sk, bool val)
 }
 EXPORT_SYMBOL(ip6_sock_set_addr_preferences);
 
+void ip6_sock_set_recvpktinfo(struct sock *sk, bool val)
+{
+	lock_sock(sk);
+	inet6_sk(sk)->rxopt.bits.rxinfo = val;
+	release_sock(sk);
+}
+EXPORT_SYMBOL(ip6_sock_set_recvpktinfo);
+
 static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		    char __user *optval, unsigned int optlen)
 {
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 7fa7fedec3c5a..7cf8389b6f46f 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -595,8 +595,6 @@ static struct svc_xprt_class svc_udp_class = {
 
 static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
 {
-	int err, level, optname, one = 1;
-
 	svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class,
 		      &svsk->sk_xprt, serv);
 	clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
@@ -617,17 +615,14 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
 	switch (svsk->sk_sk->sk_family) {
 	case AF_INET:
 		ip_sock_set_pktinfo(svsk->sk_sock->sk, true);
-		return;
+		break;
 	case AF_INET6:
-		level = SOL_IPV6;
-		optname = IPV6_RECVPKTINFO;
+		if (IS_REACHABLE(CONFIG_IPV6))
+			ip6_sock_set_recvpktinfo(svsk->sk_sock->sk, true);
 		break;
 	default:
 		BUG();
 	}
-	err = kernel_setsockopt(svsk->sk_sock, level, optname,
-					(char *)&one, sizeof(one));
-	dprintk("svc: kernel_setsockopt returned %d\n", err);
 }
 
 /*
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 26/33] sctp: lift copying in addrs into sctp_setsockopt
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (24 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 25/33] ipv6: add ip6_sock_set_recvpktinfo Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 27/33] sctp: export sctp_setsockopt_bindx Christoph Hellwig
                   ` (14 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Prepare for additional kernel-space callers of sctp_setsockopt_bindx.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 net/sctp/socket.c | 71 ++++++++++++++++++-----------------------------
 1 file changed, 27 insertions(+), 44 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 827a9903ee288..1c96b52c4aa28 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -972,18 +972,16 @@ int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw)
  * it.
  *
  * sk        The sk of the socket
- * addrs     The pointer to the addresses in user land
+ * addrs     The pointer to the addresses
  * addrssize Size of the addrs buffer
  * op        Operation to perform (add or remove, see the flags of
  *           sctp_bindx)
  *
  * Returns 0 if ok, <0 errno code on error.
  */
-static int sctp_setsockopt_bindx(struct sock *sk,
-				 struct sockaddr __user *addrs,
+static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *kaddrs,
 				 int addrs_size, int op)
 {
-	struct sockaddr *kaddrs;
 	int err;
 	int addrcnt = 0;
 	int walk_size = 0;
@@ -991,23 +989,13 @@ static int sctp_setsockopt_bindx(struct sock *sk,
 	void *addr_buf;
 	struct sctp_af *af;
 
-	pr_debug("%s: sk:%p addrs:%p addrs_size:%d opt:%d\n",
-		 __func__, sk, addrs, addrs_size, op);
-
-	if (unlikely(addrs_size <= 0))
-		return -EINVAL;
+	pr_debug("%s: sk:%p kaddrs:%p addrs_size:%d opt:%d\n",
+		 __func__, sk, kaddrs, addrs_size, op);
 
-	kaddrs = memdup_user(addrs, addrs_size);
-	if (IS_ERR(kaddrs))
-		return PTR_ERR(kaddrs);
-
-	/* Walk through the addrs buffer and count the number of addresses. */
 	addr_buf = kaddrs;
 	while (walk_size < addrs_size) {
-		if (walk_size + sizeof(sa_family_t) > addrs_size) {
-			kfree(kaddrs);
+		if (walk_size + sizeof(sa_family_t) > addrs_size)
 			return -EINVAL;
-		}
 
 		sa_addr = addr_buf;
 		af = sctp_get_af_specific(sa_addr->sa_family);
@@ -1015,10 +1003,8 @@ static int sctp_setsockopt_bindx(struct sock *sk,
 		/* If the address family is not supported or if this address
 		 * causes the address buffer to overflow return EINVAL.
 		 */
-		if (!af || (walk_size + af->sockaddr_len) > addrs_size) {
-			kfree(kaddrs);
+		if (!af || (walk_size + af->sockaddr_len) > addrs_size)
 			return -EINVAL;
-		}
 		addrcnt++;
 		addr_buf += af->sockaddr_len;
 		walk_size += af->sockaddr_len;
@@ -1032,29 +1018,19 @@ static int sctp_setsockopt_bindx(struct sock *sk,
 						 (struct sockaddr *)kaddrs,
 						 addrs_size);
 		if (err)
-			goto out;
+			return err;
 		err = sctp_bindx_add(sk, kaddrs, addrcnt);
 		if (err)
-			goto out;
-		err = sctp_send_asconf_add_ip(sk, kaddrs, addrcnt);
-		break;
-
+			return err;
+		return sctp_send_asconf_add_ip(sk, kaddrs, addrcnt);
 	case SCTP_BINDX_REM_ADDR:
 		err = sctp_bindx_rem(sk, kaddrs, addrcnt);
 		if (err)
-			goto out;
-		err = sctp_send_asconf_del_ip(sk, kaddrs, addrcnt);
-		break;
-
+			return err;
+		return sctp_send_asconf_del_ip(sk, kaddrs, addrcnt);
 	default:
-		err = -EINVAL;
-		break;
+		return -EINVAL;
 	}
-
-out:
-	kfree(kaddrs);
-
-	return err;
 }
 
 static int sctp_connect_new_asoc(struct sctp_endpoint *ep,
@@ -4670,6 +4646,7 @@ static int sctp_setsockopt_pf_expose(struct sock *sk,
 static int sctp_setsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, unsigned int optlen)
 {
+	struct sockaddr *kaddrs;
 	int retval = 0;
 
 	pr_debug("%s: sk:%p, optname:%d\n", __func__, sk, optname);
@@ -4682,30 +4659,37 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	 */
 	if (level != SOL_SCTP) {
 		struct sctp_af *af = sctp_sk(sk)->pf->af;
-		retval = af->setsockopt(sk, level, optname, optval, optlen);
-		goto out_nounlock;
+		return af->setsockopt(sk, level, optname, optval, optlen);
 	}
 
+	if (unlikely(optlen <= 0))
+		return -EINVAL;
+
+	kaddrs = memdup_user(optval, optlen);
+	if (IS_ERR(kaddrs))
+		return PTR_ERR(kaddrs);
+
+	/* Walk through the addrs buffer and count the number of addresses. */
+
 	lock_sock(sk);
 
 	switch (optname) {
 	case SCTP_SOCKOPT_BINDX_ADD:
 		/* 'optlen' is the size of the addresses buffer. */
-		retval = sctp_setsockopt_bindx(sk, (struct sockaddr __user *)optval,
+		retval = sctp_setsockopt_bindx(sk, (struct sockaddr *)optval,
 					       optlen, SCTP_BINDX_ADD_ADDR);
 		break;
 
 	case SCTP_SOCKOPT_BINDX_REM:
 		/* 'optlen' is the size of the addresses buffer. */
-		retval = sctp_setsockopt_bindx(sk, (struct sockaddr __user *)optval,
+		retval = sctp_setsockopt_bindx(sk, (struct sockaddr *)optval,
 					       optlen, SCTP_BINDX_REM_ADDR);
 		break;
 
 	case SCTP_SOCKOPT_CONNECTX_OLD:
 		/* 'optlen' is the size of the addresses buffer. */
 		retval = sctp_setsockopt_connectx_old(sk,
-					    (struct sockaddr __user *)optval,
-					    optlen);
+					    (struct sockaddr *)optval, optlen);
 		break;
 
 	case SCTP_SOCKOPT_CONNECTX:
@@ -4871,8 +4855,7 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	}
 
 	release_sock(sk);
-
-out_nounlock:
+	kfree(kaddrs);
 	return retval;
 }
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 27/33] sctp: export sctp_setsockopt_bindx
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (25 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 26/33] sctp: lift copying in addrs into sctp_setsockopt Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13 18:00   ` Marcelo Ricardo Leitner
  2020-05-13  6:26 ` [PATCH 28/33] sctp: add sctp_sock_set_nodelay Christoph Hellwig
                   ` (13 subsequent siblings)
  40 siblings, 1 reply; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Export sctp_setsockopt_bindx and call it directly from dlm instead of
going through kernel_setsockopt.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/dlm/lowcomms.c       | 13 ++++++++-----
 include/net/sctp/sctp.h |  3 +++
 net/sctp/socket.c       |  5 +++--
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index b722a09a7ca05..e4939d770df53 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1005,14 +1005,17 @@ static int sctp_bind_addrs(struct connection *con, uint16_t port)
 		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
 		make_sockaddr(&localaddr, port, &addr_len);
 
-		if (!i)
+		if (!i) {
 			result = kernel_bind(con->sock,
 					     (struct sockaddr *)&localaddr,
 					     addr_len);
-		else
-			result = kernel_setsockopt(con->sock, SOL_SCTP,
-						   SCTP_SOCKOPT_BINDX_ADD,
-						   (char *)&localaddr, addr_len);
+		} else {
+			lock_sock(con->sock->sk);
+			result = sctp_setsockopt_bindx(con->sock->sk,
+					(struct sockaddr *)&localaddr, addr_len,
+					SCTP_BINDX_ADD_ADDR);
+			release_sock(con->sock->sk);
+		}
 
 		if (result < 0) {
 			log_print("Can't bind to %d addr number %d, %d.\n",
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 3ab5c6bbb90bd..f702b14d768ba 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -615,4 +615,7 @@ static inline bool sctp_newsk_ready(const struct sock *sk)
 	return sock_flag(sk, SOCK_DEAD) || sk->sk_socket;
 }
 
+int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *kaddrs,
+		int addrs_size, int op);
+
 #endif /* __net_sctp_h__ */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 1c96b52c4aa28..30c981d9f6158 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -979,8 +979,8 @@ int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw)
  *
  * Returns 0 if ok, <0 errno code on error.
  */
-static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *kaddrs,
-				 int addrs_size, int op)
+int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *kaddrs,
+		int addrs_size, int op)
 {
 	int err;
 	int addrcnt = 0;
@@ -1032,6 +1032,7 @@ static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *kaddrs,
 		return -EINVAL;
 	}
 }
+EXPORT_SYMBOL(sctp_setsockopt_bindx);
 
 static int sctp_connect_new_asoc(struct sctp_endpoint *ep,
 				 const union sctp_addr *daddr,
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 28/33] sctp: add sctp_sock_set_nodelay
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (26 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 27/33] sctp: export sctp_setsockopt_bindx Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 29/33] rxrpc_sock_set_min_security_level Christoph Hellwig
                   ` (12 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the SCTP_NODELAY sockopt from kernel space
without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/dlm/lowcomms.c       | 10 ++--------
 include/net/sctp/sctp.h |  1 +
 net/sctp/socket.c       |  8 ++++++++
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index e4939d770df53..6fa45365666a8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1034,7 +1034,6 @@ static int sctp_bind_addrs(struct connection *con, uint16_t port)
 static void sctp_connect_to_sock(struct connection *con)
 {
 	struct sockaddr_storage daddr;
-	int one = 1;
 	int result;
 	int addr_len;
 	struct socket *sock;
@@ -1081,8 +1080,7 @@ static void sctp_connect_to_sock(struct connection *con)
 	log_print("connecting to %d", con->nodeid);
 
 	/* Turn off Nagle's algorithm */
-	kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
-			  sizeof(one));
+	sctp_sock_set_nodelay(sock->sk, true);
 
 	/*
 	 * Make sock->ops->connect() function return in specified time,
@@ -1296,7 +1294,6 @@ static int sctp_listen_for_all(void)
 	struct socket *sock = NULL;
 	int result = -EINVAL;
 	struct connection *con = nodeid2con(0, GFP_NOFS);
-	int one = 1;
 
 	if (!con)
 		return -ENOMEM;
@@ -1311,10 +1308,7 @@ static int sctp_listen_for_all(void)
 	}
 
 	sock_set_rcvbuf(sock->sk, NEEDED_RMEM);
-	result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
-				   sizeof(one));
-	if (result < 0)
-		log_print("Could not set SCTP NODELAY error %d\n", result);
+	sctp_sock_set_nodelay(sock->sk, true);
 
 	write_lock_bh(&sock->sk->sk_callback_lock);
 	/* Init con struct */
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index f702b14d768ba..b505fa082f254 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -617,5 +617,6 @@ static inline bool sctp_newsk_ready(const struct sock *sk)
 
 int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *kaddrs,
 		int addrs_size, int op);
+void sctp_sock_set_nodelay(struct sock *sk, bool val);
 
 #endif /* __net_sctp_h__ */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 30c981d9f6158..64c395f7a86d5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3066,6 +3066,14 @@ static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval,
 	return 0;
 }
 
+void sctp_sock_set_nodelay(struct sock *sk, bool val)
+{
+	lock_sock(sk);
+	sctp_sk(sk)->nodelay = val;
+	release_sock(sk);
+}
+EXPORT_SYMBOL(sctp_sock_set_nodelay);
+
 /*
  *
  * 7.1.1 SCTP_RTOINFO
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 29/33] rxrpc_sock_set_min_security_level
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (27 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 28/33] sctp: add sctp_sock_set_nodelay Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 30/33] tipc: call tsk_set_importance from tipc_topsrv_create_listener Christoph Hellwig
                   ` (11 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly set the RXRPC_MIN_SECURITY_LEVEL sockopt from
kernel space without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/afs/rxrpc.c         |  6 ++----
 include/net/af_rxrpc.h |  2 ++
 net/rxrpc/af_rxrpc.c   | 13 +++++++++++++
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 1ecc67da6c1a4..7dfcbd58da85c 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -37,7 +37,6 @@ int afs_open_socket(struct afs_net *net)
 {
 	struct sockaddr_rxrpc srx;
 	struct socket *socket;
-	unsigned int min_level;
 	int ret;
 
 	_enter("");
@@ -57,9 +56,8 @@ int afs_open_socket(struct afs_net *net)
 	srx.transport.sin6.sin6_family	= AF_INET6;
 	srx.transport.sin6.sin6_port	= htons(AFS_CM_PORT);
 
-	min_level = RXRPC_SECURITY_ENCRYPT;
-	ret = kernel_setsockopt(socket, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL,
-				(void *)&min_level, sizeof(min_level));
+	ret = rxrpc_sock_set_min_security_level(socket->sk,
+			RXRPC_SECURITY_ENCRYPT);
 	if (ret < 0)
 		goto error_2;
 
diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index 04e97bab6f28b..8d7b469453bda 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -72,4 +72,6 @@ bool rxrpc_kernel_call_is_complete(struct rxrpc_call *);
 void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *,
 			       unsigned long);
 
+int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val);
+
 #endif /* _NET_RXRPC_H */
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 15ee92d795815..394189b81849f 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -571,6 +571,19 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
 	return ret;
 }
 
+int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val)
+{
+	if (sk->sk_state != RXRPC_UNBOUND)
+		return -EISCONN;
+	if (val > RXRPC_SECURITY_MAX)
+		return -EINVAL;
+	lock_sock(sk);
+	rxrpc_sk(sk)->min_sec_level = val;
+	release_sock(sk);
+	return 0;
+}
+EXPORT_SYMBOL(rxrpc_sock_set_min_security_level);
+
 /*
  * set RxRPC socket options
  */
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 30/33] tipc: call tsk_set_importance from tipc_topsrv_create_listener
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (28 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 29/33] rxrpc_sock_set_min_security_level Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 31/33] net: remove kernel_setsockopt Christoph Hellwig
                   ` (10 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Avoid using kernel_setsockopt for the TIPC_IMPORTANCE option when we can
just use the internal helper.  The only change needed is to pass a struct
sock instead of tipc_sock, which is private to socket.c.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 net/tipc/socket.c | 18 +++++++++---------
 net/tipc/socket.h |  2 ++
 net/tipc/topsrv.c |  6 +++---
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 87466607097f1..f2e10fbfb03df 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -191,17 +191,17 @@ static int tsk_importance(struct tipc_sock *tsk)
 	return msg_importance(&tsk->phdr);
 }
 
-static int tsk_set_importance(struct tipc_sock *tsk, int imp)
+static struct tipc_sock *tipc_sk(const struct sock *sk)
 {
-	if (imp > TIPC_CRITICAL_IMPORTANCE)
-		return -EINVAL;
-	msg_set_importance(&tsk->phdr, (u32)imp);
-	return 0;
+	return container_of(sk, struct tipc_sock, sk);
 }
 
-static struct tipc_sock *tipc_sk(const struct sock *sk)
+int tsk_set_importance(struct sock *sk, int imp)
 {
-	return container_of(sk, struct tipc_sock, sk);
+	if (imp > TIPC_CRITICAL_IMPORTANCE)
+		return -EINVAL;
+	msg_set_importance(&tipc_sk(sk)->phdr, (u32)imp);
+	return 0;
 }
 
 static bool tsk_conn_cong(struct tipc_sock *tsk)
@@ -2661,7 +2661,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
 	/* Connect new socket to it's peer */
 	tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg));
 
-	tsk_set_importance(new_tsock, msg_importance(msg));
+	tsk_set_importance(new_sk, msg_importance(msg));
 	if (msg_named(msg)) {
 		new_tsock->conn_type = msg_nametype(msg);
 		new_tsock->conn_instance = msg_nameinst(msg);
@@ -3079,7 +3079,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
 
 	switch (opt) {
 	case TIPC_IMPORTANCE:
-		res = tsk_set_importance(tsk, value);
+		res = tsk_set_importance(sk, value);
 		break;
 	case TIPC_SRC_DROPPABLE:
 		if (sock->type != SOCK_STREAM)
diff --git a/net/tipc/socket.h b/net/tipc/socket.h
index 235b9679acee4..b11575afc66fe 100644
--- a/net/tipc/socket.h
+++ b/net/tipc/socket.h
@@ -75,4 +75,6 @@ u32 tipc_sock_get_portid(struct sock *sk);
 bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb);
 bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb);
 
+int tsk_set_importance(struct sock *sk, int imp);
+
 #endif
diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c
index 73dbed0c4b6b8..a0d50649f71c2 100644
--- a/net/tipc/topsrv.c
+++ b/net/tipc/topsrv.c
@@ -494,7 +494,6 @@ static void tipc_topsrv_listener_data_ready(struct sock *sk)
 
 static int tipc_topsrv_create_listener(struct tipc_topsrv *srv)
 {
-	int imp = TIPC_CRITICAL_IMPORTANCE;
 	struct socket *lsock = NULL;
 	struct sockaddr_tipc saddr;
 	struct sock *sk;
@@ -511,8 +510,9 @@ static int tipc_topsrv_create_listener(struct tipc_topsrv *srv)
 	sk->sk_user_data = srv;
 	write_unlock_bh(&sk->sk_callback_lock);
 
-	rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE,
-			       (char *)&imp, sizeof(imp));
+	lock_sock(sk);
+	rc = tsk_set_importance(sk, TIPC_CRITICAL_IMPORTANCE);
+	release_sock(sk);
 	if (rc < 0)
 		goto err;
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 31/33] net: remove kernel_setsockopt
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (29 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 30/33] tipc: call tsk_set_importance from tipc_topsrv_create_listener Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13  6:26 ` [PATCH 32/33] sctp: add sctp_sock_get_primary_addr Christoph Hellwig
                   ` (9 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

No users left.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/net.h |  2 --
 net/socket.c        | 31 -------------------------------
 2 files changed, 33 deletions(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index 6451425e828f5..ece7513326293 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -305,8 +305,6 @@ int kernel_getsockname(struct socket *sock, struct sockaddr *addr);
 int kernel_getpeername(struct socket *sock, struct sockaddr *addr);
 int kernel_getsockopt(struct socket *sock, int level, int optname, char *optval,
 		      int *optlen);
-int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval,
-		      unsigned int optlen);
 int kernel_sendpage(struct socket *sock, struct page *page, int offset,
 		    size_t size, int flags);
 int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset,
diff --git a/net/socket.c b/net/socket.c
index 1c9a7260a41de..f37c3ef508691 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -3749,37 +3749,6 @@ int kernel_getsockopt(struct socket *sock, int level, int optname,
 }
 EXPORT_SYMBOL(kernel_getsockopt);
 
-/**
- *	kernel_setsockopt - set a socket option (kernel space)
- *	@sock: socket
- *	@level: API level (SOL_SOCKET, ...)
- *	@optname: option tag
- *	@optval: option value
- *	@optlen: option length
- *
- *	Returns 0 or an error.
- */
-
-int kernel_setsockopt(struct socket *sock, int level, int optname,
-			char *optval, unsigned int optlen)
-{
-	mm_segment_t oldfs = get_fs();
-	char __user *uoptval;
-	int err;
-
-	uoptval = (char __user __force *) optval;
-
-	set_fs(KERNEL_DS);
-	if (level == SOL_SOCKET)
-		err = sock_setsockopt(sock, level, optname, uoptval, optlen);
-	else
-		err = sock->ops->setsockopt(sock, level, optname, uoptval,
-					    optlen);
-	set_fs(oldfs);
-	return err;
-}
-EXPORT_SYMBOL(kernel_setsockopt);
-
 /**
  *	kernel_sendpage - send a &page through a socket (kernel space)
  *	@sock: socket
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 32/33] sctp: add sctp_sock_get_primary_addr
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (30 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 31/33] net: remove kernel_setsockopt Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13 18:03   ` Marcelo Ricardo Leitner
  2020-05-13  6:26 ` [PATCH 33/33] net: remove kernel_getsockopt Christoph Hellwig
                   ` (8 subsequent siblings)
  40 siblings, 1 reply; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

Add a helper to directly get the SCTP_PRIMARY_ADDR sockopt from kernel
space without going through a fake uaccess.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/dlm/lowcomms.c       | 11 +++-----
 include/net/sctp/sctp.h |  1 +
 net/sctp/socket.c       | 57 +++++++++++++++++++++++++----------------
 3 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 6fa45365666a8..46d2d71b62c57 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -855,10 +855,9 @@ static int tcp_accept_from_sock(struct connection *con)
 static int sctp_accept_from_sock(struct connection *con)
 {
 	/* Check that the new node is in the lockspace */
-	struct sctp_prim prim;
+	struct sctp_prim prim = { };
 	int nodeid;
-	int prim_len, ret;
-	int addr_len;
+	int addr_len, ret;
 	struct connection *newcon;
 	struct connection *addcon;
 	struct socket *newsock;
@@ -876,11 +875,7 @@ static int sctp_accept_from_sock(struct connection *con)
 	if (ret < 0)
 		goto accept_err;
 
-	memset(&prim, 0, sizeof(struct sctp_prim));
-	prim_len = sizeof(struct sctp_prim);
-
-	ret = kernel_getsockopt(newsock, IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
-				(char *)&prim, &prim_len);
+	ret = sctp_sock_get_primary_addr(con->sock->sk, &prim);
 	if (ret < 0) {
 		log_print("getsockopt/sctp_primary_addr failed: %d", ret);
 		goto accept_err;
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index b505fa082f254..c98b1d14db853 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -618,5 +618,6 @@ static inline bool sctp_newsk_ready(const struct sock *sk)
 int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *kaddrs,
 		int addrs_size, int op);
 void sctp_sock_set_nodelay(struct sock *sk, bool val);
+int sctp_sock_get_primary_addr(struct sock *sk, struct sctp_prim *prim);
 
 #endif /* __net_sctp_h__ */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 64c395f7a86d5..39bf8090dbe1e 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6411,6 +6411,35 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
 	return err;
 }
 
+static int __sctp_sock_get_primary_addr(struct sock *sk, struct sctp_prim *prim)
+{
+	struct sctp_association *asoc;
+
+	asoc = sctp_id2assoc(sk, prim->ssp_assoc_id);
+	if (!asoc)
+		return -EINVAL;
+	if (!asoc->peer.primary_path)
+		return -ENOTCONN;
+
+	memcpy(&prim->ssp_addr, &asoc->peer.primary_path->ipaddr,
+		asoc->peer.primary_path->af_specific->sockaddr_len);
+
+	sctp_get_pf_specific(sk->sk_family)->addr_to_user(sctp_sk(sk),
+			(union sctp_addr *)&prim->ssp_addr);
+	return 0;
+}
+
+int sctp_sock_get_primary_addr(struct sock *sk, struct sctp_prim *prim)
+{
+	int ret;
+
+	lock_sock(sk);
+	ret = __sctp_sock_get_primary_addr(sk, prim);
+	release_sock(sk);
+	return ret;
+}
+EXPORT_SYMBOL(sctp_sock_get_primary_addr);
+
 /* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR)
  *
  * Requests that the local SCTP stack use the enclosed peer address as
@@ -6421,35 +6450,19 @@ static int sctp_getsockopt_primary_addr(struct sock *sk, int len,
 					char __user *optval, int __user *optlen)
 {
 	struct sctp_prim prim;
-	struct sctp_association *asoc;
-	struct sctp_sock *sp = sctp_sk(sk);
+	int ret;
 
 	if (len < sizeof(struct sctp_prim))
 		return -EINVAL;
-
-	len = sizeof(struct sctp_prim);
-
-	if (copy_from_user(&prim, optval, len))
+	if (copy_from_user(&prim, optval, sizeof(struct sctp_prim)))
 		return -EFAULT;
 
-	asoc = sctp_id2assoc(sk, prim.ssp_assoc_id);
-	if (!asoc)
-		return -EINVAL;
-
-	if (!asoc->peer.primary_path)
-		return -ENOTCONN;
-
-	memcpy(&prim.ssp_addr, &asoc->peer.primary_path->ipaddr,
-		asoc->peer.primary_path->af_specific->sockaddr_len);
-
-	sctp_get_pf_specific(sk->sk_family)->addr_to_user(sp,
-			(union sctp_addr *)&prim.ssp_addr);
+	ret = __sctp_sock_get_primary_addr(sk, &prim);
+	if (ret)
+		return ret;
 
-	if (put_user(len, optlen))
+	if (put_user(len, optlen) || copy_to_user(optval, &prim, len))
 		return -EFAULT;
-	if (copy_to_user(optval, &prim, len))
-		return -EFAULT;
-
 	return 0;
 }
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH 33/33] net: remove kernel_getsockopt
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (31 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 32/33] sctp: add sctp_sock_get_primary_addr Christoph Hellwig
@ 2020-05-13  6:26 ` Christoph Hellwig
  2020-05-13 13:13 ` [PATCH 29/33] rxrpc_sock_set_min_security_level David Howells
                   ` (7 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-13  6:26 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

No users left.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/net.h |  2 --
 net/socket.c        | 34 ----------------------------------
 2 files changed, 36 deletions(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index ece7513326293..e10f378194a59 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -303,8 +303,6 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
 		   int flags);
 int kernel_getsockname(struct socket *sock, struct sockaddr *addr);
 int kernel_getpeername(struct socket *sock, struct sockaddr *addr);
-int kernel_getsockopt(struct socket *sock, int level, int optname, char *optval,
-		      int *optlen);
 int kernel_sendpage(struct socket *sock, struct page *page, int offset,
 		    size_t size, int flags);
 int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset,
diff --git a/net/socket.c b/net/socket.c
index f37c3ef508691..49000f0d87f71 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -3715,40 +3715,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr)
 }
 EXPORT_SYMBOL(kernel_getpeername);
 
-/**
- *	kernel_getsockopt - get a socket option (kernel space)
- *	@sock: socket
- *	@level: API level (SOL_SOCKET, ...)
- *	@optname: option tag
- *	@optval: option value
- *	@optlen: option length
- *
- *	Assigns the option length to @optlen.
- *	Returns 0 or an error.
- */
-
-int kernel_getsockopt(struct socket *sock, int level, int optname,
-			char *optval, int *optlen)
-{
-	mm_segment_t oldfs = get_fs();
-	char __user *uoptval;
-	int __user *uoptlen;
-	int err;
-
-	uoptval = (char __user __force *) optval;
-	uoptlen = (int __user __force *) optlen;
-
-	set_fs(KERNEL_DS);
-	if (level == SOL_SOCKET)
-		err = sock_getsockopt(sock, level, optname, uoptval, uoptlen);
-	else
-		err = sock->ops->getsockopt(sock, level, optname, uoptval,
-					    uoptlen);
-	set_fs(oldfs);
-	return err;
-}
-EXPORT_SYMBOL(kernel_getsockopt);
-
 /**
  *	kernel_sendpage - send a &page through a socket (kernel space)
  *	@sock: socket
-- 
2.26.2


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 11/33] tcp: tcp_sock_set_nodelay
  2020-05-13  6:26 ` [PATCH 11/33] tcp: tcp_sock_set_nodelay Christoph Hellwig
@ 2020-05-13 12:51   ` Jason Gunthorpe
  0 siblings, 0 replies; 76+ messages in thread
From: Jason Gunthorpe @ 2020-05-13 12:51 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: David S. Miller, Jakub Kicinski, Eric Dumazet, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman,
	Marcelo Ricardo Leitner, Jon Maloy, Ying Xue, drbd-dev,
	linux-block, linux-kernel, linux-rdma, linux-nvme, target-devel,
	linux-afs, linux-cifs, cluster-devel, ocfs2-devel, netdev,
	linux-sctp, ceph-devel, rds-devel, linux-nfs

On Wed, May 13, 2020 at 08:26:26AM +0200, Christoph Hellwig wrote:
> Add a helper to directly set the TCP_NODELAY sockopt from kernel space
> without going through a fake uaccess.  Cleanup the callers to avoid
> pointless wrappers now that this is a simple function call.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/block/drbd/drbd_int.h             |  7 ----
>  drivers/block/drbd/drbd_main.c            |  2 +-
>  drivers/block/drbd/drbd_receiver.c        |  4 +--
>  drivers/infiniband/sw/siw/siw_cm.c        | 24 +++-----------
>  drivers/nvme/host/tcp.c                   |  9 +-----
>  drivers/nvme/target/tcp.c                 | 12 ++-----
>  drivers/target/iscsi/iscsi_target_login.c | 15 ++-------
>  fs/cifs/connect.c                         | 10 ++----
>  fs/dlm/lowcomms.c                         |  8 ++---
>  fs/ocfs2/cluster/tcp.c                    | 20 ++----------
>  include/linux/tcp.h                       |  1 +
>  net/ceph/messenger.c                      | 11 ++-----
>  net/ipv4/tcp.c                            | 39 +++++++++++++++--------
>  net/rds/tcp.c                             | 11 +------
>  net/rds/tcp.h                             |  1 -
>  net/rds/tcp_listen.c                      |  2 +-
>  16 files changed, 49 insertions(+), 127 deletions(-)

No problem with the siw change

Acked-by: Jason Gunthorpe <jgg@mellanox.com>

Jason

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 29/33] rxrpc_sock_set_min_security_level
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (32 preceding siblings ...)
  2020-05-13  6:26 ` [PATCH 33/33] net: remove kernel_getsockopt Christoph Hellwig
@ 2020-05-13 13:13 ` David Howells
  2020-05-14 10:29   ` Christoph Hellwig
  2020-05-15 15:13   ` David Howells
  2020-05-13 13:17 ` [PATCH 21/33] ipv4: add ip_sock_set_mtu_discover David Howells
                   ` (6 subsequent siblings)
  40 siblings, 2 replies; 76+ messages in thread
From: David Howells @ 2020-05-13 13:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: dhowells, David S. Miller, Jakub Kicinski,
	Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme, linux-sctp,
	target-devel, linux-afs, drbd-dev, linux-cifs, rds-devel,
	linux-rdma, cluster-devel, Alexey Kuznetsov, linux-block,
	ceph-devel, linux-nfs, Neil Horman, Hideaki YOSHIFUJI, netdev,
	Vlad Yasevich, linux-kernel, Jon Maloy, Ying Xue, ocfs2-devel

Christoph Hellwig <hch@lst.de> wrote:

> +int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val);
> +

Looks good - but you do need to add this to Documentation/networking/rxrpc.txt
also, thanks.

David


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 21/33] ipv4: add ip_sock_set_mtu_discover
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (33 preceding siblings ...)
  2020-05-13 13:13 ` [PATCH 29/33] rxrpc_sock_set_min_security_level David Howells
@ 2020-05-13 13:17 ` David Howells
  2020-05-14  6:26   ` Christoph Hellwig
  2020-05-15 15:15   ` David Howells
  2020-05-13 13:24 ` [PATCH 20/33] ipv4: add ip_sock_set_recverr David Howells
                   ` (5 subsequent siblings)
  40 siblings, 2 replies; 76+ messages in thread
From: David Howells @ 2020-05-13 13:17 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: dhowells, David S. Miller, Jakub Kicinski,
	Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme, linux-sctp,
	target-devel, linux-afs, drbd-dev, linux-cifs, rds-devel,
	linux-rdma, cluster-devel, Alexey Kuznetsov, linux-block,
	ceph-devel, linux-nfs, Neil Horman, Hideaki YOSHIFUJI, netdev,
	Vlad Yasevich, linux-kernel, Jon Maloy, Ying Xue, ocfs2-devel

Christoph Hellwig <hch@lst.de> wrote:

> +		ip_sock_set_mtu_discover(conn->params.local->socket->sk,
> +				IP_PMTUDISC_DONT);

Um... The socket in question could be an AF_INET6 socket, not an AF_INET
socket - I presume it will work in that case.  If so:

Reviewed-by: David Howells <dhowells@redhat.com> [rxrpc bits]


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 20/33] ipv4: add ip_sock_set_recverr
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (34 preceding siblings ...)
  2020-05-13 13:17 ` [PATCH 21/33] ipv4: add ip_sock_set_mtu_discover David Howells
@ 2020-05-13 13:24 ` David Howells
  2020-05-13 13:25 ` [PATCH 23/33] ipv6: add ip6_sock_set_recverr David Howells
                   ` (4 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: David Howells @ 2020-05-13 13:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: dhowells, David S. Miller, Jakub Kicinski,
	Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme, linux-sctp,
	target-devel, linux-afs, drbd-dev, linux-cifs, rds-devel,
	linux-rdma, cluster-devel, Alexey Kuznetsov, linux-block,
	ceph-devel, linux-nfs, Neil Horman, Hideaki YOSHIFUJI, netdev,
	Vlad Yasevich, linux-kernel, Jon Maloy, Ying Xue, ocfs2-devel

Christoph Hellwig <hch@lst.de> wrote:

> Add a helper to directly set the IP_RECVERR sockopt from kernel space
> without going through a fake uaccess.

It looks like if this is an AF_INET6 socket, it will just pass the message
straight through to AF_INET, so:

Reviewed-by: David Howells <dhowells@redhat.com>


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 23/33] ipv6: add ip6_sock_set_recverr
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (35 preceding siblings ...)
  2020-05-13 13:24 ` [PATCH 20/33] ipv4: add ip_sock_set_recverr David Howells
@ 2020-05-13 13:25 ` David Howells
  2020-05-13 13:27 ` [PATCH 06/33] net: add sock_set_timestamps David Howells
                   ` (3 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: David Howells @ 2020-05-13 13:25 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: dhowells, David S. Miller, Jakub Kicinski,
	Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme, linux-sctp,
	target-devel, linux-afs, drbd-dev, linux-cifs, rds-devel,
	linux-rdma, cluster-devel, Alexey Kuznetsov, linux-block,
	ceph-devel, linux-nfs, Neil Horman, Hideaki YOSHIFUJI, netdev,
	Vlad Yasevich, linux-kernel, Jon Maloy, Ying Xue, ocfs2-devel

Christoph Hellwig <hch@lst.de> wrote:

> Add a helper to directly set the IPV6_RECVERR sockopt from kernel space
> without going through a fake uaccess.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: David Howells <dhowells@redhat.com>


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 06/33] net: add sock_set_timestamps
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (36 preceding siblings ...)
  2020-05-13 13:25 ` [PATCH 23/33] ipv6: add ip6_sock_set_recverr David Howells
@ 2020-05-13 13:27 ` David Howells
  2020-05-13 17:38 ` remove kernel_setsockopt and kernel_getsockopt Joe Perches
                   ` (2 subsequent siblings)
  40 siblings, 0 replies; 76+ messages in thread
From: David Howells @ 2020-05-13 13:27 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: dhowells, David S. Miller, Jakub Kicinski,
	Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme, linux-sctp,
	target-devel, linux-afs, drbd-dev, linux-cifs, rds-devel,
	linux-rdma, cluster-devel, Alexey Kuznetsov, linux-block,
	ceph-devel, linux-nfs, Neil Horman, Hideaki YOSHIFUJI, netdev,
	Vlad Yasevich, linux-kernel, Jon Maloy, Ying Xue, ocfs2-devel

Christoph Hellwig <hch@lst.de> wrote:

> Add a helper to directly set the SO_TIMESTAMP* sockopts from kernel space
> without going through a fake uaccess.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: David Howells <dhowells@redhat.com>


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: remove kernel_setsockopt and kernel_getsockopt
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (37 preceding siblings ...)
  2020-05-13 13:27 ` [PATCH 06/33] net: add sock_set_timestamps David Howells
@ 2020-05-13 17:38 ` Joe Perches
  2020-05-14  6:27   ` Christoph Hellwig
  2020-05-14  8:29   ` David Laight
  2020-05-13 18:45 ` Sagi Grimberg
  2020-05-13 19:12 ` David Miller
  40 siblings, 2 replies; 76+ messages in thread
From: Joe Perches @ 2020-05-13 17:38 UTC (permalink / raw)
  To: Christoph Hellwig, David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

On Wed, 2020-05-13 at 08:26 +0200, Christoph Hellwig wrote:
> this series removes the kernel_setsockopt and kernel_getsockopt
> functions, and instead switches their users to small functions that
> implement setting (or in one case getting) a sockopt directly using
> a normal kernel function call with type safety and all the other
> benefits of not having a function call.
> 
> In some cases these functions seem pretty heavy handed as they do
> a lock_sock even for just setting a single variable, but this mirrors
> the real setsockopt implementation - counter to that a few kernel
> drivers just set the fields directly already.
> 
> Nevertheless the diffstat looks quite promising:
> 
>  42 files changed, 721 insertions(+), 799 deletions(-)

trivia:

It might be useful to show overall object size change.

More EXPORT_SYMBOL uses increase object size a little.

And not sure it matters much except it reduces overall object
size, but these patches remove (unnecessary) logging on error
and that could be mentioned in the cover letter too.

e.g.:

-       ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
-                       (char *)&sol, sizeof(sol));
-       if (ret) {
-               dev_err(nctrl->device,
-                       "failed to set SO_LINGER sock opt %d\n", ret);
-               goto err_sock;
-       }
+       sock_set_linger(queue->sock->sk, true, 0);




^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 27/33] sctp: export sctp_setsockopt_bindx
  2020-05-13  6:26 ` [PATCH 27/33] sctp: export sctp_setsockopt_bindx Christoph Hellwig
@ 2020-05-13 18:00   ` Marcelo Ricardo Leitner
  2020-05-14  6:28     ` Christoph Hellwig
                       ` (3 more replies)
  0 siblings, 4 replies; 76+ messages in thread
From: Marcelo Ricardo Leitner @ 2020-05-13 18:00 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: David S. Miller, Jakub Kicinski, Eric Dumazet, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman, Jon Maloy,
	Ying Xue, drbd-dev, linux-block, linux-kernel, linux-rdma,
	linux-nvme, target-devel, linux-afs, linux-cifs, cluster-devel,
	ocfs2-devel, netdev, linux-sctp, ceph-devel, rds-devel,
	linux-nfs

On Wed, May 13, 2020 at 08:26:42AM +0200, Christoph Hellwig wrote:
> And call it directly from dlm instead of going through kernel_setsockopt.

The advantage of using kernel_setsockopt here is that the sctp module will
only be loaded if dlm actually creates a SCTP socket.  With this
change, sctp will be loaded on setups that may not be actually using
it. It's quite a big module and might expose the system.

I'm okay with the SCTP changes, but I'll defer to DLM folks to whether
that's too bad or what for DLM.

> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/dlm/lowcomms.c       | 13 ++++++++-----
>  include/net/sctp/sctp.h |  3 +++
>  net/sctp/socket.c       |  5 +++--
>  3 files changed, 14 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
> index b722a09a7ca05..e4939d770df53 100644
> --- a/fs/dlm/lowcomms.c
> +++ b/fs/dlm/lowcomms.c
> @@ -1005,14 +1005,17 @@ static int sctp_bind_addrs(struct connection *con, uint16_t port)
>  		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
>  		make_sockaddr(&localaddr, port, &addr_len);
>  
> -		if (!i)
> +		if (!i) {
>  			result = kernel_bind(con->sock,
>  					     (struct sockaddr *)&localaddr,
>  					     addr_len);
> -		else
> -			result = kernel_setsockopt(con->sock, SOL_SCTP,
> -						   SCTP_SOCKOPT_BINDX_ADD,
> -						   (char *)&localaddr, addr_len);
> +		} else {
> +			lock_sock(con->sock->sk);
> +			result = sctp_setsockopt_bindx(con->sock->sk,
> +					(struct sockaddr *)&localaddr, addr_len,
> +					SCTP_BINDX_ADD_ADDR);
> +			release_sock(con->sock->sk);
> +		}
>  
>  		if (result < 0) {
>  			log_print("Can't bind to %d addr number %d, %d.\n",
> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> index 3ab5c6bbb90bd..f702b14d768ba 100644
> --- a/include/net/sctp/sctp.h
> +++ b/include/net/sctp/sctp.h
> @@ -615,4 +615,7 @@ static inline bool sctp_newsk_ready(const struct sock *sk)
>  	return sock_flag(sk, SOCK_DEAD) || sk->sk_socket;
>  }
>  
> +int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *kaddrs,
> +		int addrs_size, int op);
> +
>  #endif /* __net_sctp_h__ */
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 1c96b52c4aa28..30c981d9f6158 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -979,8 +979,8 @@ int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw)
>   *
>   * Returns 0 if ok, <0 errno code on error.
>   */
> -static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *kaddrs,
> -				 int addrs_size, int op)
> +int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *kaddrs,
> +		int addrs_size, int op)
>  {
>  	int err;
>  	int addrcnt = 0;
> @@ -1032,6 +1032,7 @@ static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *kaddrs,
>  		return -EINVAL;
>  	}
>  }
> +EXPORT_SYMBOL(sctp_setsockopt_bindx);
>  
>  static int sctp_connect_new_asoc(struct sctp_endpoint *ep,
>  				 const union sctp_addr *daddr,
> -- 
> 2.26.2
> 

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 32/33] sctp: add sctp_sock_get_primary_addr
  2020-05-13  6:26 ` [PATCH 32/33] sctp: add sctp_sock_get_primary_addr Christoph Hellwig
@ 2020-05-13 18:03   ` Marcelo Ricardo Leitner
  2020-05-14  9:51     ` David Laight
  0 siblings, 1 reply; 76+ messages in thread
From: Marcelo Ricardo Leitner @ 2020-05-13 18:03 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: David S. Miller, Jakub Kicinski, Eric Dumazet, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman, Jon Maloy,
	Ying Xue, drbd-dev, linux-block, linux-kernel, linux-rdma,
	linux-nvme, target-devel, linux-afs, linux-cifs, cluster-devel,
	ocfs2-devel, netdev, linux-sctp, ceph-devel, rds-devel,
	linux-nfs

On Wed, May 13, 2020 at 08:26:47AM +0200, Christoph Hellwig wrote:
> Add a helper to directly get the SCTP_PRIMARY_ADDR sockopt from kernel
> space without going through a fake uaccess.

Same comment as on the other dlm/sctp patch.

> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/dlm/lowcomms.c       | 11 +++-----
>  include/net/sctp/sctp.h |  1 +
>  net/sctp/socket.c       | 57 +++++++++++++++++++++++++----------------
>  3 files changed, 39 insertions(+), 30 deletions(-)
> 
> diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
> index 6fa45365666a8..46d2d71b62c57 100644
> --- a/fs/dlm/lowcomms.c
> +++ b/fs/dlm/lowcomms.c
> @@ -855,10 +855,9 @@ static int tcp_accept_from_sock(struct connection *con)
>  static int sctp_accept_from_sock(struct connection *con)
>  {
>  	/* Check that the new node is in the lockspace */
> -	struct sctp_prim prim;
> +	struct sctp_prim prim = { };
>  	int nodeid;
> -	int prim_len, ret;
> -	int addr_len;
> +	int addr_len, ret;
>  	struct connection *newcon;
>  	struct connection *addcon;
>  	struct socket *newsock;
> @@ -876,11 +875,7 @@ static int sctp_accept_from_sock(struct connection *con)
>  	if (ret < 0)
>  		goto accept_err;
>  
> -	memset(&prim, 0, sizeof(struct sctp_prim));
> -	prim_len = sizeof(struct sctp_prim);
> -
> -	ret = kernel_getsockopt(newsock, IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
> -				(char *)&prim, &prim_len);
> +	ret = sctp_sock_get_primary_addr(con->sock->sk, &prim);
>  	if (ret < 0) {
>  		log_print("getsockopt/sctp_primary_addr failed: %d", ret);
>  		goto accept_err;
> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> index b505fa082f254..c98b1d14db853 100644
> --- a/include/net/sctp/sctp.h
> +++ b/include/net/sctp/sctp.h
> @@ -618,5 +618,6 @@ static inline bool sctp_newsk_ready(const struct sock *sk)
>  int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *kaddrs,
>  		int addrs_size, int op);
>  void sctp_sock_set_nodelay(struct sock *sk, bool val);
> +int sctp_sock_get_primary_addr(struct sock *sk, struct sctp_prim *prim);
>  
>  #endif /* __net_sctp_h__ */
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 64c395f7a86d5..39bf8090dbe1e 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -6411,6 +6411,35 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
>  	return err;
>  }
>  
> +static int __sctp_sock_get_primary_addr(struct sock *sk, struct sctp_prim *prim)
> +{
> +	struct sctp_association *asoc;
> +
> +	asoc = sctp_id2assoc(sk, prim->ssp_assoc_id);
> +	if (!asoc)
> +		return -EINVAL;
> +	if (!asoc->peer.primary_path)
> +		return -ENOTCONN;
> +
> +	memcpy(&prim->ssp_addr, &asoc->peer.primary_path->ipaddr,
> +		asoc->peer.primary_path->af_specific->sockaddr_len);
> +
> +	sctp_get_pf_specific(sk->sk_family)->addr_to_user(sctp_sk(sk),
> +			(union sctp_addr *)&prim->ssp_addr);
> +	return 0;
> +}
> +
> +int sctp_sock_get_primary_addr(struct sock *sk, struct sctp_prim *prim)
> +{
> +	int ret;
> +
> +	lock_sock(sk);
> +	ret = __sctp_sock_get_primary_addr(sk, prim);
> +	release_sock(sk);
> +	return ret;
> +}
> +EXPORT_SYMBOL(sctp_sock_get_primary_addr);
> +
>  /* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR)
>   *
>   * Requests that the local SCTP stack use the enclosed peer address as
> @@ -6421,35 +6450,19 @@ static int sctp_getsockopt_primary_addr(struct sock *sk, int len,
>  					char __user *optval, int __user *optlen)
>  {
>  	struct sctp_prim prim;
> -	struct sctp_association *asoc;
> -	struct sctp_sock *sp = sctp_sk(sk);
> +	int ret;
>  
>  	if (len < sizeof(struct sctp_prim))
>  		return -EINVAL;
> -
> -	len = sizeof(struct sctp_prim);
> -
> -	if (copy_from_user(&prim, optval, len))
> +	if (copy_from_user(&prim, optval, sizeof(struct sctp_prim)))
>  		return -EFAULT;
>  
> -	asoc = sctp_id2assoc(sk, prim.ssp_assoc_id);
> -	if (!asoc)
> -		return -EINVAL;
> -
> -	if (!asoc->peer.primary_path)
> -		return -ENOTCONN;
> -
> -	memcpy(&prim.ssp_addr, &asoc->peer.primary_path->ipaddr,
> -		asoc->peer.primary_path->af_specific->sockaddr_len);
> -
> -	sctp_get_pf_specific(sk->sk_family)->addr_to_user(sp,
> -			(union sctp_addr *)&prim.ssp_addr);
> +	ret = __sctp_sock_get_primary_addr(sk, &prim);
> +	if (ret)
> +		return ret;
>  
> -	if (put_user(len, optlen))
> +	if (put_user(len, optlen) || copy_to_user(optval, &prim, len))
>  		return -EFAULT;
> -	if (copy_to_user(optval, &prim, len))
> -		return -EFAULT;
> -
>  	return 0;
>  }
>  
> -- 
> 2.26.2
> 

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: remove kernel_setsockopt and kernel_getsockopt
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (38 preceding siblings ...)
  2020-05-13 17:38 ` remove kernel_setsockopt and kernel_getsockopt Joe Perches
@ 2020-05-13 18:45 ` Sagi Grimberg
  2020-05-13 19:12 ` David Miller
  40 siblings, 0 replies; 76+ messages in thread
From: Sagi Grimberg @ 2020-05-13 18:45 UTC (permalink / raw)
  To: Christoph Hellwig, David S. Miller, Jakub Kicinski
  Cc: Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme, linux-sctp,
	target-devel, linux-afs, drbd-dev, linux-cifs, rds-devel,
	linux-rdma, cluster-devel, Alexey Kuznetsov, linux-block,
	ceph-devel, linux-nfs, Neil Horman, Hideaki YOSHIFUJI, netdev,
	Vlad Yasevich, linux-kernel, Jon Maloy, Ying Xue, ocfs2-devel


> Hi Dave,
> 
> this series removes the kernel_setsockopt and kernel_getsockopt
> functions, and instead switches their users to small functions that
> implement setting (or in one case getting) a sockopt directly using
> a normal kernel function call with type safety and all the other
> benefits of not having a function call.
> 
> In some cases these functions seem pretty heavy handed as they do
> a lock_sock even for just setting a single variable, but this mirrors
> the real setsockopt implementation - counter to that a few kernel
> drivers just set the fields directly already.
> 
> Nevertheless the diffstat looks quite promising:
> 
>   42 files changed, 721 insertions(+), 799 deletions(-)

For the nvme-tcp bits,

Acked-by: Sagi Grimberg <sagi@grimberg.me>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: remove kernel_setsockopt and kernel_getsockopt
  2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
                   ` (39 preceding siblings ...)
  2020-05-13 18:45 ` Sagi Grimberg
@ 2020-05-13 19:12 ` David Miller
  40 siblings, 0 replies; 76+ messages in thread
From: David Miller @ 2020-05-13 19:12 UTC (permalink / raw)
  To: hch
  Cc: kuba, edumazet, kuznet, yoshfuji, vyasevich, nhorman,
	marcelo.leitner, jmaloy, ying.xue, drbd-dev, linux-block,
	linux-kernel, linux-rdma, linux-nvme, target-devel, linux-afs,
	linux-cifs, cluster-devel, ocfs2-devel, netdev, linux-sctp,
	ceph-devel, rds-devel, linux-nfs

From: Christoph Hellwig <hch@lst.de>
Date: Wed, 13 May 2020 08:26:15 +0200

> Hi Dave,
> 
> this series removes the kernel_setsockopt and kernel_getsockopt
> functions, and instead switches their users to small functions that
> implement setting (or in one case getting) a sockopt directly using
> a normal kernel function call with type safety and all the other
> benefits of not having a function call.
> 
> In some cases these functions seem pretty heavy handed as they do
> a lock_sock even for just setting a single variable, but this mirrors
> the real setsockopt implementation - counter to that a few kernel
> drivers just set the fields directly already.
> 
> Nevertheless the diffstat looks quite promising:
> 
>  42 files changed, 721 insertions(+), 799 deletions(-)

Overall I'm fine with these changes, but three things need to happen
before I can think about applying this:

1) Address David's feedback about the ip_mtu*() calls that can occur
   on ipv6 sockets too.

2) Handle the feedback about dlm now bringing in sctp even if sctp
   sockets are not even used because of the symbol dependency.

3) Add the rxrpc documentation requested by David.

Thank you.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 20/33] ipv4: add ip_sock_set_recverr
  2020-05-13  6:26 ` [PATCH 20/33] ipv4: add ip_sock_set_recverr Christoph Hellwig
@ 2020-05-13 21:00   ` Joe Perches
  2020-05-14 10:30     ` Christoph Hellwig
  0 siblings, 1 reply; 76+ messages in thread
From: Joe Perches @ 2020-05-13 21:00 UTC (permalink / raw)
  To: Christoph Hellwig, David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

On Wed, 2020-05-13 at 08:26 +0200, Christoph Hellwig wrote:
> Add a helper to directly set the IP_RECVERR sockopt from kernel space
> without going through a fake uaccess.

This seems used only with true as the second arg.
Is there a reason to have that argument at all?

> diff --git a/include/net/ip.h b/include/net/ip.h
[]
> @@ -767,5 +767,6 @@ static inline bool inetdev_valid_mtu(unsigned int mtu)
>  
>  void ip_sock_set_tos(struct sock *sk, int val);
>  void ip_sock_set_freebind(struct sock *sk, bool val);
> +void ip_sock_set_recverr(struct sock *sk, bool val);
>  
>  #endif	/* _IP_H */
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 0c40887a817f8..9abecc3195520 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -589,6 +589,16 @@ void ip_sock_set_freebind(struct sock *sk, bool val)
>  }
>  EXPORT_SYMBOL(ip_sock_set_freebind);
>  
> +void ip_sock_set_recverr(struct sock *sk, bool val)
> +{
> +	lock_sock(sk);
> +	inet_sk(sk)->recverr = val;
> +	if (!val)
> +		skb_queue_purge(&sk->sk_error_queue);
> +	release_sock(sk);
> +}
> +EXPORT_SYMBOL(ip_sock_set_recverr);
> +
>  /*
>   *	Socket option code for IP. This is the end of the line after any
>   *	TCP,UDP etc options on an IP socket.
> diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
> index 562ea36c96b0f..1b87b8a9ff725 100644
> --- a/net/rxrpc/local_object.c
> +++ b/net/rxrpc/local_object.c
> @@ -171,13 +171,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
>  		/* Fall through */
>  	case AF_INET:
>  		/* we want to receive ICMP errors */
> -		opt = 1;
> -		ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR,
> -					(char *) &opt, sizeof(opt));
> -		if (ret < 0) {
> -			_debug("setsockopt failed");
> -			goto error;
> -		}
> +		ip_sock_set_recverr(local->socket->sk, true);
>  
>  		/* we want to set the don't fragment bit */
>  		opt = IP_PMTUDISC_DO;


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 21/33] ipv4: add ip_sock_set_mtu_discover
  2020-05-13 13:17 ` [PATCH 21/33] ipv4: add ip_sock_set_mtu_discover David Howells
@ 2020-05-14  6:26   ` Christoph Hellwig
  2020-05-15 15:15   ` David Howells
  1 sibling, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-14  6:26 UTC (permalink / raw)
  To: David Howells
  Cc: Christoph Hellwig, David S. Miller, Jakub Kicinski,
	Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme, linux-sctp,
	target-devel, linux-afs, drbd-dev, linux-cifs, rds-devel,
	linux-rdma, cluster-devel, Alexey Kuznetsov, linux-block,
	ceph-devel, linux-nfs, Neil Horman, Hideaki YOSHIFUJI, netdev,
	Vlad Yasevich, linux-kernel, Jon Maloy, Ying Xue, ocfs2-devel

On Wed, May 13, 2020 at 02:17:41PM +0100, David Howells wrote:
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > +		ip_sock_set_mtu_discover(conn->params.local->socket->sk,
> > +				IP_PMTUDISC_DONT);
> 
> Um... The socket in question could be an AF_INET6 socket, not an AF_INET4
> socket - I presume it will work in that case.  If so:

Yes, the implementation of that sockopt, including the inet_sock
structure where these options are set is shared between ipv4 and ipv6.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: remove kernel_setsockopt and kernel_getsockopt
  2020-05-13 17:38 ` remove kernel_setsockopt and kernel_getsockopt Joe Perches
@ 2020-05-14  6:27   ` Christoph Hellwig
  2020-05-14  8:29   ` David Laight
  1 sibling, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-14  6:27 UTC (permalink / raw)
  To: Joe Perches
  Cc: Christoph Hellwig, David S. Miller, Jakub Kicinski, Eric Dumazet,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman,
	Marcelo Ricardo Leitner, Jon Maloy, Ying Xue, drbd-dev,
	linux-block, linux-kernel, linux-rdma, linux-nvme, target-devel,
	linux-afs, linux-cifs, cluster-devel, ocfs2-devel, netdev,
	linux-sctp, ceph-devel, rds-devel, linux-nfs

On Wed, May 13, 2020 at 10:38:59AM -0700, Joe Perches wrote:
> It might be useful to show overall object size change.
> 
> More EXPORT_SYMBOL uses increase object size a little.
> 
> And not sure it matters much except it reduces overall object
> size, but these patches remove (unnecessary) logging on error
> and that could be mentioned in the cover letter too.

The intent here is not to reduce code size.  The intent is to kill off
set_fs users so that we can eventually remove set_fs entirely.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 27/33] sctp: export sctp_setsockopt_bindx
  2020-05-13 18:00   ` Marcelo Ricardo Leitner
@ 2020-05-14  6:28     ` Christoph Hellwig
  2020-05-14  8:23     ` David Laight
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-14  6:28 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: Christoph Hellwig, David S. Miller, Jakub Kicinski, Eric Dumazet,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman,
	Jon Maloy, Ying Xue, drbd-dev, linux-block, linux-kernel,
	linux-rdma, linux-nvme, target-devel, linux-afs, linux-cifs,
	cluster-devel, ocfs2-devel, netdev, linux-sctp, ceph-devel,
	rds-devel, linux-nfs

On Wed, May 13, 2020 at 03:00:58PM -0300, Marcelo Ricardo Leitner wrote:
> On Wed, May 13, 2020 at 08:26:42AM +0200, Christoph Hellwig wrote:
> > And call it directly from dlm instead of going through kernel_setsockopt.
> 
> The advantage on using kernel_setsockopt here is that sctp module will
> only be loaded if dlm actually creates a SCTP socket.  With this
> change, sctp will be loaded on setups that may not be actually using
> it. It's a quite big module and might expose the system.

True.  Note that the intent is to kill kernel space callers of setsockopt,
as I plan to remove the set_fs address space override used for it.  So
if always pulling in sctp is not an option for the DLM maintainers we'd
have to do tricks using symbol_get() or similar.

The same would also apply for ipv6, although I'm not sure how common
modular ipv6 is in practice.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 27/33] sctp: export sctp_setsockopt_bindx
  2020-05-13 18:00   ` Marcelo Ricardo Leitner
  2020-05-14  6:28     ` Christoph Hellwig
@ 2020-05-14  8:23     ` David Laight
  2020-05-14 10:40     ` is it ok to always pull in sctp for dlm, was: " Christoph Hellwig
  2020-05-15 15:20     ` David Howells
  3 siblings, 0 replies; 76+ messages in thread
From: David Laight @ 2020-05-14  8:23 UTC (permalink / raw)
  To: 'Marcelo Ricardo Leitner', Christoph Hellwig
  Cc: David S. Miller, Jakub Kicinski, Eric Dumazet, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman, Jon Maloy,
	Ying Xue, drbd-dev, linux-block, linux-kernel, linux-rdma,
	linux-nvme, target-devel, linux-afs, linux-cifs, cluster-devel,
	ocfs2-devel, netdev, linux-sctp, ceph-devel, rds-devel,
	linux-nfs

From: Marcelo Ricardo Leitner
> Sent: 13 May 2020 19:01
> On Wed, May 13, 2020 at 08:26:42AM +0200, Christoph Hellwig wrote:
> > And call it directly from dlm instead of going through kernel_setsockopt.
> 
> The advantage on using kernel_setsockopt here is that sctp module will
> only be loaded if dlm actually creates a SCTP socket.  With this
> change, sctp will be loaded on setups that may not be actually using
> it. It's a quite big module and might expose the system.
> 
> I'm okay with the SCTP changes, but I'll defer to DLM folks to whether
> that's too bad or what for DLM.

I didn't see these sneak through.

There is a big long list of SCTP socket options that are
needed to make anything work.

They all need exporting.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: remove kernel_setsockopt and kernel_getsockopt
  2020-05-13 17:38 ` remove kernel_setsockopt and kernel_getsockopt Joe Perches
  2020-05-14  6:27   ` Christoph Hellwig
@ 2020-05-14  8:29   ` David Laight
  2020-05-14 10:18     ` Christoph Hellwig
  2020-05-14 19:57     ` David Miller
  1 sibling, 2 replies; 76+ messages in thread
From: David Laight @ 2020-05-14  8:29 UTC (permalink / raw)
  To: 'Joe Perches',
	Christoph Hellwig, David S. Miller, Jakub Kicinski
  Cc: Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich,
	Neil Horman, Marcelo Ricardo Leitner, Jon Maloy, Ying Xue,
	drbd-dev, linux-block, linux-kernel, linux-rdma, linux-nvme,
	target-devel, linux-afs, linux-cifs, cluster-devel, ocfs2-devel,
	netdev, linux-sctp, ceph-devel, rds-devel, linux-nfs

From: Joe Perches
> Sent: 13 May 2020 18:39
> On Wed, 2020-05-13 at 08:26 +0200, Christoph Hellwig wrote:
> > this series removes the kernel_setsockopt and kernel_getsockopt
> > functions, and instead switches their users to small functions that
> > implement setting (or in one case getting) a sockopt directly using
> > a normal kernel function call with type safety and all the other
> > benefits of not having a function call.
> >
> > In some cases these functions seem pretty heavy handed as they do
> > a lock_sock even for just setting a single variable, but this mirrors
> > the real setsockopt implementation - counter to that a few kernel
> > drivers just set the fields directly already.
> >
> > Nevertheless the diffstat looks quite promising:
> >
> >  42 files changed, 721 insertions(+), 799 deletions(-)

I missed this patch going through.
Massive NACK.

You need to export functions that do most of the socket options
for all protocols.
As well as REUSEADDR and NODELAY, SCTP has loads because a lot
of stuff that should have been extra system calls got piled
into setsockopt.

An alternate solution would be to move the copy_to/from_user()
into a wrapper function so that the kernel_[sg]etsockopt()
functions would bypass them completely.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 32/33] sctp: add sctp_sock_get_primary_addr
  2020-05-13 18:03   ` Marcelo Ricardo Leitner
@ 2020-05-14  9:51     ` David Laight
  2020-05-14 12:30       ` David Laight
  0 siblings, 1 reply; 76+ messages in thread
From: David Laight @ 2020-05-14  9:51 UTC (permalink / raw)
  To: 'Marcelo Ricardo Leitner', Christoph Hellwig
  Cc: David S. Miller, Jakub Kicinski, Eric Dumazet, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman, Jon Maloy,
	Ying Xue, drbd-dev, linux-block, linux-kernel, linux-rdma,
	linux-nvme, target-devel, linux-afs, linux-cifs, cluster-devel,
	ocfs2-devel, netdev, linux-sctp, ceph-devel, rds-devel,
	linux-nfs

From: Marcelo Ricardo Leitner
> Sent: 13 May 2020 19:03
> 
> On Wed, May 13, 2020 at 08:26:47AM +0200, Christoph Hellwig wrote:
> > Add a helper to directly get the SCTP_PRIMARY_ADDR sockopt from kernel
> > space without going through a fake uaccess.
> 
> Same comment as on the other dlm/sctp patch.

Wouldn't it be best to write sctp_[gs]etsockopt() that
use a kernel buffer and then implement the user-space
calls using a wrapper that does the copies to an on-stack
(or malloced if big) buffer?

That will also simplify the code by removing all the copies
and -EFAULT returns.
Only the size checks will be needed and the code can assume
the buffer is at least the size of the on-stack buffer.

Our SCTP code uses SO_REUSEADDR, SCTP_EVENTS, SCTP_NODELAY,
SCTP_STATUS, SCTP_INITMSG, IPV6_ONLY, SCTP_SOCKOPT_BINDX_ADD
and SO_LINGER.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: remove kernel_setsockopt and kernel_getsockopt
  2020-05-14  8:29   ` David Laight
@ 2020-05-14 10:18     ` Christoph Hellwig
  2020-05-14 10:26       ` David Laight
  2020-05-14 19:57     ` David Miller
  1 sibling, 1 reply; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-14 10:18 UTC (permalink / raw)
  To: David Laight
  Cc: 'Joe Perches',
	Christoph Hellwig, David S. Miller, Jakub Kicinski, Eric Dumazet,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman,
	Marcelo Ricardo Leitner, Jon Maloy, Ying Xue, drbd-dev,
	linux-block, linux-kernel, linux-rdma, linux-nvme, target-devel,
	linux-afs, linux-cifs, cluster-devel, ocfs2-devel, netdev,
	linux-sctp, ceph-devel, rds-devel, linux-nfs

On Thu, May 14, 2020 at 08:29:30AM +0000, David Laight wrote:
> You need to export functions that do most of the socket options
> for all protocols.

Only for those where we have users, and all those are covered.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: remove kernel_setsockopt and kernel_getsockopt
  2020-05-14 10:18     ` Christoph Hellwig
@ 2020-05-14 10:26       ` David Laight
  2020-05-14 10:34         ` 'Christoph Hellwig'
  2020-05-14 20:03         ` David Miller
  0 siblings, 2 replies; 76+ messages in thread
From: David Laight @ 2020-05-14 10:26 UTC (permalink / raw)
  To: 'Christoph Hellwig'
  Cc: 'Joe Perches',
	David S. Miller, Jakub Kicinski, Eric Dumazet, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman,
	Marcelo Ricardo Leitner, Jon Maloy, Ying Xue, drbd-dev,
	linux-block, linux-kernel, linux-rdma, linux-nvme, target-devel,
	linux-afs, linux-cifs, cluster-devel, ocfs2-devel, netdev,
	linux-sctp, ceph-devel, rds-devel, linux-nfs

From: Christoph Hellwig
> Only for those were we have users, and all those are covered.

What do we tell all our users when our kernel SCTP code
no longer works?

It uses SO_REUSEADDR, SCTP_EVENTS, SCTP_NODELAY,
SCTP_STATUS, SCTP_INITMSG, IPV6_ONLY, SCTP_SOCKOPT_BINDX_ADD
and SO_LINGER.
We should probably use the CONNECTX function as well.

I doubt we are the only company with out-of-tree drivers
that use the kernel_socket interface.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 29/33] rxrpc_sock_set_min_security_level
  2020-05-13 13:13 ` [PATCH 29/33] rxrpc_sock_set_min_security_level David Howells
@ 2020-05-14 10:29   ` Christoph Hellwig
  2020-05-15 15:13   ` David Howells
  1 sibling, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-14 10:29 UTC (permalink / raw)
  To: David Howells
  Cc: Christoph Hellwig, David S. Miller, Jakub Kicinski,
	Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme, linux-sctp,
	target-devel, linux-afs, drbd-dev, linux-cifs, rds-devel,
	linux-rdma, cluster-devel, Alexey Kuznetsov, linux-block,
	ceph-devel, linux-nfs, Neil Horman, Hideaki YOSHIFUJI, netdev,
	Vlad Yasevich, linux-kernel, Jon Maloy, Ying Xue, ocfs2-devel

On Wed, May 13, 2020 at 02:13:07PM +0100, David Howells wrote:
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > +int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val);
> > +
> 
> Looks good - but you do need to add this to Documentation/networking/rxrpc.txt
> also, thanks.

That file doesn't exist, instead we now have a
Documentation/networking/rxrpc.rst in weird markup.  Where do you want this
to be added, and with what text?  Remember I don't really know what this
thing does, I just provide a shortcut.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 20/33] ipv4: add ip_sock_set_recverr
  2020-05-13 21:00   ` Joe Perches
@ 2020-05-14 10:30     ` Christoph Hellwig
  2020-05-14 11:51       ` Joe Perches
  0 siblings, 1 reply; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-14 10:30 UTC (permalink / raw)
  To: Joe Perches
  Cc: Christoph Hellwig, David S. Miller, Jakub Kicinski, Eric Dumazet,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman,
	Marcelo Ricardo Leitner, Jon Maloy, Ying Xue, drbd-dev,
	linux-block, linux-kernel, linux-rdma, linux-nvme, target-devel,
	linux-afs, linux-cifs, cluster-devel, ocfs2-devel, netdev,
	linux-sctp, ceph-devel, rds-devel, linux-nfs

On Wed, May 13, 2020 at 02:00:43PM -0700, Joe Perches wrote:
> On Wed, 2020-05-13 at 08:26 +0200, Christoph Hellwig wrote:
> > Add a helper to directly set the IP_RECVERR sockopt from kernel space
> > without going through a fake uaccess.
> 
> This seems used only with true as the second arg.
> Is there reason to have that argument at all?

Mostly to keep it symmetric with the sockopt.  I could probably remove
a few arguments in the series if we want to be strict.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: remove kernel_setsockopt and kernel_getsockopt
  2020-05-14 10:26       ` David Laight
@ 2020-05-14 10:34         ` 'Christoph Hellwig'
  2020-05-14 11:11           ` David Laight
  2020-05-14 20:03         ` David Miller
  1 sibling, 1 reply; 76+ messages in thread
From: 'Christoph Hellwig' @ 2020-05-14 10:34 UTC (permalink / raw)
  To: David Laight
  Cc: 'Christoph Hellwig', 'Joe Perches',
	David S. Miller, Jakub Kicinski, Eric Dumazet, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman,
	Marcelo Ricardo Leitner, Jon Maloy, Ying Xue, drbd-dev,
	linux-block, linux-kernel, linux-rdma, linux-nvme, target-devel,
	linux-afs, linux-cifs, cluster-devel, ocfs2-devel, netdev,
	linux-sctp, ceph-devel, rds-devel, linux-nfs

On Thu, May 14, 2020 at 10:26:41AM +0000, David Laight wrote:
> From: Christoph Hellwig
> > Only for those were we have users, and all those are covered.
> 
> What do we tell all our users when our kernel SCTP code
> no longer works?

We only care about in-tree modules, just like for every other interface
in the kernel.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* is it ok to always pull in sctp for dlm, was: Re: [PATCH 27/33] sctp: export sctp_setsockopt_bindx
  2020-05-13 18:00   ` Marcelo Ricardo Leitner
  2020-05-14  6:28     ` Christoph Hellwig
  2020-05-14  8:23     ` David Laight
@ 2020-05-14 10:40     ` Christoph Hellwig
  2020-05-14 14:24       ` David Teigland
  2020-05-15 15:20     ` David Howells
  3 siblings, 1 reply; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-14 10:40 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner, Christine Caulfield, David Teigland
  Cc: Christoph Hellwig, David S. Miller, Jakub Kicinski, Eric Dumazet,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman,
	Jon Maloy, Ying Xue, drbd-dev, linux-block, linux-kernel,
	linux-rdma, linux-nvme, target-devel, linux-afs, linux-cifs,
	cluster-devel, ocfs2-devel, netdev, linux-sctp, ceph-devel,
	rds-devel, linux-nfs

On Wed, May 13, 2020 at 03:00:58PM -0300, Marcelo Ricardo Leitner wrote:
> On Wed, May 13, 2020 at 08:26:42AM +0200, Christoph Hellwig wrote:
> > And call it directly from dlm instead of going through kernel_setsockopt.
> 
> The advantage on using kernel_setsockopt here is that sctp module will
> only be loaded if dlm actually creates a SCTP socket.  With this
> change, sctp will be loaded on setups that may not be actually using
> it. It's a quite big module and might expose the system.
> 
> I'm okay with the SCTP changes, but I'll defer to DLM folks to whether
> that's too bad or what for DLM.

So for ipv6 I could just move the helpers inline as they were trivial
and avoid that issue.  But some of the sctp stuff really is way too
big for that, so the only other option would be to use symbol_get.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: remove kernel_setsockopt and kernel_getsockopt
  2020-05-14 10:34         ` 'Christoph Hellwig'
@ 2020-05-14 11:11           ` David Laight
  2020-05-14 19:35             ` [Ocfs2-devel] " Matthew Wilcox
  0 siblings, 1 reply; 76+ messages in thread
From: David Laight @ 2020-05-14 11:11 UTC (permalink / raw)
  To: 'Christoph Hellwig'
  Cc: 'Joe Perches',
	David S. Miller, Jakub Kicinski, Eric Dumazet, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman,
	Marcelo Ricardo Leitner, Jon Maloy, Ying Xue, drbd-dev,
	linux-block, linux-kernel, linux-rdma, linux-nvme, target-devel,
	linux-afs, linux-cifs, cluster-devel, ocfs2-devel, netdev,
	linux-sctp, ceph-devel, rds-devel, linux-nfs

From: 'Christoph Hellwig'
> Sent: 14 May 2020 11:35
> On Thu, May 14, 2020 at 10:26:41AM +0000, David Laight wrote:
> > From: Christoph Hellwig
> > > Only for those were we have users, and all those are covered.
> >
> > What do we tell all our users when our kernel SCTP code
> > no longer works?
> 
> We only care about in-tree modules, just like for every other interface
> in the kernel.

Even if our management agreed to release the code and the code
layout matched the kernel guidelines you still wouldn't want
two large drivers that implement telephony functionality
for hardware that very few people actually have.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 20/33] ipv4: add ip_sock_set_recverr
  2020-05-14 10:30     ` Christoph Hellwig
@ 2020-05-14 11:51       ` Joe Perches
  2020-05-20 14:18         ` Christoph Hellwig
  0 siblings, 1 reply; 76+ messages in thread
From: Joe Perches @ 2020-05-14 11:51 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: David S. Miller, Jakub Kicinski, Eric Dumazet, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman,
	Marcelo Ricardo Leitner, Jon Maloy, Ying Xue, drbd-dev,
	linux-block, linux-kernel, linux-rdma, linux-nvme, target-devel,
	linux-afs, linux-cifs, cluster-devel, ocfs2-devel, netdev,
	linux-sctp, ceph-devel, rds-devel, linux-nfs

On Thu, 2020-05-14 at 12:30 +0200, Christoph Hellwig wrote:
> On Wed, May 13, 2020 at 02:00:43PM -0700, Joe Perches wrote:
> > On Wed, 2020-05-13 at 08:26 +0200, Christoph Hellwig wrote:
> > > Add a helper to directly set the IP_RECVERR sockopt from kernel space
> > > without going through a fake uaccess.
> > 
> > This seems used only with true as the second arg.
> > Is there reason to have that argument at all?
> 
> Mostly to keep it symmetric with the sockopt.  I could probably remove
> a few arguments in the series if we want to be strict.

My preference would be to be strict and add
arguments only when necessary.



^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 32/33] sctp: add sctp_sock_get_primary_addr
  2020-05-14  9:51     ` David Laight
@ 2020-05-14 12:30       ` David Laight
  2020-05-14 13:27         ` David Laight
  0 siblings, 1 reply; 76+ messages in thread
From: David Laight @ 2020-05-14 12:30 UTC (permalink / raw)
  To: 'Marcelo Ricardo Leitner', 'Christoph Hellwig'
  Cc: 'David S. Miller', 'Jakub Kicinski',
	'Eric Dumazet', 'Alexey Kuznetsov',
	'Hideaki YOSHIFUJI', 'Vlad Yasevich',
	'Neil Horman', 'Jon Maloy', 'Ying Xue',
	'drbd-dev@lists.linbit.com',
	'linux-block@vger.kernel.org',
	'linux-kernel@vger.kernel.org',
	'linux-rdma@vger.kernel.org',
	'linux-nvme@lists.infradead.org',
	'target-devel@vger.kernel.org',
	'linux-afs@lists.infradead.org',
	'linux-cifs@vger.kernel.org',
	'cluster-devel@redhat.com',
	'ocfs2-devel@oss.oracle.com',
	'netdev@vger.kernel.org',
	'linux-sctp@vger.kernel.org',
	'ceph-devel@vger.kernel.org',
	'rds-devel@oss.oracle.com',
	'linux-nfs@vger.kernel.org'

From: David Laight
> Sent: 14 May 2020 10:51
> From: Marcelo Ricardo Leitner
> > Sent: 13 May 2020 19:03
> >
> > On Wed, May 13, 2020 at 08:26:47AM +0200, Christoph Hellwig wrote:
> > > Add a helper to directly get the SCTP_PRIMARY_ADDR sockopt from kernel
> > > space without going through a fake uaccess.
> >
> > Same comment as on the other dlm/sctp patch.
> 
> Wouldn't it be best to write sctp_[gs]etsockotp() that
> use a kernel buffer and then implement the user-space
> calls using a wrapper that does the copies to an on-stack
> (or malloced if big) buffer.

Actually looking at __sys_setsockopt() it calls
BPF_CGROUP_RUN_PROG_SETSOCKOPT() which (by the look of it)
can copy the user buffer into malloc()ed memory and
cause set_fs(KERNEL_DS) to be called.

The only way to get rid of that set_fs() is to always
have the buffer in kernel memory when the underlying
setsockopt() code is called.

The comment above __sys_[sg]etsockopt() about not knowing
the length is just wrong.
It probably applied to getsockopt() in the dim and distant
past before it was made read-update.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 32/33] sctp: add sctp_sock_get_primary_addr
  2020-05-14 12:30       ` David Laight
@ 2020-05-14 13:27         ` David Laight
  0 siblings, 0 replies; 76+ messages in thread
From: David Laight @ 2020-05-14 13:27 UTC (permalink / raw)
  To: 'Marcelo Ricardo Leitner', 'Christoph Hellwig'
  Cc: 'David S. Miller', 'Jakub Kicinski',
	'Eric Dumazet', 'Alexey Kuznetsov',
	'Hideaki YOSHIFUJI', 'Vlad Yasevich',
	'Neil Horman', 'Jon Maloy', 'Ying Xue',
	'drbd-dev@lists.linbit.com',
	'linux-block@vger.kernel.org',
	'linux-kernel@vger.kernel.org',
	'linux-rdma@vger.kernel.org',
	'linux-nvme@lists.infradead.org',
	'target-devel@vger.kernel.org',
	'linux-afs@lists.infradead.org',
	'linux-cifs@vger.kernel.org',
	'cluster-devel@redhat.com',
	'ocfs2-devel@oss.oracle.com',
	'netdev@vger.kernel.org',
	'linux-sctp@vger.kernel.org',
	'ceph-devel@vger.kernel.org',
	'rds-devel@oss.oracle.com',
	'linux-nfs@vger.kernel.org'

From: David Laight
> Sent: 14 May 2020 13:30
> Subject: RE: [PATCH 32/33] sctp: add sctp_sock_get_primary_addr
> 
> From: David Laight
> > Sent: 14 May 2020 10:51
> > From: Marcelo Ricardo Leitner
> > > Sent: 13 May 2020 19:03
> > >
> > > On Wed, May 13, 2020 at 08:26:47AM +0200, Christoph Hellwig wrote:
> > > > Add a helper to directly get the SCTP_PRIMARY_ADDR sockopt from kernel
> > > > space without going through a fake uaccess.
> > >
> > > Same comment as on the other dlm/sctp patch.
> >
> > Wouldn't it be best to write sctp_[gs]etsockotp() that
> > use a kernel buffer and then implement the user-space
> > calls using a wrapper that does the copies to an on-stack
> > (or malloced if big) buffer.
> 
> Actually looking at __sys_setsockopt() it calls
> BPF_CGROUP_RUN_PROG_SETSOCKOPT() which (by the look of it)
> can copy the user buffer into malloc()ed memory and
> cause set_fs(KERNEL_DS) be called.
> 
> The only way to get rid of that set_fs() is to always
> have the buffer in kernel memory when the underlying
> setsockopt() code is called.

And having started to try coding __sys_setsockopt()
and then found the compat code I suspect that would
be a whole lot more sane if the buffer was in kernel
and it knew that at least (say) 64 bytes were allocated.

The whole compat_alloc_user_space() 'crap' could probably go.

Actually it looks like an application can avoid whatever
checks BPF_CGROUP_RUN_PROG_SETSOCKOPT() is trying to do
by using the 32bit compat ioctls.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: is it ok to always pull in sctp for dlm, was: Re: [PATCH 27/33] sctp: export sctp_setsockopt_bindx
  2020-05-14 10:40     ` is it ok to always pull in sctp for dlm, was: " Christoph Hellwig
@ 2020-05-14 14:24       ` David Teigland
  0 siblings, 0 replies; 76+ messages in thread
From: David Teigland @ 2020-05-14 14:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Marcelo Ricardo Leitner, Christine Caulfield, David S. Miller,
	Jakub Kicinski, Eric Dumazet, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman, Jon Maloy,
	Ying Xue, drbd-dev, linux-block, linux-kernel, linux-rdma,
	linux-nvme, target-devel, linux-afs, linux-cifs, cluster-devel,
	ocfs2-devel, netdev, linux-sctp, ceph-devel, rds-devel,
	linux-nfs

On Thu, May 14, 2020 at 12:40:40PM +0200, Christoph Hellwig wrote:
> On Wed, May 13, 2020 at 03:00:58PM -0300, Marcelo Ricardo Leitner wrote:
> > On Wed, May 13, 2020 at 08:26:42AM +0200, Christoph Hellwig wrote:
> > > And call it directly from dlm instead of going through kernel_setsockopt.
> > 
> > The advantage on using kernel_setsockopt here is that sctp module will
> > only be loaded if dlm actually creates a SCTP socket.  With this
> > change, sctp will be loaded on setups that may not be actually using
> > it. It's a quite big module and might expose the system.
> > 
> > I'm okay with the SCTP changes, but I'll defer to DLM folks to whether
> > that's too bad or what for DLM.
> 
> So for ipv6 I could just move the helpers inline as they were trivial
> and avoid that issue.  But some of the sctp stuff really is way too
> big for that, so the only other option would be to use symbol_get.

Let's try symbol_get, having the sctp module always loaded caused problems
last time it happened (almost nobody uses dlm with it.)
Dave 


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [Ocfs2-devel] remove kernel_setsockopt and kernel_getsockopt
  2020-05-14 11:11           ` David Laight
@ 2020-05-14 19:35             ` Matthew Wilcox
  0 siblings, 0 replies; 76+ messages in thread
From: Matthew Wilcox @ 2020-05-14 19:35 UTC (permalink / raw)
  To: David Laight
  Cc: 'Christoph Hellwig',
	Marcelo Ricardo Leitner, linux-nvme, Eric Dumazet, target-devel,
	linux-afs, drbd-dev, linux-cifs, rds-devel, linux-rdma,
	cluster-devel, Alexey Kuznetsov, linux-block,
	'Joe Perches',
	Jakub Kicinski, ceph-devel, linux-nfs, Neil Horman,
	Hideaki YOSHIFUJI, netdev, Vlad Yasevich, linux-kernel,
	Jon Maloy, linux-sctp, Ying Xue, David S. Miller, ocfs2-devel

On Thu, May 14, 2020 at 11:11:34AM +0000, David Laight wrote:
> From: 'Christoph Hellwig'
> > Sent: 14 May 2020 11:35
> > On Thu, May 14, 2020 at 10:26:41AM +0000, David Laight wrote:
> > > From: Christoph Hellwig
> > > > Only for those were we have users, and all those are covered.
> > >
> > > What do we tell all our users when our kernel SCTP code
> > > no longer works?
> > 
> > We only care about in-tree modules, just like for every other interface
> > in the kernel.
> 
> Even if our management agreed to release the code and the code
> layout matched the kernel guidelines you still wouldn't want
> two large drivers that implement telephony functionality
> for hardware that very few people actually have.

Oh, good point, we'll change the policy for all modules and make every
interface in the kernel stable from now on to cater to your special case.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: remove kernel_setsockopt and kernel_getsockopt
  2020-05-14  8:29   ` David Laight
  2020-05-14 10:18     ` Christoph Hellwig
@ 2020-05-14 19:57     ` David Miller
  1 sibling, 0 replies; 76+ messages in thread
From: David Miller @ 2020-05-14 19:57 UTC (permalink / raw)
  To: David.Laight
  Cc: joe, hch, kuba, edumazet, kuznet, yoshfuji, vyasevich, nhorman,
	marcelo.leitner, jmaloy, ying.xue, drbd-dev, linux-block,
	linux-kernel, linux-rdma, linux-nvme, target-devel, linux-afs,
	linux-cifs, cluster-devel, ocfs2-devel, netdev, linux-sctp,
	ceph-devel, rds-devel, linux-nfs

From: David Laight <David.Laight@ACULAB.COM>
Date: Thu, 14 May 2020 08:29:30 +0000

> You need to export functions that do most of the socket options
> for all protocols.

If all in-tree users of this stuff are converted, there is no argument
for keeping these routines.

You seemed to be concerned about out of tree stuff.  If so, that's not
of our concern.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: remove kernel_setsockopt and kernel_getsockopt
  2020-05-14 10:26       ` David Laight
  2020-05-14 10:34         ` 'Christoph Hellwig'
@ 2020-05-14 20:03         ` David Miller
  2020-05-15  8:14           ` David Laight
  1 sibling, 1 reply; 76+ messages in thread
From: David Miller @ 2020-05-14 20:03 UTC (permalink / raw)
  To: David.Laight
  Cc: hch, joe, kuba, edumazet, kuznet, yoshfuji, vyasevich, nhorman,
	marcelo.leitner, jmaloy, ying.xue, drbd-dev, linux-block,
	linux-kernel, linux-rdma, linux-nvme, target-devel, linux-afs,
	linux-cifs, cluster-devel, ocfs2-devel, netdev, linux-sctp,
	ceph-devel, rds-devel, linux-nfs

From: David Laight <David.Laight@ACULAB.COM>
Date: Thu, 14 May 2020 10:26:41 +0000

> I doubt we are the one company with out-of-tree drivers
> that use the kernel_socket interface.

Not our problem.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: remove kernel_setsockopt and kernel_getsockopt
  2020-05-14 20:03         ` David Miller
@ 2020-05-15  8:14           ` David Laight
  0 siblings, 0 replies; 76+ messages in thread
From: David Laight @ 2020-05-15  8:14 UTC (permalink / raw)
  To: 'David Miller'
  Cc: hch, joe, kuba, edumazet, kuznet, yoshfuji, vyasevich, nhorman,
	marcelo.leitner, jmaloy, ying.xue, drbd-dev, linux-block,
	linux-kernel, linux-rdma, linux-nvme, target-devel, linux-afs,
	linux-cifs, cluster-devel, ocfs2-devel, netdev, linux-sctp,
	ceph-devel, rds-devel, linux-nfs

Looking at __sys_setsockopt() I noticed that the BPF intercept
can also cause set_fs(KERNEL_DS) to be set in order to pass a
modified buffer into the actual setsockopt() code.

If that functionality is to be kept then the underlying
protocol specific code needs changing to accept a kernel buffer.

The 32bit compat code would also be a lot simpler if it could
pass a kernel buffer through.
At the moment it copies the modified buffer back out onto the
user stack.

I'm sure there have been suggestions to remove that complete hack.
Fixing the compat code would leave a kernel_[sg]et_sockopt() that
still worked.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 29/33] rxrpc_sock_set_min_security_level
  2020-05-13 13:13 ` [PATCH 29/33] rxrpc_sock_set_min_security_level David Howells
  2020-05-14 10:29   ` Christoph Hellwig
@ 2020-05-15 15:13   ` David Howells
  1 sibling, 0 replies; 76+ messages in thread
From: David Howells @ 2020-05-15 15:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: dhowells, David S. Miller, Jakub Kicinski,
	Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme, linux-sctp,
	target-devel, linux-afs, drbd-dev, linux-cifs, rds-devel,
	linux-rdma, cluster-devel, Alexey Kuznetsov, linux-block,
	ceph-devel, linux-nfs, Neil Horman, Hideaki YOSHIFUJI, netdev,
	Vlad Yasevich, linux-kernel, Jon Maloy, Ying Xue, ocfs2-devel

Christoph Hellwig <hch@lst.de> wrote:

> > Looks good - but you do need to add this to Documentation/networking/rxrpc.txt
> > also, thanks.
> 
> That file doesn't exist, instead we now have a
> cumentation/networking/rxrpc.rst in weird markup.

Yeah - that's only in net/next thus far.

> Where do you want this to be added, and with what text?  Remember I don't
> really know what this thing does, I just provide a shortcut.

The document itself describes what each rxrpc sockopt does.  Just look for
RXRPC_MIN_SECURITY_LEVEL in there;-)

Anyway, see the attached.  This also fixes a couple of errors in the doc that
I noticed.

David
---
diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst
index 5ad35113d0f4..68552b92dc44 100644
--- a/Documentation/networking/rxrpc.rst
+++ b/Documentation/networking/rxrpc.rst
@@ -477,7 +477,7 @@ AF_RXRPC sockets support a few socket options at the SOL_RXRPC level:
 	 Encrypted checksum plus packet padded and first eight bytes of packet
 	 encrypted - which includes the actual packet length.
 
-     (c) RXRPC_SECURITY_ENCRYPTED
+     (c) RXRPC_SECURITY_ENCRYPT
 
 	 Encrypted checksum plus entire packet padded and encrypted, including
 	 actual packet length.
@@ -578,7 +578,7 @@ A client would issue an operation by:
      This issues a request_key() to get the key representing the security
      context.  The minimum security level can be set::
 
-	unsigned int sec = RXRPC_SECURITY_ENCRYPTED;
+	unsigned int sec = RXRPC_SECURITY_ENCRYPT;
 	setsockopt(client, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL,
 		   &sec, sizeof(sec));
 
@@ -1090,6 +1090,15 @@ The kernel interface functions are as follows:
      jiffies).  In the event of the timeout occurring, the call will be
      aborted and -ETIME or -ETIMEDOUT will be returned.
 
+ (#) Apply the RXRPC_MIN_SECURITY_LEVEL sockopt to a socket from within the
+     kernel::
+
+       int rxrpc_sock_set_min_security_level(struct sock *sk,
+					     unsigned int val);
+
+     This specifies the minimum security level required for calls on this
+     socket.
+
 
 Configurable Parameters
 =======================
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 7dfcbd58da85..e313dae01674 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -57,7 +57,7 @@ int afs_open_socket(struct afs_net *net)
 	srx.transport.sin6.sin6_port	= htons(AFS_CM_PORT);
 
 	ret = rxrpc_sock_set_min_security_level(socket->sk,
-			RXRPC_SECURITY_ENCRYPT);
+						RXRPC_SECURITY_ENCRYPT);
 	if (ret < 0)
 		goto error_2;
 

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 21/33] ipv4: add ip_sock_set_mtu_discover
  2020-05-13 13:17 ` [PATCH 21/33] ipv4: add ip_sock_set_mtu_discover David Howells
  2020-05-14  6:26   ` Christoph Hellwig
@ 2020-05-15 15:15   ` David Howells
  1 sibling, 0 replies; 76+ messages in thread
From: David Howells @ 2020-05-15 15:15 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: dhowells, David S. Miller, Jakub Kicinski,
	Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme, linux-sctp,
	target-devel, linux-afs, drbd-dev, linux-cifs, rds-devel,
	linux-rdma, cluster-devel, Alexey Kuznetsov, linux-block,
	ceph-devel, linux-nfs, Neil Horman, Hideaki YOSHIFUJI, netdev,
	Vlad Yasevich, linux-kernel, Jon Maloy, Ying Xue, ocfs2-devel

Christoph Hellwig <hch@lst.de> wrote:

> > > +		ip_sock_set_mtu_discover(conn->params.local->socket->sk,
> > > +				IP_PMTUDISC_DONT);
> > 
> > Um... The socket in question could be an AF_INET6 socket, not an AF_INET4
> > socket - I presume it will work in that case.  If so:
> 
> Yes, the implementation of that sockopt, including the inet_sock
> structure where these options are set is shared between ipv4 and ipv6.

Great!  Could you note that either in the patch description or in the
kerneldoc attached to the function?

Thanks,
David


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 27/33] sctp: export sctp_setsockopt_bindx
  2020-05-13 18:00   ` Marcelo Ricardo Leitner
                       ` (2 preceding siblings ...)
  2020-05-14 10:40     ` is it ok to always pull in sctp for dlm, was: " Christoph Hellwig
@ 2020-05-15 15:20     ` David Howells
  2020-05-15 15:24       ` Christoph Hellwig
  2020-05-16 15:11       ` David Laight
  3 siblings, 2 replies; 76+ messages in thread
From: David Howells @ 2020-05-15 15:20 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: dhowells, Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme,
	linux-sctp, target-devel, linux-afs, drbd-dev, linux-cifs,
	rds-devel, linux-rdma, cluster-devel, Alexey Kuznetsov,
	linux-block, Jakub Kicinski, ceph-devel, linux-nfs, Neil Horman,
	Hideaki YOSHIFUJI, netdev, Vlad Yasevich, linux-kernel,
	Jon Maloy, Ying Xue, David S. Miller, ocfs2-devel

Christoph Hellwig <hch@lst.de> wrote:

> > The advantage on using kernel_setsockopt here is that sctp module will
> > only be loaded if dlm actually creates a SCTP socket.  With this
> > change, sctp will be loaded on setups that may not be actually using
> > it. It's a quite big module and might expose the system.
> 
> True.  Not that the intent is to kill kernel space callers of setsockopt,
> as I plan to remove the set_fs address space override used for it.

For getsockopt, does it make sense to have the core kernel load optval/optlen
into a buffer before calling the protocol driver?  Then the driver need not
see the userspace pointer at all.

Similar could be done for setsockopt - allocate a buffer of the size requested
by the user inside the kernel and pass it into the driver, then copy the data
back afterwards.

David


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 27/33] sctp: export sctp_setsockopt_bindx
  2020-05-15 15:20     ` David Howells
@ 2020-05-15 15:24       ` Christoph Hellwig
  2020-05-16 15:21         ` David Laight
  2020-05-16 15:11       ` David Laight
  1 sibling, 1 reply; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-15 15:24 UTC (permalink / raw)
  To: David Howells
  Cc: Christoph Hellwig, Marcelo Ricardo Leitner, Eric Dumazet,
	linux-nvme, linux-sctp, target-devel, linux-afs, drbd-dev,
	linux-cifs, rds-devel, linux-rdma, cluster-devel,
	Alexey Kuznetsov, linux-block, Jakub Kicinski, ceph-devel,
	linux-nfs, Neil Horman, Hideaki YOSHIFUJI, netdev, Vlad Yasevich,
	linux-kernel, Jon Maloy, Ying Xue, David S. Miller, ocfs2-devel

On Fri, May 15, 2020 at 04:20:02PM +0100, David Howells wrote:
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > > The advantage on using kernel_setsockopt here is that sctp module will
> > > only be loaded if dlm actually creates a SCTP socket.  With this
> > > change, sctp will be loaded on setups that may not be actually using
> > > it. It's a quite big module and might expose the system.
> > 
> > True.  Not that the intent is to kill kernel space callers of setsockopt,
> > as I plan to remove the set_fs address space override used for it.
> 
> For getsockopt, does it make sense to have the core kernel load optval/optlen
> into a buffer before calling the protocol driver?  Then the driver need not
> see the userspace pointer at all.
> 
> Similar could be done for setsockopt - allocate a buffer of the size requested
> by the user inside the kernel and pass it into the driver, then copy the data
> back afterwards.

I did look into that initially.  The problem is that tons of sockopts
entirely ignore optlen and just use a fixed size.  So I fear that there
could be tons of breakage if we suddenly respect it.  Otherwise that
would be a pretty nice way to handle the situation.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 27/33] sctp: export sctp_setsockopt_bindx
  2020-05-15 15:20     ` David Howells
  2020-05-15 15:24       ` Christoph Hellwig
@ 2020-05-16 15:11       ` David Laight
  2020-05-16 15:36         ` [Ocfs2-devel] " Matthew Wilcox
  1 sibling, 1 reply; 76+ messages in thread
From: David Laight @ 2020-05-16 15:11 UTC (permalink / raw)
  To: 'David Howells', Christoph Hellwig
  Cc: Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme, linux-sctp,
	target-devel, linux-afs, drbd-dev, linux-cifs, rds-devel,
	linux-rdma, cluster-devel, Alexey Kuznetsov, linux-block,
	Jakub Kicinski, ceph-devel, linux-nfs, Neil Horman,
	Hideaki YOSHIFUJI, netdev, Vlad Yasevich, linux-kernel,
	Jon Maloy, Ying Xue, David S. Miller, ocfs2-devel

From: David Howells
> Sent: 15 May 2020 16:20
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > > The advantage on using kernel_setsockopt here is that sctp module will
> > > only be loaded if dlm actually creates a SCTP socket.  With this
> > > change, sctp will be loaded on setups that may not be actually using
> > > it. It's a quite big module and might expose the system.
> >
> > True.  Not that the intent is to kill kernel space callers of setsockopt,
> > as I plan to remove the set_fs address space override used for it.
> 
> For getsockopt, does it make sense to have the core kernel load optval/optlen
> into a buffer before calling the protocol driver?  Then the driver need not
> see the userspace pointer at all.
> 
> Similar could be done for setsockopt - allocate a buffer of the size requested
> by the user inside the kernel and pass it into the driver, then copy the data
> back afterwards.

Yes, it also simplifies all the compat code.
And there is a BPF test in setsockopt that also wants to
pass on a kernel buffer.

I'm willing to sit and write the patch.
Quoting from a post I made later on Friday.

Basically:

This patch sequence (to be written) does the following:

Patch 1: Change __sys_setsockopt() to allocate a kernel buffer,
         copy the data into it then call set_fs(KERNEL_DS).
         An on-stack buffer (say 64 bytes) will be used for
         small transfers.

Patch 2: The same for __sys_getsockopt().

Patch 3: Compat setsockopt.

Patch 4: Compat getsockopt.

Patch 5: Remove the user copies from the global socket options code.

Patches 6 to n-1; Remove the user copies from the per-protocol code.

Patch n: Remove the set_fs(KERNEL_DS) from the entry points.

This should be bisectable.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 27/33] sctp: export sctp_setsockopt_bindx
  2020-05-15 15:24       ` Christoph Hellwig
@ 2020-05-16 15:21         ` David Laight
  0 siblings, 0 replies; 76+ messages in thread
From: David Laight @ 2020-05-16 15:21 UTC (permalink / raw)
  To: 'Christoph Hellwig', David Howells
  Cc: Marcelo Ricardo Leitner, Eric Dumazet, linux-nvme, linux-sctp,
	target-devel, linux-afs, drbd-dev, linux-cifs, rds-devel,
	linux-rdma, cluster-devel, Alexey Kuznetsov, linux-block,
	Jakub Kicinski, ceph-devel, linux-nfs, Neil Horman,
	Hideaki YOSHIFUJI, netdev, Vlad Yasevich, linux-kernel,
	Jon Maloy, Ying Xue, David S. Miller, ocfs2-devel

From: Christoph Hellwig
> Sent: 15 May 2020 16:25
> On Fri, May 15, 2020 at 04:20:02PM +0100, David Howells wrote:
> > Christoph Hellwig <hch@lst.de> wrote:
> >
> > > > The advantage on using kernel_setsockopt here is that sctp module will
> > > > only be loaded if dlm actually creates a SCTP socket.  With this
> > > > change, sctp will be loaded on setups that may not be actually using
> > > > it. It's a quite big module and might expose the system.
> > >
> > > True.  Not that the intent is to kill kernel space callers of setsockopt,
> > > as I plan to remove the set_fs address space override used for it.
> >
> > For getsockopt, does it make sense to have the core kernel load optval/optlen
> > into a buffer before calling the protocol driver?  Then the driver need not
> > see the userspace pointer at all.
> >
> > Similar could be done for setsockopt - allocate a buffer of the size requested
> > by the user inside the kernel and pass it into the driver, then copy the data
> > back afterwards.
> 
> I did look into that initially.  The problem is that tons of sockopts
> entirely ignore optlen and just use a fixed size.  So I fear that there
> could be tons of breakage if we suddently respect it.  Otherwise that
> would be a pretty nice way to handle the situation.

I'd guess that most application use the correct size for setsockopt().
(Well, apart from using 4 instead of 1.)

It is certainly possible to always try to read in 64 bytes
regardless of the supplied length, but handle the EFAULT case
by shortening the buffer.

Historically getsockopt() only wrote the length back.
Treating 0 and garbage as (say) 4k and letting the protocol
code shorten the copy to user might work.
All short transfers would want to use an on-stack buffer,
so slight oversizes could also be allowed for.

OTOH if I did a getsockopt() with too short a length I wouldn't
want the kernel to trash my program memory.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [Ocfs2-devel] [PATCH 27/33] sctp: export sctp_setsockopt_bindx
  2020-05-16 15:11       ` David Laight
@ 2020-05-16 15:36         ` Matthew Wilcox
  2020-05-17  8:48           ` David Laight
  0 siblings, 1 reply; 76+ messages in thread
From: Matthew Wilcox @ 2020-05-16 15:36 UTC (permalink / raw)
  To: David Laight
  Cc: 'David Howells',
	Christoph Hellwig, Marcelo Ricardo Leitner, linux-nvme,
	linux-kernel, linux-sctp, target-devel, linux-afs, drbd-dev,
	linux-cifs, rds-devel, linux-rdma, cluster-devel, Jakub Kicinski,
	linux-block, Alexey Kuznetsov, ceph-devel, linux-nfs,
	Neil Horman, Hideaki YOSHIFUJI, netdev, Vlad Yasevich,
	Eric Dumazet, Jon Maloy, Ying Xue, David S. Miller, ocfs2-devel

On Sat, May 16, 2020 at 03:11:40PM +0000, David Laight wrote:
> From: David Howells
> > Sent: 15 May 2020 16:20
> > Christoph Hellwig <hch@lst.de> wrote:
> > 
> > > > The advantage on using kernel_setsockopt here is that sctp module will
> > > > only be loaded if dlm actually creates a SCTP socket.  With this
> > > > change, sctp will be loaded on setups that may not be actually using
> > > > it. It's a quite big module and might expose the system.
> > >
> > > True.  Not that the intent is to kill kernel space callers of setsockopt,
> > > as I plan to remove the set_fs address space override used for it.
> > 
> > For getsockopt, does it make sense to have the core kernel load optval/optlen
> > into a buffer before calling the protocol driver?  Then the driver need not
> > see the userspace pointer at all.
> > 
> > Similar could be done for setsockopt - allocate a buffer of the size requested
> > by the user inside the kernel and pass it into the driver, then copy the data
> > back afterwards.
> 
> Yes, it also simplifies all the compat code.
> And there is a BPF test in setsockopt that also wants to
> pass on a kernel buffer.
> 
> I'm willing to sit and write the patch.
> Quoting from a post I made later on Friday.
> 
> Basically:
> 
> This patch sequence (to be written) does the following:
> 
> Patch 1: Change __sys_setsockopt() to allocate a kernel buffer,
>          copy the data into it then call set_fs(KERNEL_DS).
>          An on-stack buffer (say 64 bytes) will be used for
>          small transfers.
> 
> Patch 2: The same for __sys_getsockopt().
> 
> Patch 3: Compat setsockopt.
> 
> Patch 4: Compat getsockopt.
> 
> Patch 5: Remove the user copies from the global socket options code.
> 
> Patches 6 to n-1; Remove the user copies from the per-protocol code.
> 
> Patch n: Remove the set_fs(KERNEL_DS) from the entry points.
> 
> This should be bisectable.

I appreciate your dedication to not publishing the source code to
your kernel module, but Christoph's patch series is actually better.
It's typesafe rather than passing void pointers around.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [Ocfs2-devel] [PATCH 27/33] sctp: export sctp_setsockopt_bindx
  2020-05-16 15:36         ` [Ocfs2-devel] " Matthew Wilcox
@ 2020-05-17  8:48           ` David Laight
  0 siblings, 0 replies; 76+ messages in thread
From: David Laight @ 2020-05-17  8:48 UTC (permalink / raw)
  To: 'Matthew Wilcox'
  Cc: 'David Howells',
	Christoph Hellwig, Marcelo Ricardo Leitner, linux-nvme,
	linux-kernel, linux-sctp, target-devel, linux-afs, drbd-dev,
	linux-cifs, rds-devel, linux-rdma, cluster-devel, Jakub Kicinski,
	linux-block, Alexey Kuznetsov, ceph-devel, linux-nfs,
	Neil Horman, Hideaki YOSHIFUJI, netdev, Vlad Yasevich,
	Eric Dumazet, Jon Maloy, Ying Xue, David S. Miller, ocfs2-devel


[-- Attachment #1: Type: text/plain, Size: 1469 bytes --]

From: Matthew Wilcox
> Sent: 16 May 2020 16:37
...
> > Basically:
> >
> > This patch sequence (to be written) does the following:
> >
> > Patch 1: Change __sys_setsockopt() to allocate a kernel buffer,
> >          copy the data into it then call set_fs(KERNEL_DS).
> >          An on-stack buffer (say 64 bytes) will be used for
> >          small transfers.
> >
> > Patch 2: The same for __sys_getsockopt().
> >
> > Patch 3: Compat setsockopt.
> >
> > Patch 4: Compat getsockopt.
> >
> > Patch 5: Remove the user copies from the global socket options code.
> >
> > Patches 6 to n-1; Remove the user copies from the per-protocol code.
> >
> > Patch n: Remove the set_fs(KERNEL_DS) from the entry points.
> >
> > This should be bisectable.
> 
> I appreciate your dedication to not publishing the source code to
> your kernel module, but Christoph's patch series is actually better.
> It's typesafe rather than passing void pointers around.

There are plenty of interfaces that pass a 'pointer and length'.
Having the compiler do a type check doesn't give any security
benefit - just stops silly errors.

Oh yes, I've attached the only driver source file that calls
into the Linux kernel.
You are perfectly free to look at all the things we have to do
to support different and broken kernel releases.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

[-- Attachment #2: ss7osglue.c --]
[-- Type: text/plain, Size: 36047 bytes --]

#ident "@(#) (c) Aculab plc $Header: /home/cvs/repository/ss7/stack/src/driver/linux/ss7osglue.c,v 1.157 2019-08-29 16:09:14 davidla Exp $ $Name:  $"
#ifndef MODULE
#define MODULE
#endif

#include <linux/version.h>

#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
#error minimum kernel version is 2.6.28
#endif

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 34)
#include <generated/autoconf.h>
#else
#include <linux/autoconf.h>
#endif

#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kmod.h>
#include <linux/string.h>
#include <linux/sched.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
#include <linux/sched/signal.h>
#endif
#include <linux/wait.h>
#include <linux/socket.h>
#include <linux/signal.h>
#include <linux/poll.h>
#include <linux/net.h>
#include <linux/nsproxy.h>
#include <linux/in.h>
#include <linux/reboot.h>
#include <asm/atomic.h>
#include <asm/uaccess.h>

#include <linux/kthread.h>

/* This is only in the kernel build tree */
#include <net/sock.h>

#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 10, 0)
#include <uapi/linux/sctp.h>
#else
#include <net/sctp/user.h>    /* netinet/sctp.h ought to be this file */
#endif

#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)
#define wait_queue_head __wait_queue_head
#define wait_queue_entry __wait_queue
#endif

#define SK_PROTOCOL(sock) (sock)->sk->sk_protocol

extern void ss7_trace_mem(int, void *, int, const char *, ...);
extern void ss7_trace_printf(int, const char *, ...);

/* Aculab DACP interfaces - these are in aculab's kern_if.h */
void *dacp_symbol_get(const char *);
int dacp_symbol_release(const char *);

MODULE_AUTHOR("Aculab");
MODULE_LICENSE("Proprietary");

#include "ss7osglue.h"

/* Mutex for driver interface code.
 * Taken around updates of ss7_use_count and of the ss7_ns_table
 * reference counts. */
static struct mutex ss7_glue_mutex;

/* Major number returned by register_chrdev() for "ss7server" */
static int ss7dev_major;
/* Handle returned by the "acuc_dtls_register" hook, NULL if not registered */
static const void *ss7_dtls_handle;
/* Number of successful opens; set to -1 once shutdown has completed */
static int ss7_use_count;
/* Pid of the process that issued SS7_STOP, 0 if none has */
static int ss7_stop_pid;

/* Tasks recorded by ss7_os_log_error() after an "Assertion fail" message.
 * The count is never reduced, the array index wraps. */
static struct task_struct *asserted_tasks[16];
static unsigned int asserted_task_count;

/* Compile-time check: the array size is -1 (a compile error) unless the
 * SS7_SOCK_xxx constants have the same values as the kernel SOCK_xxx ones. */
typedef char ss7_verify_const[ SS7_SOCK_STREAM == SOCK_STREAM && SS7_SOCK_SEQPACKET == SOCK_SEQPACKET ? 1 : -1];

static void ss7_net_ns_unload(void);

#define TCP_NODELAY 1

static int ss7_glue_open(struct inode *, struct file *);
static int ss7_glue_release(struct inode *, struct file *);
static long ss7_glue_unlocked_ioctl(struct file *, unsigned int, unsigned long);
static unsigned int ss7_glue_poll(struct file *const, poll_table *);

/* Entry points for the "ss7server" character device.
 * Native and compat ioctl requests share one handler.
 * Written with C99 designated initialisers; the old GCC-only
 * "field: value" form is obsolete and the rest of this file
 * (see ss7_reboot_notifier_block) already uses ".field = value". */
static struct file_operations ss7dev_fop =
{
    .open           = ss7_glue_open,
    .release        = ss7_glue_release,
    .unlocked_ioctl = ss7_glue_unlocked_ioctl,
    .compat_ioctl   = ss7_glue_unlocked_ioctl,
    .poll           = ss7_glue_poll,
    .owner          = THIS_MODULE,
};

/* Reboot notifier callback.
 * Currently a placeholder: all three arguments are ignored and
 * 0 is always returned. */
static int ss7_reboot_notify(struct notifier_block *nb, unsigned long action,
        void *data)
{
    /* System being rebooted.
     * I added this hoping to use it to get the ss7maint daemon to exit,
     * but it isn't called until all user processes have died.
     * Leave it here - might be useful one day. */
    return 0;
}

/* Registered by ss7_init_mod() and unregistered by ss7_cleanup_mod() */
static struct notifier_block ss7_reboot_notifier_block = {
    .notifier_call = ss7_reboot_notify,
};

/* Undo the character device registration (if there is one) and hand
 * back 'rval' unchanged, so callers can write 'return ss7_init_fail(err)'.
 * Also used with rval 0 by the module unload path. */
static int
ss7_init_fail(int rval)
{
    /* Nothing was registered (or registration failed) */
    if (!(ss7dev_major > 0))
        return rval;

    unregister_chrdev(ss7dev_major, "ss7server");

    return rval;
}

/* Module load entry point.
 * Registers the "ss7server" character device, initialises the ss7 driver
 * core, optionally hooks into the Aculab DTLS interface and registers the
 * reboot notifier.
 * Returns 0 on success, the negative register_chrdev() result or -EIO
 * on failure. */
static int
ss7_init_mod(void)
{
    const void *(*dtls_register)(const char *, int (*)(struct dtls_get_if *));
    int rval;

    ss7_mutex_init(&ss7_glue_mutex);

    printk(KERN_INFO "%s\n", ss7version);

    /* Major 0 asks for a dynamically allocated major number */
    ss7dev_major = register_chrdev(0, "ss7server", &ss7dev_fop);

    if (ss7dev_major < 0) {
        printk(KERN_INFO "ss7server: register_chrdev() failed: %d\n",
                ss7dev_major);
        return ss7_init_fail(ss7dev_major);
    }

    rval = ss7_driver_init();
    if (rval != 0) {
        printk(KERN_INFO "ss7server: ss7_driver_init() failed: %d\n", rval);
        /* ss7_init_fail() unregisters the character device again */
        return ss7_init_fail(-EIO);
    }

    /* The DTLS hook is optional, a missing symbol is only logged */
    dtls_register = dacp_symbol_get("acuc_dtls_register");
    if (dtls_register == NULL)
        printk(KERN_INFO "ss7server: cannot locate \"acuc_dtls_register\"\n");
    else
        ss7_dtls_handle = dtls_register(DYNAMIC_TLS_PREFIX "ss7",
                ss7_tls_get_if);

    register_reboot_notifier(&ss7_reboot_notifier_block);
    return 0;
}

/* Module unload entry point.
 * Reverses ss7_init_mod(): removes the reboot notifier, drops the DTLS
 * registration (if one was made) and unregisters the character device. */
static void
ss7_cleanup_mod(void)
{
    int (*dtls_unregister)(const void *);

    unregister_reboot_notifier(&ss7_reboot_notifier_block);

    if (ss7_dtls_handle != NULL) {
        dtls_unregister = dacp_symbol_get("acuc_dtls_unregister");
        /* Release the "register" symbol that ss7_init_mod() looked up */
        dacp_symbol_release("acuc_dtls_register");
        if (dtls_unregister != NULL) {
            dtls_unregister(ss7_dtls_handle);
            dacp_symbol_release("acuc_dtls_unregister");
        }
    }

    /* Only used here for its side effect of unregistering the device */
    ss7_init_fail(0);

    printk(KERN_INFO "Aculab ss7server: driver unloaded\n");
}

module_init(ss7_init_mod)
module_exit(ss7_cleanup_mod)

/* open() handler for the ss7server device.
 * Counts the open in ss7_use_count and creates the per-file driver
 * context in filp->private_data.
 * Returns 0 on success, -EIO if the driver has already shut down or
 * ss7_devif_open() fails. */
static int
ss7_glue_open(struct inode *const inode, struct file *const filp)
{
    int rval, pid;

    if (filp->private_data)
        /* Duplicate open */
        return 0;

    ss7_mutex_enter(&ss7_glue_mutex);
    if (ss7_use_count < 0) {
        /* ss7_driver_shutdown() has been called, too late to do anything */
        ss7_mutex_exit(&ss7_glue_mutex);
        return -EIO;
    }
    ss7_use_count++;
    ss7_mutex_exit(&ss7_glue_mutex);

    /* Called without the mutex held */
    rval = ss7_devif_open(&filp->private_data);
    if (rval != 0) {
        /* Give back the use count taken above */
        ss7_mutex_enter(&ss7_glue_mutex);
        ss7_use_count--;
        ss7_mutex_exit(&ss7_glue_mutex);
        pid = ss7_pid();
        /* Stay quiet if the failing process is the one that sent SS7_STOP */
        if (pid != ss7_stop_pid)
            printk(KERN_INFO "ss7_devif_open() pid %d failed ss7 error %d\n",
                    pid, rval);
        return -EIO;
    }

    return 0;
}

/* release() handler for the ss7server device.
 * Closes the per-file driver context and drops the use count.  When the
 * last user goes away after an SS7_STOP request the shutdown is completed
 * and further opens are refused.  Always returns 0. */
static int
ss7_glue_release(struct inode *const inode, struct file *const filp)
{
    if (filp->private_data)
        ss7_devif_close(filp->private_data);

    ss7_mutex_enter(&ss7_glue_mutex);
    ss7_use_count--;

    if (ss7_use_count == 0 && ss7_stop_pid != 0) {
        /* Last user process has gone, complete shutdown functions */
        ss7_net_ns_unload();
        /* Stop any more opens */
        ss7_use_count = -1;
        ss7_driver_shutdown();
    }

    ss7_mutex_exit(&ss7_glue_mutex);

    return 0;
}

/* ioctl handler (also installed as the compat_ioctl handler).
 * Returns -ENODEV if the file has no driver context, -ENOTTY for an
 * unknown command, 0 for SS7_STOP and otherwise whatever ss7dev_ioctl()
 * returns. */
static long
ss7_glue_unlocked_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
    if (!filp->private_data)
        return -ENODEV;

    switch (cmd) {

    case SS7_STOP:          /* ss7maint shutting us down */
        /* Start shutdown now, will complete on last close */
        ss7_driver_stop();
        /* Remembered so ss7_glue_release() knows a stop was requested */
        ss7_stop_pid = ss7_pid();
        return 0;

    /* Request from ss7maint or user application */
    case SS7_USER_IOCTL_CODE:
        return ss7dev_ioctl(filp->private_data, cmd, arg);

    default:
        return -ENOTTY;
    }
}

/* poll() handler: register on the wait queue belonging to this file's
 * driver context, then report the context's current poll status. */
static unsigned int
ss7_glue_poll(struct file *filp, poll_table *pt)
{
    unsigned int status;

    poll_wait(filp, *ss7_devif_get_pollqueue_head(filp->private_data), pt);

    status = ss7_devif_get_poll_status(filp->private_data);
    return status;
}

/* Allocate 's' bytes of kernel memory with GFP_KERNEL.
 * 'ss7_flags' is accepted for the benefit of callers but is not used.
 * Returns whatever kmalloc() returns. */
void *
ss7_os_malloc(int s, int ss7_flags)
{
    void *mem;

    mem = kmalloc(s, GFP_KERNEL);
    return mem;
}

/* Release memory obtained from ss7_os_malloc() */
void
ss7_os_free(void *p)
{
    kfree(p);
}

/* Free a wait queue head allocated by ss7_poll_queue_head_init().
 * The caller's pointer is cleared afterwards so that it does not keep
 * referring to freed memory (and a repeated deinit becomes harmless). */
void
ss7_poll_queue_head_deinit(wait_queue_head_t **pqhp)
{
    ss7_os_free(*pqhp);
    *pqhp = NULL;
}

/* Allocate and initialise a wait queue head for poll support.
 * On success stores the new head in *pqhp and returns 0.
 * Returns -1 (leaving *pqhp untouched) if the allocation fails. */
int
ss7_poll_queue_head_init(wait_queue_head_t **pqhp)
{
    wait_queue_head_t *head;

    head = ss7_os_malloc(sizeof *head, 0);
    if (!head)
        return -1;

    init_waitqueue_head(head);
    *pqhp = head;

    return 0;
}

/* Wake tasks waiting on the poll wait queue.
 * 'poll_event' is not used by this implementation. */
void
ss7_pollwakeup(wait_queue_head_t **pqh, unsigned int poll_event)
{
    wake_up(*pqh);
}

/* Deliver signal 'signo' to 'task' using force_sig() */
void
ss7_kill_task(struct task_struct *task, int signo)
{
    /* Send signal even though set to SIG_IGN */
    force_sig(signo, task);
}


#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 32)
/* spinlock_t is a typedef for an unnamed structure so we can't
 * make 'struct spinlock' match the kernel spinlock type. */
#define SPINLOCK_CAST (spinlock_t *)
#else
#define SPINLOCK_CAST
#endif

/* Size in bytes of the kernel spinlock type, so that code that cannot
 * see the kernel headers can allocate one.
 * The operand is never evaluated: a null 'struct spinlock *' is
 * (optionally, via SPINLOCK_CAST) converted to the kernel type and
 * only the size of what it points at is taken. */
size_t
ss7_spin_lock_size(void)
{
    return sizeof *SPINLOCK_CAST(struct spinlock *)0;
}

/* Initialise a spinlock (storage of ss7_spin_lock_size() bytes) */
void
ss7_spin_lock_init(struct spinlock *s)
{
    spin_lock_init(SPINLOCK_CAST s);
}

/* Acquire a spinlock with spin_lock() (interrupt state is not changed) */
void
ss7_spin_lock_enter(struct spinlock *s)
{
    spin_lock(SPINLOCK_CAST s);
}

/* Release a spinlock taken by ss7_spin_lock_enter() */
void
ss7_spin_lock_exit(struct spinlock *s)
{
    spin_unlock(SPINLOCK_CAST s);
}

/* Size in bytes of a kernel 'struct mutex', for callers that have to
 * allocate one without seeing the kernel definition. */
size_t
ss7_mutex_size(void)
{
    const size_t bytes = sizeof(struct mutex);

    return bytes;
}

/* Initialise a mutex (storage of ss7_mutex_size() bytes) */
void
ss7_mutex_init(struct mutex *s)
{
    mutex_init(s);
}

/* Acquire a mutex, sleeping (uninterrupted by signals) until available */
void
ss7_mutex_enter(struct mutex *s)
{
    mutex_lock(s);
}

/* Try to acquire a mutex without risking sleeping forever.
 * Returns:
 *     1  mutex acquired
 *     0  mutex NOT acquired (owner is dead, or an assertion was logged)
 *    -1  wait interrupted by a signal
 * 'max_wait' is not used by this implementation. */
int
ss7_mutex_enter_tmo(struct mutex *s, int max_wait)
{
    /* There is no mutex_enter_timeout() however this was all added
     * to stop status commands sleeping forever when a process has
     * 'oopsed' with a mutex held.
     * Do a sneak check on the state of any owning task then
     * wait interruptibly.
     * ^C should error out the status call. */

    /* If uncontended just acquire */
    if (mutex_trylock(s))
        return 1;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
    {
        struct task_struct *owner;
        int state;

        /* Sample the owner's state while holding the mutex's wait_lock */
        spin_lock(&s->wait_lock);
        owner = __mutex_owner(s);
        state = owner ? owner->state : 0;
        spin_unlock(&s->wait_lock);
        if (state & TASK_DEAD)
            /* mutex will never be released, treat as timeout */
            return 0;
    }
#endif

    /* If C7_ASSERT() has been called, just let everyone in */
    /* NOTE(review): this returns 0, the same value as the 'timeout' case
     * above, and the mutex is not held - callers presumably carry on
     * without the lock; confirm against the callers. */
    if (asserted_task_count)
        return 0;

    return mutex_lock_interruptible(s) ? -1 /* EINTR */ : 1 /* acquired */;
}

/* Release a mutex held by the caller */
void
ss7_mutex_exit(struct mutex *s)
{
    mutex_unlock(s);
}

/* Size in bytes of a condition variable, which this glue layer
 * implements as a kernel wait_queue_head_t. */
size_t
ss7_cv_size(void)
{
    const size_t bytes = sizeof(wait_queue_head_t);

    return bytes;
}

/* Initialise a condition variable (storage of ss7_cv_size() bytes) */
void
ss7_cv_init(wait_queue_head_t *const v)
{
    init_waitqueue_head(v);
}

/* Sleep helper shared by the condition variable wait functions.
 * The caller must already have set the task state and queued itself.
 *   tmo_ms >= 0: sleep for at most tmo_ms milliseconds; returns 0 if the
 *                whole timeout elapsed, 1 otherwise.
 *   tmo_ms <  0: sleep with no timeout; returns 1 if no signal is pending
 *                on wakeup, 0 if a signal is pending, -1 if SIGKILL is
 *                among the task's pending signals. */
static int
ss7_schedule_tmo(int tmo_ms)
{
    int tmo_jiffies;

    /* Really sleep - unless woken since unlocking spinlock */
    if (tmo_ms >= 0) {
        if (tmo_ms <= 1)
            /* 0 and 1 are passed through unscaled */
            tmo_jiffies = tmo_ms;
        else
            /* Convert to jiffies and round up */
            /* i.e. 1 + tmo_ms * HZ / 1000, written with a factor of 16
             * in numerator and divisor - presumably so that the integer
             * divisor (16000/HZ) loses less precision for HZ values that
             * do not divide 1000. */
            tmo_jiffies = 1 + (tmo_ms + 1 - 1) * 16 / (16000/HZ);
        /* Return value of schedule_timeout() is unexpired timeout */
        /* We want 0 for 'timedout' (to match cv_wait_sig()) */
        return schedule_timeout(tmo_jiffies) != 0;
    }

    schedule();
    if (!signal_pending(current))
        /* Woken by the event */
        return 1;

    /* Report 0 for a signal, except -1 for SIGKILL (reboot) */
    return sigismember(&current->pending.signal, SIGKILL) ? -1 : 0;
}

/* Wait on condition variable 'cvp', releasing mutex 'mtxp' while asleep
 * and re-acquiring it before returning.
 *   interruptible: non-zero if a pending signal may end the wait.
 *   tmo_ms:        timeout in milliseconds, negative for no timeout.
 * The return value is that of ss7_schedule_tmo(). */
int
ss7_cv_wait_guts(wait_queue_head_t *cvp, struct mutex *mtxp,
        int interruptible, int tmo_ms)
{
    int r;
    struct wait_queue_entry w;
    int sleep_state;

    init_waitqueue_entry(&w, current);

    /* Tell scheduler we are going to sleep... */
    /* Note that TASK_UNINTERRUPTIBLE is only used when the wait is
     * non-interruptible AND a signal is already pending; all other
     * waits sleep in TASK_INTERRUPTIBLE. */
    if (signal_pending(current) && !interruptible)
        /* We don't want waking immediately (again) */
        sleep_state = TASK_UNINTERRUPTIBLE;
    else
        sleep_state = TASK_INTERRUPTIBLE;
    set_current_state(sleep_state);

    /* Connect to condition variable ... */
    /* This is done before the mutex is dropped so a wakeup issued by
     * whoever takes the mutex next cannot be missed. */
    add_wait_queue(cvp, &w);
    mutex_unlock(mtxp); /* Release mutex */

    r = ss7_schedule_tmo(tmo_ms);

    /* Disconnect from condition variable ... */
    remove_wait_queue(cvp, &w);

    /* Re-acquire mutex */
    mutex_lock(mtxp);

    /* return 1 if woken, 0 if timed_out/signal, -1 if SIGKILL */
    return r;
}

/* Wait on condition variable 'cvp', releasing spinlock 'lock' while
 * asleep and re-acquiring it before returning.
 * Same as ss7_cv_wait_guts() except for the type of lock.
 *   interruptible: non-zero if a pending signal may end the wait.
 *   tmo_ms:        timeout in milliseconds, negative for no timeout.
 * The return value is that of ss7_schedule_tmo(). */
int
ss7_cv_wait_spin_lock(wait_queue_head_t *cvp, struct spinlock *lock,
        int interruptible, int tmo_ms)
{
    int r;
    struct wait_queue_entry w;
    int sleep_state;

    init_waitqueue_entry(&w, current);

    /* Tell scheduler we are going to sleep... */
    if (signal_pending(current) && !interruptible)
        /* We don't want waking immediately (again) */
        sleep_state = TASK_UNINTERRUPTIBLE;
    else
        sleep_state = TASK_INTERRUPTIBLE;
    set_current_state(sleep_state);

    /* Connect to condition variable ... */
    add_wait_queue(cvp, &w);
    /* Release spinlock */
    spin_unlock(SPINLOCK_CAST lock);

    r = ss7_schedule_tmo(tmo_ms);

    /* Disconnect from condition variable ... */
    remove_wait_queue(cvp, &w);

    /* Re-acquire spinlock */
    spin_lock(SPINLOCK_CAST lock);

    /* return 1 if woken, 0 if timed_out/signal, -1 if SIGKILL */
    return r;
}

/*---------------------------------------------------------------------**
** ss7_cv_broadcast                                                    **
** Awaken all threads that are sleeping on a condition variable.       **
** Caller must use the associated mutex sensibly, i.e. ...             **
**      acquire the mutex                                              **
**      Set some flag that a sleeping thread will check for            **
**      ss7_cv_broadcast()                                             **
**      release the mutex                                              **
**---------------------------------------------------------------------*/

void
ss7_cv_broadcast(wait_queue_head_t *const cvp)
{
    /* The wait functions in this file queue themselves with
     * add_wait_queue(), i.e. as ordinary (presumably non-exclusive)
     * waiters, so a plain wake_up() is expected to reach all of them. */
    wake_up(cvp);
}


/* Copy 'c' bytes from kernel buffer 'from' to user buffer 'to'.
 * Returns the value returned by copy_to_user(). */
unsigned long
ss7_copy_to_user(void *to, const void *from, unsigned long c)
{
    unsigned long rval;

    rval = copy_to_user(to, from, c);
    return rval;
}

/* Copy 'c' bytes from user buffer 'from' to kernel buffer 'to'.
 * Returns the value returned by copy_from_user(). */
unsigned long
ss7_copy_from_user(void *to, const void *from, unsigned long c)
{
    unsigned long rval;

    rval = copy_from_user(to, from, c);
    return rval;
}

unsigned int
ss7_pid(void)
{
    return current->pid;
}

/* The calling task, for code that cannot use 'current' directly */
struct task_struct *
ss7_current_task(void)
{
    return current;
}

/* Pid of the given task */
unsigned int
ss7_task_pid(struct task_struct *task)
{
    unsigned int pid = task->pid;

    return pid;
}

/* Entry point of every kernel thread started by ss7_os_thread_create().
 * Runs the ss7 thread body, then exits through module_put_and_exit(),
 * which drops the module reference taken when the thread was created. */
int
ss7_glue_thread_fn(void *ss7_thread)
{
    ss7_thread_run(ss7_thread);
    module_put_and_exit(0);
    /* presumably not reached; keeps the compiler happy */
    return 0;
}

/* Start a kernel thread running ss7_thread_run(thrp).
 * The thread is named "ss7:" followed by (at most) the first two words
 * of 'desc'.  A module reference is held for the life of the thread.
 * Returns the new task, or NULL if the module reference could not be
 * taken or the thread could not be created. */
struct task_struct *
ss7_os_thread_create(struct ss7_thread *thrp, const char *desc)
{
    struct task_struct *thread;
    const char *word_end;
    int name_len = 100;

    if (!try_module_get(THIS_MODULE))
        return NULL;

    /* The thread description gets truncated to 15 chars, can't be helped!
     * Use 'ss7maint osstatus -t' to get the full description. */

    /* Skip a single leading space */
    if (*desc == ' ')
        desc++;

    /* Locate the space that ends the second word (if there is one)
     * and cut the name there. */
    word_end = ss7strchr(desc, ' ');
    if (word_end != NULL)
        word_end = ss7strchr(word_end + 1, ' ');
    if (word_end != NULL)
        name_len = word_end - desc;

    thread = kthread_run(ss7_glue_thread_fn, thrp, "ss7:%.*s", name_len, desc);
    if (!IS_ERR(thread))
        return thread;

    /* No thread to drop the reference for us */
    module_put(THIS_MODULE);
    return NULL;
}

/* Sleep for 'ms' milliseconds without being woken by signals.
 * The conversion to jiffies is done in 64 bits (so HZ * ms cannot
 * overflow) and rounds down. */
void
ss7_ms_delay(const unsigned int ms)
{
    set_current_state(TASK_UNINTERRUPTIBLE);
    schedule_timeout((unsigned long long)HZ * ms / 1000);
}

/* Current value of the kernel tick counter (jiffies), converted to int */
int
ss7_os_get_ticks(void)
{
    int now = jiffies;

    return now;
}

/* Convert a tick interval to microseconds (truncating towards zero).
 *
 * Whole seconds and the sub-second remainder are converted separately.
 * The direct form 'interval * 1000000 / HZ' overflows a signed int as
 * soon as interval exceeds about 2147 ticks, long before the result
 * itself stops fitting in an int.  The two forms give identical results
 * wherever the direct form does not overflow.
 * (The remainder term is safe provided HZ is below about 2147.) */
int
ss7_os_ticks_to_us(int interval)
{
    int whole_secs = interval / HZ;
    int odd_ticks = interval % HZ;

    return whole_secs * 1000000 + odd_ticks * 1000000 / HZ;
}

/* Convert a tick interval to milliseconds (truncating towards zero).
 *
 * Whole seconds and the sub-second remainder are converted separately.
 * The direct form 'interval * 1000 / HZ' overflows a signed int once
 * interval exceeds about 2.1 million ticks, although the result would
 * still fit in an int.  The two forms give identical results wherever
 * the direct form does not overflow. */
int
ss7_os_ticks_to_ms(int interval)
{
    int whole_secs = interval / HZ;
    int odd_ticks = interval % HZ;

    return whole_secs * 1000 + odd_ticks * 1000 / HZ;
}

/* Convert a tick interval to whole seconds (truncating) */
int
ss7_os_ticks_to_secs(int interval)
{
    int secs = interval / HZ;

    return secs;
}

/* Millisecond timestamp taken from the raw monotonic clock.
 * Seconds are counted from the tv_sec value seen on the first call that
 * finds 'epoch' still zero, which keeps the returned values small.
 * The result is an unsigned int, so it wraps when it overflows. */
unsigned int
ss7_get_ms_time(void)
{
    /* Zero means 'not set yet'; written without any locking */
    static unsigned long epoch;
    struct timespec now;

    getrawmonotonic(&now);

    if (epoch == 0)
       epoch = now.tv_sec;

    return (now.tv_sec - epoch) * 1000 + now.tv_nsec / 1000000;
}

/* Wall clock timestamp as filled in by ss7_get_timestamp() */
struct acu_ss7maint_time {
   unsigned int st_sec;     /* seconds (tv_sec truncated to unsigned int) */
   unsigned int st_usec;    /* microseconds within the second */
};

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)
/* Local replacement for do_gettimeofday() on kernels from 5.0 onwards,
 * built on ktime_get_real_ts64().  Nanoseconds are truncated to
 * microseconds. */
static inline void do_gettimeofday(struct timeval *tv)
{
        struct timespec64 ts;

        ktime_get_real_ts64(&ts);
        tv->tv_sec = ts.tv_sec;
        tv->tv_usec = ts.tv_nsec/1000u;
}
#endif

/* Store the current wall clock time (seconds and microseconds)
 * in *ptime. */
void
ss7_get_timestamp(struct acu_ss7maint_time *ptime)
{
    struct timeval tv;

    /* do_gettimeofday() returns 'wall clock time'.
     * It can go backwards.  */
    do_gettimeofday(&tv);
    ptime->st_sec = tv.tv_sec;
    ptime->st_usec = tv.tv_usec;
}

unsigned int
ss7_get_elapsed(const struct acu_ss7maint_time *epoch)
{
    struct timeval tv;
    do_gettimeofday(&tv);

    return tv.tv_sec - epoch->st_sec;
}

/* Log an error message from the ss7 code at KERN_EMERG level.
 * Messages that start with "Assertion fail" additionally get a stack
 * dump, and the calling task is recorded in asserted_tasks[].
 *
 * The prefix is checked with strncmp() rather than memcmp() so that a
 * message shorter than the prefix is never read beyond its terminator. */
void
ss7_os_log_error(const char *text)
{
    static const char assert_prefix[] = "Assertion fail";

    printk(KERN_EMERG "ss7server: %s", text);

    if (strncmp(text, assert_prefix, sizeof assert_prefix - 1) != 0)
        return;

    dump_stack();
    /* Although we return, the caller sleeps forever */
    /* Remember the 'stuck' tasks; the slot index wraps round the array */
    asserted_tasks[asserted_task_count++ % ARRAY_SIZE(asserted_tasks)] = current;
}

/*---------------------------------------------------------------------**
** Miscellaneous string and memory functions                           **
**---------------------------------------------------------------------*/

/* Set 'len' bytes starting at 'buf' to zero */
void
ss7memzero(void *buf, size_t len)
{
    (void)memset(buf, 0, len);
}

/* Copy 'len' bytes from 'src' to 'dest'; the areas must not overlap
 * (use ss7_memmove() if they might). */
void
ss7memcpy(void *dest, const void *src, size_t len)
{
    (void)memcpy(dest, src, len);
}

/* Copy 'len' bytes from 'src' to 'dest'; the areas may overlap */
void
ss7_memmove(void *dest, const void *src, size_t len)
{
    (void)memmove(dest, src, len);
}

/* Compare 'len' bytes of two memory areas.
 * Returns the memcmp() result: zero if equal, otherwise negative or
 * positive according to the first differing byte. */
int
ss7memcmp(const void *s1, const void *s2, size_t len)
{
    int diff;

    diff = memcmp(s1, s2, len);
    return diff;
}

/* Length of a NUL terminated string, as an unsigned int */
unsigned int
ss7strlen(const char *str)
{
    unsigned int len = strlen(str);

    return len;
}

/* Copy the NUL terminated string 'src' (terminator included) to 'dest'.
 * No length check is made; 'dest' must be large enough. */
void
ss7strcpy(char *dest, const char *src)
{
    (void)strcpy(dest, src);
}

/* Compare two NUL terminated strings.
 * Returns the strcmp() result: zero if equal, otherwise negative or
 * positive according to the first differing character. */
int
ss7strcmp(const char *dest, const char *src)
{
    int diff;

    diff = strcmp(dest, src);
    return diff;
}

/* Bounded string copy with strncpy() semantics: exactly 'n' bytes of
 * 's1' are written, padding with NULs if 's2' is shorter, and 's1' is
 * NOT terminated if 's2' has 'n' or more characters.  Returns 's1'. */
char *
ss7strncpy(char *const s1, const char *s2, size_t n)
{
    (void)strncpy(s1, s2, n);

    return s1;
}

/* Find the first occurrence of character 'c' in string 's'.
 * Returns a pointer to it, or NULL if it does not occur. */
char *
ss7strchr(const char *s, const int c)
{
    char *hit;

    hit = strchr(s, c);
    return hit;
}

/*---------------------------------------------------------------------**
** TCP/IP functions                                                    **
**---------------------------------------------------------------------*/

/* Tell the ss7 code whether SCTP can be used; this glue layer
 * always answers yes (1). */
int
ss7_sctp_supported(void)
{
    return 1;
}

/* Select the address family options for listening sockets.
 * Both arguments are currently ignored; the compile-time value
 * SS7_DEFAULT_AF_OPTS is returned for every protocol and port. */
unsigned int
ss7_get_default_af_opts(unsigned int protocol, unsigned int port)
{
    /* The SS7 driver needs to know which address families (IPv4 or IPv6)
     * to use for listening sockets.
     *
     * Whether an IPV6 socket can accept IPV4 connections depends on
     * the IPV6_V6ONLY socket option. The default for which depends
     * on net.ipv6.bindv6only (which usually defaults to 0 - allowing IPV4).
     * There also might be kernels where clearing IPV6_V6ONLY is disallowed.
     *
     * Normally only a single socket is created for each port since an IPv6
     * socket can receive IPv4 connections. However a separate IPv4 socket
     * can be requested.
     *
     * This function should return one of:
     *    SS7_AF_OPT_IPv6
     *        IPV6 socket with the default IPV6_V6ONLY value.
     *    SS7_AF_OPT_IPv6_V6ONLY_CLR
     *        IPV6 socket with IPV6_V6ONLY explicitly cleared.
     *    SS7_AF_OPT_IPv6_V6ONLY_SET
     *        IPV6 socket with IPV6_V6ONLY explicitly set.
     * Possibly logically ored with:
     *    SS7_AF_OPT_IPv4
     *        A separate IPv4 socket.
     *
     * For flexibility the decision can be based on the protocol (either
     * IPPROTO_SCTP or IPPROTO_TCP) or the port number.
     *
     * Default to creating a single socket and disabling IPV6_V6ONLY.
     *
     * NOTE(review): the fallback below is SS7_AF_OPT_IPv6, which the list
     * above describes as "the default IPV6_V6ONLY value", not as
     * explicitly cleared - confirm which of the two is intended.
     */
    /* The default can be overridden by defining SS7_DEFAULT_AF_OPTS
     * before this point (e.g. on the compiler command line). */
#ifndef SS7_DEFAULT_AF_OPTS
#define SS7_DEFAULT_AF_OPTS SS7_AF_OPT_IPv6
#endif
     return SS7_DEFAULT_AF_OPTS;
}

/* kernel_get/set_sockopt() prototypes have (char *) for the buffer.
 * #define a (void *) cast.
 *
 * The macros deliberately have the same names as the functions they
 * wrap: a macro is not expanded again inside its own expansion, so the
 * inner name refers to the real kernel function.
 *
 * 'val' is parenthesised so that the cast applies to the whole argument
 * expression.  Without the parentheses an argument such as 'p + 1'
 * would be cast first and offset afterwards, giving a different
 * address under GNU C's void pointer arithmetic.
 */
#define kernel_setsockopt(sock, level, name, val, len) \
        kernel_setsockopt(sock, level, name, (void *)(val), len)
#define kernel_getsockopt(sock, level, name, val, len) \
        kernel_getsockopt(sock, level, name, (void *)(val), len)

/* Note that we can't (easily) hold reference counts on the namespace
 * because put_net() is GPL_ONLY.
 * Instead we keep our own table and create a socket to hold the
 * reference for us.
 * Table entries 0 and 1 always refer to init_net and the namespace
 * of the (last started) ss7 daemon. Neither is reference counted
 * (although we hold a single reference on the latter).
 * Higher entries are saved from invocations of 'ss7maint start'
 * and 'firmware download'. */

/* One slot per network namespace known to the driver.
 * The reference counts are changed with ss7_glue_mutex held. */
static struct ss7_ns_info {
    struct net    *ni_net_ns;       /* The namespace, NULL when slot is free */
    struct socket *ni_sock;         /* Socket held on the namespace, or NULL */
    unsigned int  ni_refcount;      /* Users of this slot (slots above 'daemon' only) */
} ss7_ns_table[256];

/* Create an IPv4 UDP socket inside 'net'.
 * Returns the new socket, or NULL when __sock_create() reports an error. */
static struct socket *
ss7_glue_create_ns_socket(struct net *net)
{
    struct socket *ns_sock;
    int err;

    err = __sock_create(net, AF_INET, SOCK_DGRAM, IPPROTO_UDP, &ns_sock, 0);
    return err ? NULL : ns_sock;
}

void
ss7_net_ns_get(unsigned int namespace)
{
    unsigned int idx = SS7_NET_NS_IDX(namespace);

    if (idx <= SS7_NET_NS_IDX(SS7_NET_NS_DAEMON))
        /* SS7_NET_NS_INIT and SS7_NET_NS_DAEMON aren't ref-counted */
        return;

    ss7_mutex_enter(&ss7_glue_mutex);
    ss7_ns_table[idx].ni_refcount++;
    ss7_mutex_exit(&ss7_glue_mutex);

    ss7_trace_printf(0, "ss7_net_ns_get(%x): refcount %d, sock %p, net %p\n",
            namespace, ss7_ns_table[idx].ni_refcount, ss7_ns_table[idx].ni_sock,
            ss7_ns_table[idx].ni_net_ns);
}

/* Drop one reference on the table slot selected by 'namespace'.
 * When the count reaches zero the slot's socket is released and the
 * slot is marked free.  Nothing is done for the init and daemon slots. */
void
ss7_net_ns_put(unsigned int namespace)
{
    unsigned int slot = SS7_NET_NS_IDX(namespace);
    struct ss7_ns_info *entry;

    /* SS7_NET_NS_INIT and SS7_NET_NS_DAEMON aren't ref-counted */
    if (slot <= SS7_NET_NS_IDX(SS7_NET_NS_DAEMON))
        return;

    entry = &ss7_ns_table[slot];

    ss7_trace_printf(0, "ss7_net_ns_put(%x): refcount %d, sock %p, net %p\n",
            namespace, entry->ni_refcount, entry->ni_sock, entry->ni_net_ns);

    ss7_mutex_enter(&ss7_glue_mutex);
    if (entry->ni_refcount != 0) {
        entry->ni_refcount--;
        if (entry->ni_refcount == 0) {
            /* That was the last user - free the slot */
            sock_release(entry->ni_sock);
            entry->ni_net_ns = NULL;
            entry->ni_sock = NULL;
        }
    }
    ss7_mutex_exit(&ss7_glue_mutex);
}

/* Release every socket still recorded in the table (slot 0 is skipped)
 * and reset those slots, logging each one that was still in use. */
static void
ss7_net_ns_unload(void)
{
    struct ss7_ns_info *entry;
    unsigned int slot;

    for (slot = 1; slot < ARRAY_SIZE(ss7_ns_table); slot++) {
        entry = &ss7_ns_table[slot];
        if (entry->ni_sock) {
            /* Only the 'daemon' slot is expected to get here */
            printk(KERN_INFO "ss7_net_ns_unload(): idx %d, refcount %d, sock %p, net %p\n",
                    slot, entry->ni_refcount, entry->ni_sock, entry->ni_net_ns);
            sock_release(entry->ni_sock);
            entry->ni_refcount = 0;
            entry->ni_sock = NULL;
            entry->ni_net_ns = NULL;
        }
    }
}

/* Select the namespace table slot for a namespace request.
 *
 * new_namespace: the requested namespace, should have the low 16 bits zero.
 * old_namespace: the value currently in use; its low bits give the table
 *                index that was actually being used.
 *
 * Unless new_namespace is SS7_NET_NS_START the old reference is dropped
 * and SS7_NET_NS_DAEMON or SS7_NET_NS_INIT is returned.
 * For SS7_NET_NS_START the namespace of the current process is looked up
 * in (or added to) the table and SS7_NET_NS_START | index is returned.
 * Plain SS7_NET_NS_START is returned when the table is full or the
 * socket for a new slot cannot be created.
 *
 * A slot above the daemon slot is returned with one reference held for
 * the caller; it is dropped again by ss7_net_ns_put().
 */
unsigned int
ss7_net_ns_set(unsigned int new_namespace, unsigned int old_namespace)
{
    static unsigned int num_used_idx = 2;
    unsigned int idx, free_idx;
    struct ss7_ns_info *ni;
    struct net *net;

    /* The new_namespace should have the low 16 bits zero.
     * The low bits of old_namespace indicate what was actually being used. */

    if (new_namespace != SS7_NET_NS_START) {
        ss7_net_ns_put(old_namespace);
        return new_namespace == SS7_NET_NS_DAEMON ? SS7_NET_NS_DAEMON : SS7_NET_NS_INIT;
    }

    /* SS7_NET_NS_START - look for an entry for the namespace of the current
     * process (which will be 'ss7maint start'). */
    net = current->nsproxy->net_ns;

    idx = SS7_NET_NS_IDX(old_namespace);
    ni = ss7_ns_table + idx;
    if (ni->ni_net_ns == net)
        /* Unchanged index, no need to change reference count */
        return SS7_NET_NS_START | idx;

    /* Different slot needed, drop old reference */
    ss7_net_ns_put(old_namespace);

    /* Check init and daemon entries, neither goes away */
    if (idx != SS7_NET_NS_IDX(SS7_NET_NS_INIT)
            && net == &init_net)
        return SS7_NET_NS_START | SS7_NET_NS_IDX(SS7_NET_NS_INIT);

    idx = SS7_NET_NS_IDX(SS7_NET_NS_DAEMON);
    ni = ss7_ns_table + idx;
    if (net == ni->ni_net_ns)
        return SS7_NET_NS_START | idx;

    ss7_mutex_enter(&ss7_glue_mutex);

    /* Scan table for an existing reference, remembering the lowest free
     * slot on the way (0 is never a candidate so it means 'none'). */
    free_idx = 0;
    for (idx = 2; idx < num_used_idx; idx++) {
        ni = ss7_ns_table + idx;
        if (ni->ni_net_ns == net) {
            /* found a match */
            ni->ni_refcount++;
            ss7_mutex_exit(&ss7_glue_mutex);
            ss7_trace_printf(0, "ss7_net_ns_set(%x, %x): found idx %d, refcount %d, sock %p, net %p\n",
                    new_namespace, old_namespace, idx, ni->ni_refcount, ni->ni_sock, ni->ni_net_ns);
            return SS7_NET_NS_START | idx;
        }
        if (!free_idx && !ni->ni_net_ns)
            free_idx = idx;
    }

    /* Not found allocate lowest free slot */
    if (!free_idx) {
        if (num_used_idx >= ARRAY_SIZE(ss7_ns_table))
            /* Table full, borked */
            goto no_ref;
        free_idx = num_used_idx++;
    }

    ni = &ss7_ns_table[free_idx];
    ni->ni_sock = ss7_glue_create_ns_socket(net);
    if (!ni->ni_sock)
        /* ni_net_ns is still NULL so the slot stays available */
        goto no_ref;
    ni->ni_net_ns = net;
    /* The caller holds the first reference.  Without this the count stays
     * at zero: the caller's ss7_net_ns_put() would never release the
     * socket, and a put from a second user would release it while the
     * first user still depends on the slot. */
    ni->ni_refcount = 1;

    ss7_mutex_exit(&ss7_glue_mutex);
    ss7_trace_printf(0, "ss7_net_ns_set(%x, %x): new idx %d, sock %p, net %p\n",
            new_namespace, old_namespace, free_idx, ni->ni_sock, ni->ni_net_ns);

    return SS7_NET_NS_START | free_idx;

  no_ref:
    ss7_mutex_exit(&ss7_glue_mutex);
    ss7_trace_printf(0, "ss7_net_ns_set(%x, %x): no_ref\n",
            new_namespace, old_namespace);
    return SS7_NET_NS_START;
}

void
ss7_glue_daemon_open(void)
{
    struct ss7_ns_info *ni = &ss7_ns_table[SS7_NET_NS_IDX(SS7_NET_NS_DAEMON)];
    struct net *net = current->nsproxy->net_ns;

    /* Save (and reference count) the network namespace the ss7 daemon
     * is started in. */

    /* Initialise the entry for init_net here - has to be done somewhere. */
    ss7_ns_table[SS7_NET_NS_IDX(SS7_NET_NS_INIT)].ni_net_ns = &init_net;

    if (net == ni->ni_net_ns)
        /* Unchanged */
        return;

    if (ni->ni_sock)
        sock_release(ni->ni_sock);
    ni->ni_sock = NULL;

    if (net != &init_net && !((ni->ni_sock = ss7_glue_create_ns_socket(net))))
        /* Can't create socket, default to global namespace */
        net = &init_net;

    ni->ni_net_ns = net;
}

/* Create a socket in the namespace recorded for 'namespace' (init_net if
 * the table slot is empty) and enable SO_REUSEADDR on it.
 * Returns 0 with the socket in *sockp, or the error from __sock_create(). */
int
ss7_socket(int family, int type, int protocol, unsigned int namespace, struct socket **sockp)
{
    struct net *ns = ss7_ns_table[SS7_NET_NS_IDX(namespace)].ni_net_ns;
    unsigned int reuse = 1U;
    int err;

    if (!ns)
        ns = &init_net;

    /* Creating the socket may autoload the sctp module; hold the mutex so
     * that we cannot re-enter it before it has finished initialising. */
    ss7_mutex_enter(&ss7_glue_mutex);

    /* sock_create_kern() gives a socket that doesn't hold a reference to
     * the namespace (such sockets are for the protocol stack's own use).
     * We want the namespace reference, so a 'user' socket is created in
     * the chosen namespace instead.
     * That brings in an extra security check, which we should pass because
     * all the sockets are created by kernel threads.
     */
    err = __sock_create(ns, family, type, protocol, sockp, 0);
    ss7_mutex_exit(&ss7_glue_mutex);
    if (err != 0)
        return err;

    /* Best effort, the result is not checked */
    kernel_setsockopt(*sockp, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof reuse);

    return 0;
}

/* Set the 'no delay' option of the socket's own protocol to 'enabled':
 * TCP_NODELAY for TCP sockets, SCTP_NODELAY otherwise. */
void
ss7_setsockopt_nodelay(struct socket *sock, int enabled)
{
    int optname;

    if (SK_PROTOCOL(sock) == IPPROTO_TCP)
        optname = TCP_NODELAY;
    else
        optname = SCTP_NODELAY;

    kernel_setsockopt(sock, SK_PROTOCOL(sock), optname,
            &enabled, sizeof enabled);
}

static void
ss7_sctp_set_opts(struct socket *sock)
{
    struct sctp_event_subscribe events;
    int len, rval;

    if (SK_PROTOCOL(sock) != IPPROTO_SCTP)
        return;

    len = sizeof events;
    rval = kernel_getsockopt(sock, IPPROTO_SCTP, SCTP_EVENTS, &events, &len);
    if (rval != 0)
        return;

    /* We need to know the stream and ppid */
    events.sctp_data_io_event = 1;
    /* Enable notifications to detect connection restart */
    events.sctp_association_event = 1;
    kernel_setsockopt(sock, IPPROTO_SCTP, SCTP_EVENTS, &events, sizeof events);
}

unsigned int
ss7_get_max_sctp_ostreams(struct socket *sock)
{
    struct sctp_status sstat;
    int len;

    if (SK_PROTOCOL(sock) != IPPROTO_SCTP)
        return 0;

    len = sizeof sstat;
    if (kernel_getsockopt(sock, IPPROTO_SCTP, SCTP_STATUS, &sstat, &len))
        return 0;

    return sstat.sstat_outstrms;
}

void
ss7_set_max_sctp_streams(struct socket *sock, unsigned int max_streams)
{
    struct sctp_initmsg sinit;

    if (SK_PROTOCOL(sock) != IPPROTO_SCTP)
        return;

    memset(&sinit, 0, sizeof sinit);

    sinit.sinit_num_ostreams = max_streams;
    sinit.sinit_max_instreams = max_streams;
    kernel_setsockopt(sock, IPPROTO_SCTP, SCTP_INITMSG, &sinit, sizeof sinit);
}

/* Apply the options wanted on a data socket: no-delay, the SCTP event
 * subscription, and SO_KEEPALIVE when the socket is TCP. */
void
ss7_trans_setsockopt(struct socket *sock)
{
    unsigned int keepalive = 1U;

    ss7_setsockopt_nodelay(sock, 1);
    ss7_sctp_set_opts(sock);

    if (SK_PROTOCOL(sock) != IPPROTO_TCP)
        return;

    kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, &keepalive, sizeof keepalive);
}

/* Apply the options wanted on a listening socket. */
void
ss7_transbind_setsockopt(struct socket *sock)
{
    /* M3UA may need 16 data streams, it is just TFH to configure this */
    const unsigned int wanted_streams = 1 + 16;

    ss7_sctp_set_opts(sock);
    ss7_set_max_sctp_streams(sock, wanted_streams);
}

/* Address length to pass with 'sa': the size of the object 'sa' points to
 * when the family is AF_INET6, otherwise 16 (presumably
 * sizeof (struct sockaddr_in) - TODO confirm). */
#define IP_ADDR_LEN(sa) ((sa)->sin6_family == AF_INET6 ? sizeof *(sa) : 16)
/* Connect 'sock' to the address in 'sa'; returns the kernel_connect() result. */
int
ss7_connect(struct socket *sock, struct sockaddr_in6 *sa)
{
    int rval;

    rval = kernel_connect(sock, (void *)sa, IP_ADDR_LEN(sa), O_RDWR);
    return rval;
}

/* Bind 'sock' to the address in 'sa'; returns the kernel_bind() result.
 * When the all-zero IPv6 address is bound and 'af_opts' asks for it,
 * IPV6_V6ONLY is set to the low bit of 'af_opts' first. */
int
ss7_bind(struct socket *sock, struct sockaddr_in6 *sa, unsigned int af_opts)
{
    /* Binding INADDR6_ANY on an IPv6 socket is typically done for a
     * listening socket, where we probably want IPV6_V6ONLY to be 0 so
     * that IPv4 connections are delivered to the socket as well. */
    int v6only_requested = sa->sin6_family == AF_INET6
            && (af_opts & SS7_AF_OPT_IPv6_V6ONLY);

    if (v6only_requested
            && (sa->sin6_addr.in6_u.u6_addr32[0]
                | sa->sin6_addr.in6_u.u6_addr32[1]
                | sa->sin6_addr.in6_u.u6_addr32[2]
                | sa->sin6_addr.in6_u.u6_addr32[3]) == 0) {
        int v6only = af_opts & 1;
        kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, sizeof v6only);
    }

    return kernel_bind(sock, (void *)sa, IP_ADDR_LEN(sa));
}

/* Add the address in 'sa' to an SCTP socket with SCTP_SOCKOPT_BINDX_ADD.
 * Returns -EPROTONOSUPPORT for any other protocol, otherwise the
 * result of the setsockopt call. */
int
ss7_bindx(struct socket *sock, struct sockaddr_in6 *sa)
{
    if (SK_PROTOCOL(sock) == IPPROTO_SCTP)
        return kernel_setsockopt(sock, IPPROTO_SCTP, SCTP_SOCKOPT_BINDX_ADD,
                sa, IP_ADDR_LEN(sa));

    return -EPROTONOSUPPORT;
}

/* Wrapper for kernel_listen(); 'len' is passed straight through as its
 * second argument and the result is returned unchanged. */
int
ss7_listen(struct socket *sock, int len)
{
    return kernel_listen(sock, len);
}

/* Wrapper for kernel_accept(); all arguments are passed straight through
 * and the result is returned unchanged. */
int
ss7_accept(struct socket *sock, struct socket **new_sockp, int flags)
{
    return kernel_accept(sock, new_sockp, flags);
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0)
/* Give the three-argument kernel_getsockname() of these kernels the
 * two-argument form used below: the address length is returned on
 * success and the error code otherwise. */
static inline int
ss7_kernel_getsockname(struct socket *sock, struct sockaddr *address)
{
    int addr_len;
    int rval = kernel_getsockname(sock, address, &addr_len);

    if (rval)
        return rval;
    return addr_len;
}
/* Must follow the function so that the call above is the kernel's own */
#define kernel_getsockname ss7_kernel_getsockname

/* Give the three-argument kernel_getpeername() of these kernels the
 * two-argument form used below: the address length is returned on
 * success and the error code otherwise. */
static inline int
ss7_kernel_getpeername(struct socket *sock, struct sockaddr *address)
{
    int addr_len;
    int rval = kernel_getpeername(sock, address, &addr_len);

    if (rval)
        return rval;
    return addr_len;
}
/* Must follow the function so that the call above is the kernel's own */
#define kernel_getpeername ss7_kernel_getpeername
#endif

/* Return the local port of 'sock' in host byte order, or 0 when the
 * local address cannot be obtained. */
int
ss7_get_loc_port(struct socket *sock)
{
    char addr_buf[128 /*MAX_SOCK_ADDR*/];
    struct sockaddr_in *sin = (struct sockaddr_in *)addr_buf;

    if (kernel_getsockname(sock, (struct sockaddr *)addr_buf) < 0)
        return 0;

    /* Reading the port this way works well enough for IPv4 and IPv6 */
    return ntohs(sin->sin_port);
}

/* Fetch the peer address of 'sock' into *saddr.
 * Returns 0 on success or the negative value from kernel_getpeername().
 * An address reported as longer than *saddr is only logged. */
int
ss7_get_rem_addr(struct socket *sock, struct sockaddr_in6 *saddr)
{
    int addr_len = kernel_getpeername(sock, (struct sockaddr *)saddr);

    if (addr_len < 0)
        return addr_len;

    if (addr_len > sizeof *saddr) {
        printk(KERN_EMERG "ss7server: socket address (family %d) %d > %d",
                saddr->sin6_family, addr_len, (int)sizeof *saddr);
    }

    return 0;
}

/* Shut down 'sock'.  Normally this is kernel_sock_shutdown(); on kernels
 * before 3.18 an SCTP socket is instead marked to be aborted when it is
 * released, and 0 is returned. */
int
ss7_shutdown(struct socket *sock, int how)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0)
    if (SK_PROTOCOL(sock) == IPPROTO_SCTP) {
        /* kernel_sock_shutdown() doesn't release the connection until all
         * outstanding data has been acked.  Should the remote system send
         * an INIT (restarting the connection) while the linux kernel is
         * waiting for those acks, the disconnect never happens.
         * With 'linger' enabled and a zero delay sock_release() aborts
         * the connection instead (an ABORT chunk is sent).
         *
         * The ss7 code never has to wait for sent data to be acked, so
         * aborting doesn't really matter, and every ss7_shutdown() call
         * is immediately followed by ss7_closesocket().
         *
         * Plausibly connections should always be aborted when the
         * disconnect is due to an application level timeout.
         *
         * The kernel fix is the patch:
         *    "sctp: handle association restarts when the socket is closed"
         * which is known to be in:
         *  - mainline 3.18
         *  - Ubuntu 3.13.11.11
         * and was queued for 3.10-stable, 3.14-stable, 3.16-stable and
         * 3.17-stable.
         */
        struct linger abort_on_close = { .l_onoff = 1, .l_linger = 0 };

        kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
                &abort_on_close, sizeof abort_on_close);

        return 0;
    }
#endif
    return kernel_sock_shutdown(sock, how);
}

/* Release 'sock' with sock_release(); the socket must not be used
 * by the caller afterwards. */
void
ss7_closesocket(struct socket *sock)
{
    sock_release(sock);
}

/* Send the 'iovlen' buffers described by 'iov' ('totlen' bytes in all)
 * on 'sock'.
 * 'ctl'/'ctl_len' give optional control (ancillary) data and 'flags' the
 * message flags; MSG_NOSIGNAL is always added.
 * Returns the result of kernel_sendmsg(). */
int
ss7_send(struct socket *sock, struct ss7_iovec *iov, int iovlen, int totlen,
        void *ctl, int ctl_len, unsigned int flags)
{
    struct msghdr msg;

    /* Clear the whole header: struct msghdr has more members than the
     * ones set below and they must not contain stack garbage.
     * This also leaves msg_name NULL and msg_namelen zero. */
    memset(&msg, 0, sizeof msg);

    msg.msg_control    = ctl;
    msg.msg_controllen = ctl_len;
    msg.msg_flags      = flags | MSG_NOSIGNAL;

    return kernel_sendmsg(sock, &msg, iov, iovlen, totlen);
}

/* Receive up to 'length' bytes from 'sock' into 'data'.
 * Returns 0 if the socket has no 'sk', otherwise the result of
 * kernel_recvmsg().
 * NOTE(review): 'flags' is accepted but not forwarded, kernel_recvmsg()
 * is always called with flags 0 - confirm that this is intended. */
int
ss7_recv(struct socket *sock, unsigned char *data, int length, int flags)
{
    struct kvec iov;
    struct msghdr msg;

    if (!sock->sk)
        return 0;

    iov.iov_len        = length;
    iov.iov_base       = data;

    /* Clear the whole header: struct msghdr has more members than the
     * name, control and flags fields and they must not contain stack
     * garbage.  No address, no control data and no flags are wanted, so
     * all-zero is the complete setup. */
    memset(&msg, 0, sizeof msg);

    return kernel_recvmsg(sock, &msg, &iov, 1, length, 0);
}

/* Receive one SCTP data record from 'sock'.
 *
 * buf_1/len_1 refer to the normal ss7 message buffer area, buf_2/len_2
 * are per-socket; long messages get copied together by the caller.
 * When SCTP_SNDRCV control data is returned the stream and ppid are
 * passed to ss7_trans_set_msg_info() for 'ss7_msg'.
 *
 * Returns:
 *    0         if the socket has no 'sk' or kernel_recvmsg() returned 0,
 *    > 0       the length of a complete, valid SIGTRAN message,
 *    -EBADMSG  the data is an SCTP notification,
 *    -EIO      the data is not a single complete message,
 *    other negative values from kernel_recvmsg().
 */
int
ss7_recv_sctp(struct socket *sock, void *buf_1, int len_1, void *buf_2,
    int len_2, struct ss7_msgb *ss7_msg)
{
    struct msghdr msg;
    struct kvec iov[2];
    unsigned char *data = buf_1;
    unsigned int msg_len;
    int ctl_len;
    int rval;
    union {
        struct cmsghdr cmsg;
        unsigned int buf[16];
    } ctlbuf;

    if (!sock->sk)
        return 0;

    /* For SCTP each recvmsg should give us a single data record.
     * Since we only ever send SIGTRAN encoded messages bytes 4-7 are the
     * length - and should match that of the sctp data chunk.
     * The result is always a single valid SIGTRAN message */

    iov[0].iov_base    = buf_1;
    iov[0].iov_len     = len_1;
    iov[1].iov_base    = buf_2;
    iov[1].iov_len     = len_2;

    /* Clear the whole header: struct msghdr has more members than the
     * ones set below and they must not contain stack garbage. */
    memset(&msg, 0, sizeof msg);
    msg.msg_control    = &ctlbuf;
    msg.msg_controllen = sizeof ctlbuf;

    rval = kernel_recvmsg(sock, &msg, iov, 2, len_1 + len_2, 0);

    if (rval <= 0)
        /* Don't return EBADMSG here */
        return rval != -EBADMSG ? rval : -EIO;

    if (msg.msg_flags & MSG_NOTIFICATION)
        /* msg data is a notification */
        return -EBADMSG;

    /* msg_control has been advanced past the control data written.
     * Only look at the header if a whole one is there (ctlbuf is not
     * initialised), and only at the payload if all of it was written. */
    ctl_len = (char *)msg.msg_control - (char *)&ctlbuf;
    if (ctl_len >= (int)sizeof ctlbuf.cmsg
            && ctl_len >= ctlbuf.cmsg.cmsg_len
            && ctlbuf.cmsg.cmsg_level == IPPROTO_SCTP
            && ctlbuf.cmsg.cmsg_type == SCTP_SNDRCV) {
        struct sctp_sndrcvinfo *sinfo = CMSG_DATA(&ctlbuf.cmsg);

        if ((char *)(sinfo + 1) <= (char *)&ctlbuf + ctl_len)
            ss7_trans_set_msg_info(ss7_msg, sinfo->sinfo_stream, sinfo->sinfo_ppid);
    }

    if (rval < 8)
        /* Too short to contain the length field in bytes 4-7 */
        return -EIO;

    /* Big-endian length; unsigned arithmetic so that a set top bit
     * cannot be shifted into the sign bit. */
    msg_len = (unsigned int)data[4] << 24 | (unsigned int)data[5] << 16
            | (unsigned int)data[6] << 8 | (unsigned int)data[7];
    if (msg_len >= 65556)
        /* Disbelieve this is valid data */
        return -EIO;

    if ((unsigned int)rval != msg_len || !(msg.msg_flags & MSG_EOR))
        return -EIO;
    return rval;
}

/* Build an SCTP_SNDRCV control message in 'buf' ('maxlen' bytes long).
 * The sctp_sndrcvinfo payload is zeroed and *stream and *ppid are set to
 * point at its sinfo_stream and sinfo_ppid members so that the caller
 * can fill them in later.
 * Returns the control message length, or -1 if 'buf' is too small. */
int
ss7_trans_init_sctp_sinfo(void *buf, int maxlen, __u16 **stream, __u32 **ppid)
{
    struct cmsghdr *cmsg;
    struct sctp_sndrcvinfo *sinfo;

    /* Check the sign first: a negative 'maxlen' would be converted to a
     * very large unsigned value by the comparison with CMSG_LEN() and
     * so would be accepted. */
    if (maxlen < 0 || (size_t)maxlen < CMSG_LEN(sizeof *sinfo))
        return -1;

    cmsg = buf;
    cmsg->cmsg_level = IPPROTO_SCTP;
    cmsg->cmsg_type = SCTP_SNDRCV;
    cmsg->cmsg_len = CMSG_LEN(sizeof *sinfo);
    sinfo = CMSG_DATA(cmsg);
    memset(sinfo, 0, sizeof *sinfo);
    *stream = &sinfo->sinfo_stream;
    *ppid = &sinfo->sinfo_ppid;

    return CMSG_LEN(sizeof *sinfo);
}

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 20/33] ipv4: add ip_sock_set_recverr
  2020-05-14 11:51       ` Joe Perches
@ 2020-05-20 14:18         ` Christoph Hellwig
  0 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2020-05-20 14:18 UTC (permalink / raw)
  To: Joe Perches
  Cc: Christoph Hellwig, David S. Miller, Jakub Kicinski, Eric Dumazet,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, Vlad Yasevich, Neil Horman,
	Marcelo Ricardo Leitner, Jon Maloy, Ying Xue, drbd-dev,
	linux-block, linux-kernel, linux-rdma, linux-nvme, target-devel,
	linux-afs, linux-cifs, cluster-devel, ocfs2-devel, netdev,
	linux-sctp, ceph-devel, rds-devel, linux-nfs

On Thu, May 14, 2020 at 04:51:26AM -0700, Joe Perches wrote:
> > Mostly to keep it symmetric with the sockopt.  I could probably remove
> > a few arguments in the series if we want to be strict.
> 
> My preference would be to be strict and add
> arguments only when necessary.

In a few cases that would create confusion as the arguments are rather
overloaded.  But in the many cases where it doesn't, and where there isn't
really much use for the other arguments, I've done that now.

^ permalink raw reply	[flat|nested] 76+ messages in thread

end of thread, back to index

Thread overview: 76+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-13  6:26 remove kernel_setsockopt and kernel_getsockopt Christoph Hellwig
2020-05-13  6:26 ` [PATCH 01/33] net: add sock_set_reuseaddr Christoph Hellwig
2020-05-13  6:26 ` [PATCH 02/33] net: add sock_set_linger Christoph Hellwig
2020-05-13  6:26 ` [PATCH 03/33] net: add sock_set_priority Christoph Hellwig
2020-05-13  6:26 ` [PATCH 04/33] net: add sock_set_sndtimeo Christoph Hellwig
2020-05-13  6:26 ` [PATCH 05/33] net: add sock_bindtoindex Christoph Hellwig
2020-05-13  6:26 ` [PATCH 06/33] net: add sock_set_timestamps Christoph Hellwig
2020-05-13  6:26 ` [PATCH 07/33] net: add sock_set_keepalive Christoph Hellwig
2020-05-13  6:26 ` [PATCH 08/33] net: add sock_set_rcvbuf Christoph Hellwig
2020-05-13  6:26 ` [PATCH 09/33] net: add sock_set_reuseport Christoph Hellwig
2020-05-13  6:26 ` [PATCH 10/33] tcp: add tcp_sock_set_cork Christoph Hellwig
2020-05-13  6:26 ` [PATCH 11/33] tcp: tcp_sock_set_nodelay Christoph Hellwig
2020-05-13 12:51   ` Jason Gunthorpe
2020-05-13  6:26 ` [PATCH 12/33] tcp: add tcp_sock_set_quickack Christoph Hellwig
2020-05-13  6:26 ` [PATCH 13/33] tcp: add tcp_sock_set_syncnt Christoph Hellwig
2020-05-13  6:26 ` [PATCH 14/33] tcp: add tcp_sock_set_user_timeout Christoph Hellwig
2020-05-13  6:26 ` [PATCH 15/33] tcp: add tcp_sock_set_keepidle Christoph Hellwig
2020-05-13  6:26 ` [PATCH 16/33] tcp: add tcp_sock_set_keepintvl Christoph Hellwig
2020-05-13  6:26 ` [PATCH 17/33] tcp: add tcp_sock_set_keepcnt Christoph Hellwig
2020-05-13  6:26 ` [PATCH 18/33] ipv4: add ip_sock_set_tos Christoph Hellwig
2020-05-13  6:26 ` [PATCH 19/33] ipv4: add ip_sock_set_freebind Christoph Hellwig
2020-05-13  6:26 ` [PATCH 20/33] ipv4: add ip_sock_set_recverr Christoph Hellwig
2020-05-13 21:00   ` Joe Perches
2020-05-14 10:30     ` Christoph Hellwig
2020-05-14 11:51       ` Joe Perches
2020-05-20 14:18         ` Christoph Hellwig
2020-05-13  6:26 ` [PATCH 21/33] ipv4: add ip_sock_set_mtu_discover Christoph Hellwig
2020-05-13  6:26 ` [PATCH 22/33] ipv6: add ip6_sock_set_v6only Christoph Hellwig
2020-05-13  6:26 ` [PATCH 23/33] ipv6: add ip6_sock_set_recverr Christoph Hellwig
2020-05-13  6:26 ` [PATCH 24/33] ipv6: add ip6_sock_set_addr_preferences Christoph Hellwig
2020-05-13  6:26 ` [PATCH 25/33] ipv6: add ip6_sock_set_recvpktinfo Christoph Hellwig
2020-05-13  6:26 ` [PATCH 26/33] sctp: lift copying in addrs into sctp_setsockopt Christoph Hellwig
2020-05-13  6:26 ` [PATCH 27/33] sctp: export sctp_setsockopt_bindx Christoph Hellwig
2020-05-13 18:00   ` Marcelo Ricardo Leitner
2020-05-14  6:28     ` Christoph Hellwig
2020-05-14  8:23     ` David Laight
2020-05-14 10:40     ` is it ok to always pull in sctp for dlm, was: " Christoph Hellwig
2020-05-14 14:24       ` David Teigland
2020-05-15 15:20     ` David Howells
2020-05-15 15:24       ` Christoph Hellwig
2020-05-16 15:21         ` David Laight
2020-05-16 15:11       ` David Laight
2020-05-16 15:36         ` [Ocfs2-devel] " Matthew Wilcox
2020-05-17  8:48           ` David Laight
2020-05-13  6:26 ` [PATCH 28/33] sctp: add sctp_sock_set_nodelay Christoph Hellwig
2020-05-13  6:26 ` [PATCH 29/33] rxrpc_sock_set_min_security_level Christoph Hellwig
2020-05-13  6:26 ` [PATCH 30/33] tipc: call tsk_set_importance from tipc_topsrv_create_listener Christoph Hellwig
2020-05-13  6:26 ` [PATCH 31/33] net: remove kernel_setsockopt Christoph Hellwig
2020-05-13  6:26 ` [PATCH 32/33] sctp: add sctp_sock_get_primary_addr Christoph Hellwig
2020-05-13 18:03   ` Marcelo Ricardo Leitner
2020-05-14  9:51     ` David Laight
2020-05-14 12:30       ` David Laight
2020-05-14 13:27         ` David Laight
2020-05-13  6:26 ` [PATCH 33/33] net: remove kernel_getsockopt Christoph Hellwig
2020-05-13 13:13 ` [PATCH 29/33] rxrpc_sock_set_min_security_level David Howells
2020-05-14 10:29   ` Christoph Hellwig
2020-05-15 15:13   ` David Howells
2020-05-13 13:17 ` [PATCH 21/33] ipv4: add ip_sock_set_mtu_discover David Howells
2020-05-14  6:26   ` Christoph Hellwig
2020-05-15 15:15   ` David Howells
2020-05-13 13:24 ` [PATCH 20/33] ipv4: add ip_sock_set_recverr David Howells
2020-05-13 13:25 ` [PATCH 23/33] ipv6: add ip6_sock_set_recverr David Howells
2020-05-13 13:27 ` [PATCH 06/33] net: add sock_set_timestamps David Howells
2020-05-13 17:38 ` remove kernel_setsockopt and kernel_getsockopt Joe Perches
2020-05-14  6:27   ` Christoph Hellwig
2020-05-14  8:29   ` David Laight
2020-05-14 10:18     ` Christoph Hellwig
2020-05-14 10:26       ` David Laight
2020-05-14 10:34         ` 'Christoph Hellwig'
2020-05-14 11:11           ` David Laight
2020-05-14 19:35             ` [Ocfs2-devel] " Matthew Wilcox
2020-05-14 20:03         ` David Miller
2020-05-15  8:14           ` David Laight
2020-05-14 19:57     ` David Miller
2020-05-13 18:45 ` Sagi Grimberg
2020-05-13 19:12 ` David Miller

Linux-Block Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-block/0 linux-block/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-block linux-block/ https://lore.kernel.org/linux-block \
		linux-block@vger.kernel.org
	public-inbox-index linux-block

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-block


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git