* [PATCH v3 net-next 1/4] tcp: ULP infrastructure
       [not found] <cover.1497465295.git.davejwatson@fb.com>
@ 2017-06-14 18:37 ` Dave Watson
  2017-06-17  0:14   ` Christoph Paasch
                     ` (2 more replies)
  2017-06-14 18:37 ` [PATCH v3 net-next 2/4] tcp: export do_tcp_sendpages and tcp_rate_check_app_limited functions Dave Watson
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 20+ messages in thread
From: Dave Watson @ 2017-06-14 18:37 UTC (permalink / raw)
  To: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet
  Cc: Alexei Starovoitov, nmav, fridolin.pokorny

Add the infrastructure for attaching Upper Layer Protocols (ULPs) over TCP
sockets, based on a similar infrastructure in tcp_cong.  The idea is that any
ULP can add its own logic by swapping the socket's TCP proto structure for one
with its own methods.

Example usage:

setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));

Modules register their ULP, supplying an init function and a name, by
calling:

tcp_register_ulp(&tcp_tls_ulp_ops);

and unregister it with tcp_unregister_ulp().
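
As an illustrative sketch only (a hypothetical "demo" ULP; the in-tree user
is the tls module added later in this series), a minimal module would do:

	#include <linux/module.h>
	#include <net/tcp.h>

	static int demo_init(struct sock *sk)
	{
		/* attach per-socket state via icsk->icsk_ulp_data here */
		return 0;
	}

	static struct tcp_ulp_ops demo_ulp_ops __read_mostly = {
		.name	= "demo",
		.owner	= THIS_MODULE,
		.init	= demo_init,
	};

	static int __init demo_register(void)
	{
		return tcp_register_ulp(&demo_ulp_ops);
	}

	static void __exit demo_unregister(void)
	{
		tcp_unregister_ulp(&demo_ulp_ops);
	}

	module_init(demo_register);
	module_exit(demo_unregister);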

A list of registered ULPs is returned by tcp_get_available_ulp, which is
hooked up to /proc.  Example:

$ cat /proc/sys/net/ipv4/tcp_available_ulp
tls

There is currently no functionality to remove or chain ULPs, but
it should be possible to add these in the future if needed.

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Dave Watson <davejwatson@fb.com>
---
 include/net/inet_connection_sock.h |   4 ++
 include/net/tcp.h                  |  25 +++++++
 include/uapi/linux/tcp.h           |   1 +
 net/ipv4/Makefile                  |   2 +-
 net/ipv4/sysctl_net_ipv4.c         |  25 +++++++
 net/ipv4/tcp.c                     |  28 ++++++++
 net/ipv4/tcp_ipv4.c                |   2 +
 net/ipv4/tcp_ulp.c                 | 134 +++++++++++++++++++++++++++++++++++++
 8 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 net/ipv4/tcp_ulp.c

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index c7a5779..13e4c89 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -75,6 +75,8 @@ struct inet_connection_sock_af_ops {
  * @icsk_pmtu_cookie	   Last pmtu seen by socket
  * @icsk_ca_ops		   Pluggable congestion control hook
  * @icsk_af_ops		   Operations which are AF_INET{4,6} specific
+ * @icsk_ulp_ops	   Pluggable ULP control hook
+ * @icsk_ulp_data	   ULP private data
  * @icsk_ca_state:	   Congestion control state
  * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
  * @icsk_pending:	   Scheduled timer event
@@ -97,6 +99,8 @@ struct inet_connection_sock {
 	__u32			  icsk_pmtu_cookie;
 	const struct tcp_congestion_ops *icsk_ca_ops;
 	const struct inet_connection_sock_af_ops *icsk_af_ops;
+	const struct tcp_ulp_ops  *icsk_ulp_ops;
+	void			  *icsk_ulp_data;
 	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
 	__u8			  icsk_ca_state:6,
 				  icsk_ca_setsockopt:1,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3ab677d..b439f46 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1991,4 +1991,29 @@ static inline void tcp_listendrop(const struct sock *sk)
 
 enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
 
+/*
+ * Interface for adding Upper Layer Protocols over TCP
+ */
+
+#define TCP_ULP_NAME_MAX	16
+#define TCP_ULP_MAX		128
+#define TCP_ULP_BUF_MAX		(TCP_ULP_NAME_MAX*TCP_ULP_MAX)
+
+struct tcp_ulp_ops {
+	struct list_head	list;
+
+	/* initialize ulp */
+	int (*init)(struct sock *sk);
+	/* cleanup ulp */
+	void (*release)(struct sock *sk);
+
+	char		name[TCP_ULP_NAME_MAX];
+	struct module	*owner;
+};
+int tcp_register_ulp(struct tcp_ulp_ops *type);
+void tcp_unregister_ulp(struct tcp_ulp_ops *type);
+int tcp_set_ulp(struct sock *sk, const char *name);
+void tcp_get_available_ulp(char *buf, size_t len);
+void tcp_cleanup_ulp(struct sock *sk);
+
 #endif	/* _TCP_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 38a2b07..8204dce 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -117,6 +117,7 @@ enum {
 #define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
 #define TCP_REPAIR_WINDOW	29	/* Get/set window parameters */
 #define TCP_FASTOPEN_CONNECT	30	/* Attempt FastOpen with connect */
+#define TCP_ULP		31	/* Attach a ULP to a TCP connection */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f83de23..afcb435 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
-	     tcp_rate.o tcp_recovery.o \
+	     tcp_rate.o tcp_recovery.o tcp_ulp.o \
 	     tcp_offload.o datagram.o raw.o udp.o udplite.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7065234a..9bf8097 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -360,6 +360,25 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (write && ret == 0)
 		tcp_fastopen_active_timeout_reset();
+
+	return ret;
+}
+
+static int proc_tcp_available_ulp(struct ctl_table *ctl,
+				  int write,
+				  void __user *buffer, size_t *lenp,
+				  loff_t *ppos)
+{
+	struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, };
+	int ret;
+
+	tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+	if (!tbl.data)
+		return -ENOMEM;
+	tcp_get_available_ulp(tbl.data, TCP_ULP_BUF_MAX);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+	kfree(tbl.data);
+
 	return ret;
 }
 
@@ -686,6 +705,12 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_ms_jiffies,
 	},
 	{
+		.procname	= "tcp_available_ulp",
+		.maxlen		= TCP_ULP_BUF_MAX,
+		.mode		= 0444,
+		.proc_handler   = proc_tcp_available_ulp,
+	},
+	{
 		.procname	= "icmp_msgs_per_sec",
 		.data		= &sysctl_icmp_msgs_per_sec,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index cc8fd8b..b06ee30 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2482,6 +2482,24 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		release_sock(sk);
 		return err;
 	}
+	case TCP_ULP: {
+		char name[TCP_ULP_NAME_MAX];
+
+		if (optlen < 1)
+			return -EINVAL;
+
+		val = strncpy_from_user(name, optval,
+					min_t(long, TCP_ULP_NAME_MAX - 1,
+					      optlen));
+		if (val < 0)
+			return -EFAULT;
+		name[val] = 0;
+
+		lock_sock(sk);
+		err = tcp_set_ulp(sk, name);
+		release_sock(sk);
+		return err;
+	}
 	default:
 		/* fallthru */
 		break;
@@ -3038,6 +3056,16 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 			return -EFAULT;
 		return 0;
 
+	case TCP_ULP:
+		if (get_user(len, optlen))
+			return -EFAULT;
+		len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
+			return -EFAULT;
+		return 0;
+
 	case TCP_THIN_LINEAR_TIMEOUTS:
 		val = tp->thin_lto;
 		break;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1dc8c44..eec2ff9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1860,6 +1860,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
 
 	tcp_cleanup_congestion_control(sk);
 
+	tcp_cleanup_ulp(sk);
+
 	/* Cleanup up the write buffer. */
 	tcp_write_queue_purge(sk);
 
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
new file mode 100644
index 0000000..e855ea7
--- /dev/null
+++ b/net/ipv4/tcp_ulp.c
@@ -0,0 +1,134 @@
+/*
+ * Pluggable TCP upper layer protocol support.
+ *
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <net/tcp.h>
+
+static DEFINE_SPINLOCK(tcp_ulp_list_lock);
+static LIST_HEAD(tcp_ulp_list);
+
+/* Simple linear search, don't expect many entries! */
+static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
+{
+	struct tcp_ulp_ops *e;
+
+	list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
+		if (strcmp(e->name, name) == 0)
+			return e;
+	}
+
+	return NULL;
+}
+
+static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
+{
+	const struct tcp_ulp_ops *ulp = NULL;
+
+	rcu_read_lock();
+	ulp = tcp_ulp_find(name);
+
+#ifdef CONFIG_MODULES
+	if (!ulp && capable(CAP_NET_ADMIN)) {
+		rcu_read_unlock();
+		request_module("%s", name);
+		rcu_read_lock();
+		ulp = tcp_ulp_find(name);
+	}
+#endif
+	if (!ulp || !try_module_get(ulp->owner))
+		ulp = NULL;
+
+	rcu_read_unlock();
+	return ulp;
+}
+
+/* Attach new upper layer protocol to the list
+ * of available protocols.
+ */
+int tcp_register_ulp(struct tcp_ulp_ops *ulp)
+{
+	int ret = 0;
+
+	spin_lock(&tcp_ulp_list_lock);
+	if (tcp_ulp_find(ulp->name)) {
+		pr_notice("%s already registered or non-unique name\n",
+			  ulp->name);
+		ret = -EEXIST;
+	} else {
+		list_add_tail_rcu(&ulp->list, &tcp_ulp_list);
+	}
+	spin_unlock(&tcp_ulp_list_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_ulp);
+
+void tcp_unregister_ulp(struct tcp_ulp_ops *ulp)
+{
+	spin_lock(&tcp_ulp_list_lock);
+	list_del_rcu(&ulp->list);
+	spin_unlock(&tcp_ulp_list_lock);
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_ulp);
+
+/* Build string with list of available upper layer protocol values */
+void tcp_get_available_ulp(char *buf, size_t maxlen)
+{
+	struct tcp_ulp_ops *ulp_ops;
+	size_t offs = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ulp_ops, &tcp_ulp_list, list) {
+		offs += snprintf(buf + offs, maxlen - offs,
+				 "%s%s",
+				 offs == 0 ? "" : " ", ulp_ops->name);
+	}
+	rcu_read_unlock();
+}
+
+void tcp_cleanup_ulp(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (!icsk->icsk_ulp_ops)
+		return;
+
+	if (icsk->icsk_ulp_ops->release)
+		icsk->icsk_ulp_ops->release(sk);
+	module_put(icsk->icsk_ulp_ops->owner);
+}
+
+/* Change upper layer protocol for socket */
+int tcp_set_ulp(struct sock *sk, const char *name)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_ulp_ops *ulp_ops;
+	int err = 0;
+
+	if (icsk->icsk_ulp_ops)
+		return -EEXIST;
+
+	ulp_ops = __tcp_ulp_find_autoload(name);
+	if (!ulp_ops)
+		err = -ENOENT;
+	else
+		err = ulp_ops->init(sk);
+
+	if (err)
+		goto out;
+
+	icsk->icsk_ulp_ops = ulp_ops;
+ out:
+	return err;
+}
-- 
2.9.3


* [PATCH v3 net-next 2/4] tcp: export do_tcp_sendpages and tcp_rate_check_app_limited functions
       [not found] <cover.1497465295.git.davejwatson@fb.com>
  2017-06-14 18:37 ` [PATCH v3 net-next 1/4] tcp: ULP infrastructure Dave Watson
@ 2017-06-14 18:37 ` Dave Watson
  2017-06-14 18:37 ` [PATCH v3 net-next 3/4] tls: kernel TLS support Dave Watson
  2017-06-14 18:37 ` [PATCH v3 net-next 4/4] tls: Documentation Dave Watson
  3 siblings, 0 replies; 20+ messages in thread
From: Dave Watson @ 2017-06-14 18:37 UTC (permalink / raw)
  To: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet
  Cc: Alexei Starovoitov, nmav, fridolin.pokorny

Export do_tcp_sendpages and tcp_rate_check_app_limited, since tls will need
to send pages while the socket is already locked.

tcp_sendpage is exported, but requires the socket lock not to be held already.
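
For illustration, the intended calling pattern from a context that already
holds the socket lock (sk, page, offset, size and flags are placeholders) is:

	lock_sock(sk);
	/* ... assemble the page/offset/size for the record ... */
	ret = do_tcp_sendpages(sk, page, offset, size, flags);
	/* ... handle partial sends / errors ... */
	release_sock(sk);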

Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com>
Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Dave Watson <davejwatson@fb.com>
---
 include/net/tcp.h   | 2 ++
 net/ipv4/tcp.c      | 5 +++--
 net/ipv4/tcp_rate.c | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b439f46..e17ec28 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -350,6 +350,8 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
 		 int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+			 size_t size, int flags);
 void tcp_release_cb(struct sock *sk);
 void tcp_wfree(struct sk_buff *skb);
 void tcp_write_timer_handler(struct sock *sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b06ee30..11e4ee2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -901,8 +901,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
 	return mss_now;
 }
 
-static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
-				size_t size, int flags)
+ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+			 size_t size, int flags)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int mss_now, size_goal;
@@ -1032,6 +1032,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 	}
 	return sk_stream_error(sk, flags, err);
 }
+EXPORT_SYMBOL_GPL(do_tcp_sendpages);
 
 int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 		 size_t size, int flags)
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index ad99569..3330a37 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -185,3 +185,4 @@ void tcp_rate_check_app_limited(struct sock *sk)
 		tp->app_limited =
 			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
 }
+EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
-- 
2.9.3


* [PATCH v3 net-next 3/4] tls: kernel TLS support
       [not found] <cover.1497465295.git.davejwatson@fb.com>
  2017-06-14 18:37 ` [PATCH v3 net-next 1/4] tcp: ULP infrastructure Dave Watson
  2017-06-14 18:37 ` [PATCH v3 net-next 2/4] tcp: export do_tcp_sendpages and tcp_rate_check_app_limited functions Dave Watson
@ 2017-06-14 18:37 ` Dave Watson
  2017-06-16 20:56   ` Stephen Hemminger
                     ` (2 more replies)
  2017-06-14 18:37 ` [PATCH v3 net-next 4/4] tls: Documentation Dave Watson
  3 siblings, 3 replies; 20+ messages in thread
From: Dave Watson @ 2017-06-14 18:37 UTC (permalink / raw)
  To: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet
  Cc: Alexei Starovoitov, nmav, fridolin.pokorny

Software implementation of Transport Layer Security (TLS), built on the ULP
infrastructure.  The TCP proto methods sendmsg and sendpage are replaced with
tls equivalents.

Only symmetric crypto is done in the kernel; keys are passed in via setsockopt
after the handshake is complete.  All control messages are supported via CMSG
data: the actual symmetric encryption is the same, only the record type needs
to be passed separately.
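
For illustration only (the *_write buffers stand in for key material taken
from a userspace handshake; error handling is omitted), attaching the ULP,
installing TX keys and sending a record with a non-data record type might
look like:

	struct tls12_crypto_info_aes_gcm_128 crypto_info = { 0 };

	crypto_info.info.version = TLS_1_2_VERSION;
	crypto_info.info.cipher_type = TLS_CIPHER_AES_GCM_128;
	memcpy(crypto_info.iv, iv_write, TLS_CIPHER_AES_GCM_128_IV_SIZE);
	memcpy(crypto_info.rec_seq, rec_seq_write,
	       TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
	memcpy(crypto_info.key, key_write, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
	memcpy(crypto_info.salt, salt_write, TLS_CIPHER_AES_GCM_128_SALT_SIZE);

	setsockopt(fd, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
	setsockopt(fd, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));

	/* ordinary data then goes out through plain send()/sendfile();
	 * a non-data record type is passed via CMSG:
	 */
	char cbuf[CMSG_SPACE(sizeof(unsigned char))];
	unsigned char record_type = 21;		/* e.g. alert; illustrative */
	struct iovec iov = { .iov_base = data, .iov_len = data_len };
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;

	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_TLS;
	cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
	cmsg->cmsg_len = CMSG_LEN(sizeof(record_type));
	*CMSG_DATA(cmsg) = record_type;
	msg.msg_controllen = cmsg->cmsg_len;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	sendmsg(fd, &msg, 0);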

For the user API, please see the Documentation patch.

Pieces that can be shared between the hw and sw implementations
are in tls_main.c.

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com>
Signed-off-by: Dave Watson <davejwatson@fb.com>
---
 MAINTAINERS              |  10 +
 include/linux/socket.h   |   1 +
 include/net/tls.h        | 237 +++++++++++++++
 include/uapi/linux/tls.h |  79 +++++
 net/Kconfig              |   1 +
 net/Makefile             |   1 +
 net/tls/Kconfig          |  12 +
 net/tls/Makefile         |   7 +
 net/tls/tls_main.c       | 487 ++++++++++++++++++++++++++++++
 net/tls/tls_sw.c         | 772 +++++++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 1607 insertions(+)
 create mode 100644 include/net/tls.h
 create mode 100644 include/uapi/linux/tls.h
 create mode 100644 net/tls/Kconfig
 create mode 100644 net/tls/Makefile
 create mode 100644 net/tls/tls_main.c
 create mode 100644 net/tls/tls_sw.c

diff --git a/MAINTAINERS b/MAINTAINERS
index f4e682c..710af53 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8979,6 +8979,16 @@ F:	net/ipv6/
 F:	include/net/ip*
 F:	arch/x86/net/*
 
+NETWORKING [TLS]
+M:	Ilya Lesokhin <ilyal@mellanox.com>
+M:	Aviad Yehezkel <aviadye@mellanox.com>
+M:	Dave Watson <davejwatson@fb.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	net/tls/*
+F:	include/uapi/linux/tls.h
+F:	include/net/tls.h
+
 NETWORKING [IPSEC]
 M:	Steffen Klassert <steffen.klassert@secunet.com>
 M:	Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 0820274..8b13db5 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -334,6 +334,7 @@ struct ucred {
 #define SOL_ALG		279
 #define SOL_NFC		280
 #define SOL_KCM		281
+#define SOL_TLS		282
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/include/net/tls.h b/include/net/tls.h
new file mode 100644
index 0000000..b89d397
--- /dev/null
+++ b/include/net/tls.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _TLS_OFFLOAD_H
+#define _TLS_OFFLOAD_H
+
+#include <linux/types.h>
+
+#include <uapi/linux/tls.h>
+
+
+/* Maximum data size carried in a TLS record */
+#define TLS_MAX_PAYLOAD_SIZE		((size_t)1 << 14)
+
+#define TLS_HEADER_SIZE			5
+#define TLS_NONCE_OFFSET		TLS_HEADER_SIZE
+
+#define TLS_CRYPTO_INFO_READY(info)	((info)->cipher_type)
+
+#define TLS_RECORD_TYPE_DATA		0x17
+
+#define TLS_AAD_SPACE_SIZE		13
+
+struct tls_sw_context {
+	struct crypto_aead *aead_send;
+
+	/* Sending context */
+	char aad_space[TLS_AAD_SPACE_SIZE];
+
+	unsigned int sg_plaintext_size;
+	int sg_plaintext_num_elem;
+	struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS];
+
+	unsigned int sg_encrypted_size;
+	int sg_encrypted_num_elem;
+	struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS];
+
+	/* AAD | sg_plaintext_data | sg_tag */
+	struct scatterlist sg_aead_in[2];
+	/* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */
+	struct scatterlist sg_aead_out[2];
+};
+
+enum {
+	TLS_PENDING_CLOSED_RECORD
+};
+
+struct tls_context {
+	union {
+		struct tls_crypto_info crypto_send;
+		struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128;
+	};
+
+	void *priv_ctx;
+
+	u16 prepend_size;
+	u16 tag_size;
+	u16 overhead_size;
+	u16 iv_size;
+	char *iv;
+	u16 rec_seq_size;
+	char *rec_seq;
+
+	struct scatterlist *partially_sent_record;
+	u16 partially_sent_offset;
+	unsigned long flags;
+
+	u16 pending_open_record_frags;
+	int (*push_pending_record)(struct sock *sk, int flags);
+	void (*free_resources)(struct sock *sk);
+
+	void (*sk_write_space)(struct sock *sk);
+	void (*sk_proto_close)(struct sock *sk, long timeout);
+
+	int  (*setsockopt)(struct sock *sk, int level,
+			   int optname, char __user *optval,
+			   unsigned int optlen);
+	int  (*getsockopt)(struct sock *sk, int level,
+			   int optname, char __user *optval,
+			   int __user *optlen);
+};
+
+int wait_on_pending_writer(struct sock *sk, long *timeo);
+int tls_sk_query(struct sock *sk, int optname, char __user *optval,
+		int __user *optlen);
+int tls_sk_attach(struct sock *sk, int optname, char __user *optval,
+		  unsigned int optlen);
+
+
+int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx);
+int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+int tls_sw_sendpage(struct sock *sk, struct page *page,
+		    int offset, size_t size, int flags);
+void tls_sw_close(struct sock *sk, long timeout);
+
+void tls_sk_destruct(struct sock *sk, struct tls_context *ctx);
+void tls_icsk_clean_acked(struct sock *sk);
+
+int tls_push_sg(struct sock *sk, struct tls_context *ctx,
+		struct scatterlist *sg, u16 first_offset,
+		int flags);
+int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx,
+				   int flags, long *timeo);
+
+static inline bool tls_is_pending_closed_record(struct tls_context *ctx)
+{
+	return test_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
+}
+
+static inline int tls_complete_pending_work(struct sock *sk,
+					    struct tls_context *ctx,
+					    int flags, long *timeo)
+{
+	int rc = 0;
+
+	if (unlikely(sk->sk_write_pending))
+		rc = wait_on_pending_writer(sk, timeo);
+
+	if (!rc && tls_is_pending_closed_record(ctx))
+		rc = tls_push_pending_closed_record(sk, ctx, flags, timeo);
+
+	return rc;
+}
+
+static inline bool tls_is_partially_sent_record(struct tls_context *ctx)
+{
+	return !!ctx->partially_sent_record;
+}
+
+static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx)
+{
+	return tls_ctx->pending_open_record_frags;
+}
+
+static inline void tls_err_abort(struct sock *sk)
+{
+	sk->sk_err = -EBADMSG;
+	sk->sk_error_report(sk);
+}
+
+static inline bool tls_bigint_increment(unsigned char *seq, int len)
+{
+	int i;
+
+	for (i = len - 1; i >= 0; i--) {
+		++seq[i];
+		if (seq[i] != 0)
+			break;
+	}
+
+	return (i == -1);
+}
+
+static inline void tls_advance_record_sn(struct sock *sk,
+					 struct tls_context *ctx)
+{
+	if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size))
+		tls_err_abort(sk);
+	tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+			     ctx->iv_size);
+}
+
+static inline void tls_fill_prepend(struct tls_context *ctx,
+			     char *buf,
+			     size_t plaintext_len,
+			     unsigned char record_type)
+{
+	size_t pkt_len, iv_size = ctx->iv_size;
+
+	pkt_len = plaintext_len + iv_size + ctx->tag_size;
+
+	/* we cover the explicit nonce here as well, so buf should be of
+	 * size TLS_HEADER_SIZE + the explicit nonce size (ctx->iv_size)
+	 */
+	buf[0] = record_type;
+	buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.version);
+	buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.version);
+	/* we can use the IV as the explicit nonce, per the spec */
+	buf[3] = pkt_len >> 8;
+	buf[4] = pkt_len & 0xFF;
+	memcpy(buf + TLS_NONCE_OFFSET,
+	       ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size);
+}
+
+static inline struct tls_context *tls_get_ctx(const struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return icsk->icsk_ulp_data;
+}
+
+static inline struct tls_sw_context *tls_sw_ctx(
+		const struct tls_context *tls_ctx)
+{
+	return (struct tls_sw_context *)tls_ctx->priv_ctx;
+}
+
+static inline struct tls_offload_context *tls_offload_ctx(
+		const struct tls_context *tls_ctx)
+{
+	return (struct tls_offload_context *)tls_ctx->priv_ctx;
+}
+
+int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
+		      unsigned char *record_type);
+
+#endif /* _TLS_OFFLOAD_H */
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
new file mode 100644
index 0000000..cc1d21d
--- /dev/null
+++ b/include/uapi/linux/tls.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _UAPI_LINUX_TLS_H
+#define _UAPI_LINUX_TLS_H
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+/* TLS socket options */
+#define TLS_TX			1	/* Set transmit parameters */
+
+/* Supported versions */
+#define TLS_VERSION_MINOR(ver)	((ver) & 0xFF)
+#define TLS_VERSION_MAJOR(ver)	(((ver) >> 8) & 0xFF)
+
+#define TLS_VERSION_NUMBER(id)	((((id##_VERSION_MAJOR) & 0xFF) << 8) |	\
+				 ((id##_VERSION_MINOR) & 0xFF))
+
+#define TLS_1_2_VERSION_MAJOR	0x3
+#define TLS_1_2_VERSION_MINOR	0x3
+#define TLS_1_2_VERSION		TLS_VERSION_NUMBER(TLS_1_2)
+
+/* Supported ciphers */
+#define TLS_CIPHER_AES_GCM_128				51
+#define TLS_CIPHER_AES_GCM_128_IV_SIZE			8
+#define TLS_CIPHER_AES_GCM_128_KEY_SIZE		16
+#define TLS_CIPHER_AES_GCM_128_SALT_SIZE		4
+#define TLS_CIPHER_AES_GCM_128_TAG_SIZE		16
+#define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE		8
+
+#define TLS_SET_RECORD_TYPE	1
+
+struct tls_crypto_info {
+	__u16 version;
+	__u16 cipher_type;
+};
+
+struct tls12_crypto_info_aes_gcm_128 {
+	struct tls_crypto_info info;
+	unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE];
+	unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
+	unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE];
+	unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE];
+};
+
+#endif /* _UAPI_LINUX_TLS_H */
diff --git a/net/Kconfig b/net/Kconfig
index 102f781..7d57ef3 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -55,6 +55,7 @@ menu "Networking options"
 
 source "net/packet/Kconfig"
 source "net/unix/Kconfig"
+source "net/tls/Kconfig"
 source "net/xfrm/Kconfig"
 source "net/iucv/Kconfig"
 source "net/smc/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index 9086ffb..bed80fa 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_LLC)		+= llc/
 obj-$(CONFIG_NET)		+= ethernet/ 802/ sched/ netlink/ bpf/
 obj-$(CONFIG_NETFILTER)		+= netfilter/
 obj-$(CONFIG_INET)		+= ipv4/
+obj-$(CONFIG_TLS)		+= tls/
 obj-$(CONFIG_XFRM)		+= xfrm/
 obj-$(CONFIG_UNIX)		+= unix/
 obj-$(CONFIG_NET)		+= ipv6/
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
new file mode 100644
index 0000000..61e5329
--- /dev/null
+++ b/net/tls/Kconfig
@@ -0,0 +1,12 @@
+#
+# TLS configuration
+#
+config TLS
+	tristate "Transport Layer Security support"
+	depends on NET
+	default m
+	---help---
+	Enable kernel support for TLS protocol. This allows symmetric
+	encryption handling of the TLS protocol to be done in-kernel.
+
+	If unsure, say M.
diff --git a/net/tls/Makefile b/net/tls/Makefile
new file mode 100644
index 0000000..a930fd1
--- /dev/null
+++ b/net/tls/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the TLS subsystem.
+#
+
+obj-$(CONFIG_TLS) += tls.o
+
+tls-y := tls_main.o tls_sw.o
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
new file mode 100644
index 0000000..2ebc328
--- /dev/null
+++ b/net/tls/tls_main.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <linux/highmem.h>
+#include <linux/netdevice.h>
+#include <linux/sched/signal.h>
+
+#include <net/tls.h>
+
+MODULE_AUTHOR("Mellanox Technologies");
+MODULE_DESCRIPTION("Transport Layer Security Support");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static struct proto tls_base_prot;
+static struct proto tls_sw_prot;
+
+int wait_on_pending_writer(struct sock *sk, long *timeo)
+{
+	int rc = 0;
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (1) {
+		if (!*timeo) {
+			rc = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			rc = sock_intr_errno(*timeo);
+			break;
+		}
+
+		if (sk_wait_event(sk, timeo, !sk->sk_write_pending, &wait))
+			break;
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return rc;
+}
+
+int tls_push_sg(struct sock *sk,
+		struct tls_context *ctx,
+		struct scatterlist *sg,
+		u16 first_offset,
+		int flags)
+{
+	int sendpage_flags = flags | MSG_SENDPAGE_NOTLAST;
+	int ret = 0;
+	struct page *p;
+	size_t size;
+	int offset = first_offset;
+
+	size = sg->length - offset;
+	offset += sg->offset;
+
+	while (1) {
+		if (sg_is_last(sg))
+			sendpage_flags = flags;
+
+		/* is sending application-limited? */
+		tcp_rate_check_app_limited(sk);
+		p = sg_page(sg);
+retry:
+		ret = do_tcp_sendpages(sk, p, offset, size, sendpage_flags);
+
+		if (ret != size) {
+			if (ret > 0) {
+				offset += ret;
+				size -= ret;
+				goto retry;
+			}
+
+			offset -= sg->offset;
+			ctx->partially_sent_offset = offset;
+			ctx->partially_sent_record = (void *)sg;
+			return ret;
+		}
+
+		put_page(p);
+		sk_mem_uncharge(sk, sg->length);
+		sg = sg_next(sg);
+		if (!sg)
+			break;
+
+		offset = sg->offset;
+		size = sg->length;
+	}
+
+	clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
+
+	return 0;
+}
+
+static int tls_handle_open_record(struct sock *sk, int flags)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+
+	if (tls_is_pending_open_record(ctx))
+		return ctx->push_pending_record(sk, flags);
+
+	return 0;
+}
+
+int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
+		      unsigned char *record_type)
+{
+	struct cmsghdr *cmsg;
+	int rc = -EINVAL;
+
+	for_each_cmsghdr(cmsg, msg) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+		if (cmsg->cmsg_level != SOL_TLS)
+			continue;
+
+		switch (cmsg->cmsg_type) {
+		case TLS_SET_RECORD_TYPE:
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(*record_type)))
+				return -EINVAL;
+
+			if (msg->msg_flags & MSG_MORE)
+				return -EINVAL;
+
+			rc = tls_handle_open_record(sk, msg->msg_flags);
+			if (rc)
+				return rc;
+
+			*record_type = *(unsigned char *)CMSG_DATA(cmsg);
+			rc = 0;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	return rc;
+}
+
+int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx,
+				   int flags, long *timeo)
+{
+	struct scatterlist *sg;
+	u16 offset;
+
+	if (!tls_is_partially_sent_record(ctx))
+		return ctx->push_pending_record(sk, flags);
+
+	sg = ctx->partially_sent_record;
+	offset = ctx->partially_sent_offset;
+
+	ctx->partially_sent_record = NULL;
+	return tls_push_sg(sk, ctx, sg, offset, flags);
+}
+
+static void tls_write_space(struct sock *sk)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+
+	if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) {
+		gfp_t sk_allocation = sk->sk_allocation;
+		int rc;
+		long timeo = 0;
+
+		sk->sk_allocation = GFP_ATOMIC;
+		rc = tls_push_pending_closed_record(sk, ctx,
+						    MSG_DONTWAIT |
+						    MSG_NOSIGNAL,
+						    &timeo);
+		sk->sk_allocation = sk_allocation;
+
+		if (rc < 0)
+			return;
+	}
+
+	ctx->sk_write_space(sk);
+}
+
+static void tls_sk_proto_close(struct sock *sk, long timeout)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	long timeo = sock_sndtimeo(sk, 0);
+	void (*sk_proto_close)(struct sock *sk, long timeout);
+
+	lock_sock(sk);
+
+	if (!tls_complete_pending_work(sk, ctx, 0, &timeo))
+		tls_handle_open_record(sk, 0);
+
+	if (ctx->partially_sent_record) {
+		struct scatterlist *sg = ctx->partially_sent_record;
+
+		while (1) {
+			put_page(sg_page(sg));
+			sk_mem_uncharge(sk, sg->length);
+
+			if (sg_is_last(sg))
+				break;
+			sg++;
+		}
+	}
+	ctx->free_resources(sk);
+	kfree(ctx->rec_seq);
+	kfree(ctx->iv);
+
+	sk_proto_close = ctx->sk_proto_close;
+	kfree(ctx);
+
+	release_sock(sk);
+	sk_proto_close(sk, timeout);
+}
+
+static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
+				int __user *optlen)
+{
+	int rc = 0;
+	struct tls_context *ctx = tls_get_ctx(sk);
+	struct tls_crypto_info *crypto_info;
+	int len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	if (!optval || (len < sizeof(*crypto_info))) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	if (!ctx) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	/* get user crypto info */
+	crypto_info = &ctx->crypto_send;
+
+	if (!TLS_CRYPTO_INFO_READY(crypto_info)) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	if (len == sizeof(crypto_info)) {
+		rc = copy_to_user(optval, crypto_info, sizeof(*crypto_info));
+		goto out;
+	}
+
+	switch (crypto_info->cipher_type) {
+	case TLS_CIPHER_AES_GCM_128: {
+		struct tls12_crypto_info_aes_gcm_128 *
+		  crypto_info_aes_gcm_128 =
+		  container_of(crypto_info,
+			       struct tls12_crypto_info_aes_gcm_128,
+			       info);
+
+		if (len != sizeof(*crypto_info_aes_gcm_128)) {
+			rc = -EINVAL;
+			goto out;
+		}
+		lock_sock(sk);
+		memcpy(crypto_info_aes_gcm_128->iv, ctx->iv,
+		       TLS_CIPHER_AES_GCM_128_IV_SIZE);
+		release_sock(sk);
+		rc = copy_to_user(optval,
+				  crypto_info_aes_gcm_128,
+				  sizeof(*crypto_info_aes_gcm_128));
+		break;
+	}
+	default:
+		rc = -EINVAL;
+	}
+
+out:
+	return rc;
+}
+
+static int do_tls_getsockopt(struct sock *sk, int optname,
+			     char __user *optval, int __user *optlen)
+{
+	int rc = 0;
+
+	switch (optname) {
+	case TLS_TX:
+		rc = do_tls_getsockopt_tx(sk, optval, optlen);
+		break;
+	default:
+		rc = -ENOPROTOOPT;
+		break;
+	}
+	return rc;
+}
+
+static int tls_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+
+	if (level != SOL_TLS)
+		return ctx->getsockopt(sk, level, optname, optval, optlen);
+
+	return do_tls_getsockopt(sk, optname, optval, optlen);
+}
+
+static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
+				unsigned int optlen)
+{
+	struct tls_crypto_info *crypto_info, tmp_crypto_info;
+	struct tls_context *ctx = tls_get_ctx(sk);
+	struct proto *prot = NULL;
+	int rc = 0;
+
+	if (!optval || (optlen < sizeof(*crypto_info))) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = copy_from_user(&tmp_crypto_info, optval, sizeof(*crypto_info));
+	if (rc) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	/* check version */
+	if (tmp_crypto_info.version != TLS_1_2_VERSION) {
+		rc = -ENOTSUPP;
+		goto out;
+	}
+
+	/* get user crypto info */
+	crypto_info = &ctx->crypto_send;
+
+	/* Currently we don't support set crypto info more than one time */
+	if (TLS_CRYPTO_INFO_READY(crypto_info))
+		goto out;
+
+	switch (tmp_crypto_info.cipher_type) {
+	case TLS_CIPHER_AES_GCM_128: {
+		if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) {
+			rc = -EINVAL;
+			goto out;
+		}
+		rc = copy_from_user(
+		  crypto_info,
+		  optval,
+		  sizeof(struct tls12_crypto_info_aes_gcm_128));
+
+		if (rc) {
+			rc = -EFAULT;
+			goto err_crypto_info;
+		}
+		break;
+	}
+	default:
+		rc = -EINVAL;
+		goto out;
+	}
+
+	ctx->sk_write_space = sk->sk_write_space;
+	sk->sk_write_space = tls_write_space;
+
+	ctx->sk_proto_close = sk->sk_prot->close;
+
+	/* currently SW is default, we will have ethtool in future */
+	rc = tls_set_sw_offload(sk, ctx);
+	prot = &tls_sw_prot;
+	if (rc)
+		goto err_crypto_info;
+
+	sk->sk_prot = prot;
+	goto out;
+
+err_crypto_info:
+	memset(crypto_info, 0, sizeof(*crypto_info));
+out:
+	return rc;
+}
+
+static int do_tls_setsockopt(struct sock *sk, int optname,
+			     char __user *optval, unsigned int optlen)
+{
+	int rc = 0;
+
+	switch (optname) {
+	case TLS_TX:
+		lock_sock(sk);
+		rc = do_tls_setsockopt_tx(sk, optval, optlen);
+		release_sock(sk);
+		break;
+	default:
+		rc = -ENOPROTOOPT;
+		break;
+	}
+	return rc;
+}
+
+static int tls_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+
+	if (level != SOL_TLS)
+		return ctx->setsockopt(sk, level, optname, optval, optlen);
+
+	return do_tls_setsockopt(sk, optname, optval, optlen);
+}
+
+static int tls_init(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tls_context *ctx;
+	int rc = 0;
+
+	/* allocate tls context */
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	icsk->icsk_ulp_data = ctx;
+	ctx->setsockopt = sk->sk_prot->setsockopt;
+	ctx->getsockopt = sk->sk_prot->getsockopt;
+	sk->sk_prot = &tls_base_prot;
+out:
+	return rc;
+}
+
+static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
+	.name			= "tls",
+	.owner			= THIS_MODULE,
+	.init			= tls_init,
+};
+
+static int __init tls_register(void)
+{
+	tls_base_prot			= tcp_prot;
+	tls_base_prot.setsockopt	= tls_setsockopt;
+	tls_base_prot.getsockopt	= tls_getsockopt;
+
+	tls_sw_prot			= tls_base_prot;
+	tls_sw_prot.sendmsg		= tls_sw_sendmsg;
+	tls_sw_prot.sendpage            = tls_sw_sendpage;
+	tls_sw_prot.close               = tls_sk_proto_close;
+
+	tcp_register_ulp(&tcp_tls_ulp_ops);
+
+	return 0;
+}
+
+static void __exit tls_unregister(void)
+{
+	tcp_unregister_ulp(&tcp_tls_ulp_ops);
+}
+
+module_init(tls_register);
+module_exit(tls_unregister);
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
new file mode 100644
index 0000000..fa596fa
--- /dev/null
+++ b/net/tls/tls_sw.c
@@ -0,0 +1,772 @@
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
+ * Copyright (c) 2016-2017, Lance Chao <lancerchao@fb.com>. All rights reserved.
+ * Copyright (c) 2016, Fridolin Pokorny <fridolin.pokorny@gmail.com>. All rights reserved.
+ * Copyright (c) 2016, Nikos Mavrogiannopoulos <nmav@gnutls.org>. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <crypto/aead.h>
+
+#include <net/tls.h>
+
+static inline void tls_make_aad(int recv,
+				char *buf,
+				size_t size,
+				char *record_sequence,
+				int record_sequence_size,
+				unsigned char record_type)
+{
+	memcpy(buf, record_sequence, record_sequence_size);
+
+	buf[8] = record_type;
+	buf[9] = TLS_1_2_VERSION_MAJOR;
+	buf[10] = TLS_1_2_VERSION_MINOR;
+	buf[11] = size >> 8;
+	buf[12] = size & 0xFF;
+}
+
+static void trim_sg(struct sock *sk, struct scatterlist *sg,
+		    int *sg_num_elem, unsigned int *sg_size, int target_size)
+{
+	int i = *sg_num_elem - 1;
+	int trim = *sg_size - target_size;
+
+	if (trim <= 0) {
+		WARN_ON(trim < 0);
+		return;
+	}
+
+	*sg_size = target_size;
+	while (trim >= sg[i].length) {
+		trim -= sg[i].length;
+		sk_mem_uncharge(sk, sg[i].length);
+		put_page(sg_page(&sg[i]));
+		i--;
+
+		if (i < 0)
+			goto out;
+	}
+
+	sg[i].length -= trim;
+	sk_mem_uncharge(sk, trim);
+
+out:
+	*sg_num_elem = i + 1;
+}
+
+static void trim_both_sgl(struct sock *sk, int target_size)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+
+	trim_sg(sk, ctx->sg_plaintext_data,
+		&ctx->sg_plaintext_num_elem,
+		&ctx->sg_plaintext_size,
+		target_size);
+
+	if (target_size > 0)
+		target_size += tls_ctx->overhead_size;
+
+	trim_sg(sk, ctx->sg_encrypted_data,
+		&ctx->sg_encrypted_num_elem,
+		&ctx->sg_encrypted_size,
+		target_size);
+}
+
+static int alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
+		    int *sg_num_elem, unsigned int *sg_size,
+		    int first_coalesce)
+{
+	struct page_frag *pfrag;
+	unsigned int size = *sg_size;
+	int num_elem = *sg_num_elem, use = 0, rc = 0;
+	struct scatterlist *sge;
+	unsigned int orig_offset;
+
+	len -= size;
+	pfrag = sk_page_frag(sk);
+
+	while (len > 0) {
+		if (!sk_page_frag_refill(sk, pfrag)) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		use = min_t(int, len, pfrag->size - pfrag->offset);
+
+		if (!sk_wmem_schedule(sk, use)) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		sk_mem_charge(sk, use);
+		size += use;
+		orig_offset = pfrag->offset;
+		pfrag->offset += use;
+
+		sge = sg + num_elem - 1;
+		if (num_elem > first_coalesce && sg_page(sg) == pfrag->page &&
+		    sg->offset + sg->length == orig_offset) {
+			sg->length += use;
+		} else {
+			sge++;
+			sg_unmark_end(sge);
+			sg_set_page(sge, pfrag->page, use, orig_offset);
+			get_page(pfrag->page);
+			++num_elem;
+			if (num_elem == MAX_SKB_FRAGS) {
+				rc = -ENOSPC;
+				break;
+			}
+		}
+
+		len -= use;
+	}
+	goto out;
+
+out:
+	*sg_size = size;
+	*sg_num_elem = num_elem;
+	return rc;
+}
+
+static int alloc_encrypted_sg(struct sock *sk, int len)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	int rc = 0;
+
+	rc = alloc_sg(sk, len, ctx->sg_encrypted_data,
+		      &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, 0);
+
+	return rc;
+}
+
+static int alloc_plaintext_sg(struct sock *sk, int len)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	int rc = 0;
+
+	rc = alloc_sg(sk, len, ctx->sg_plaintext_data,
+		      &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size,
+		      tls_ctx->pending_open_record_frags);
+
+	return rc;
+}
+
+static void free_sg(struct sock *sk, struct scatterlist *sg,
+		    int *sg_num_elem, unsigned int *sg_size)
+{
+	int i, n = *sg_num_elem;
+
+	for (i = 0; i < n; ++i) {
+		sk_mem_uncharge(sk, sg[i].length);
+		put_page(sg_page(&sg[i]));
+	}
+	*sg_num_elem = 0;
+	*sg_size = 0;
+}
+
+static void tls_free_both_sg(struct sock *sk)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+
+	free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem,
+		&ctx->sg_encrypted_size);
+
+	free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem,
+		&ctx->sg_plaintext_size);
+}
+
+static int tls_do_encryption(struct tls_context *tls_ctx,
+			     struct tls_sw_context *ctx, size_t data_len,
+			     gfp_t flags)
+{
+	unsigned int req_size = sizeof(struct aead_request) +
+		crypto_aead_reqsize(ctx->aead_send);
+	struct aead_request *aead_req;
+	int rc;
+
+	aead_req = kmalloc(req_size, flags);
+	if (!aead_req)
+		return -ENOMEM;
+
+	ctx->sg_encrypted_data[0].offset += tls_ctx->prepend_size;
+	ctx->sg_encrypted_data[0].length -= tls_ctx->prepend_size;
+
+	aead_request_set_tfm(aead_req, ctx->aead_send);
+	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
+	aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out,
+			       data_len, tls_ctx->iv);
+	rc = crypto_aead_encrypt(aead_req);
+
+	ctx->sg_encrypted_data[0].offset -= tls_ctx->prepend_size;
+	ctx->sg_encrypted_data[0].length += tls_ctx->prepend_size;
+
+	kfree(aead_req);
+	return rc;
+}
+
+static int tls_push_record(struct sock *sk, int flags,
+			   unsigned char record_type)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	int rc;
+
+	sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1);
+	sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1);
+
+	tls_make_aad(0, ctx->aad_space, ctx->sg_plaintext_size,
+		     tls_ctx->rec_seq, tls_ctx->rec_seq_size,
+		     record_type);
+
+	tls_fill_prepend(tls_ctx,
+			 page_address(sg_page(&ctx->sg_encrypted_data[0])) +
+			 ctx->sg_encrypted_data[0].offset,
+			 ctx->sg_plaintext_size, record_type);
+
+	tls_ctx->pending_open_record_frags = 0;
+	set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags);
+
+	rc = tls_do_encryption(tls_ctx, ctx, ctx->sg_plaintext_size,
+			       sk->sk_allocation);
+	if (rc < 0) {
+		/* If we are called from write_space and
+		 * we fail, we need to set this SOCK_NOSPACE
+		 * to trigger another write_space in the future.
+		 */
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		return rc;
+	}
+
+	free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem,
+		&ctx->sg_plaintext_size);
+
+	ctx->sg_encrypted_num_elem = 0;
+	ctx->sg_encrypted_size = 0;
+
+	/* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */
+	rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags);
+	if (rc < 0 && rc != -EAGAIN)
+		tls_err_abort(sk);
+
+	tls_advance_record_sn(sk, tls_ctx);
+	return rc;
+}
+
+static int tls_sw_push_pending_record(struct sock *sk, int flags)
+{
+	return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA);
+}
+
+static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+			      int length)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct page *pages[MAX_SKB_FRAGS];
+
+	size_t offset;
+	ssize_t copied, use;
+	int i = 0;
+	unsigned int size = ctx->sg_plaintext_size;
+	int num_elem = ctx->sg_plaintext_num_elem;
+	int rc = 0;
+	int maxpages;
+
+	while (length > 0) {
+		i = 0;
+		maxpages = ARRAY_SIZE(ctx->sg_plaintext_data) - num_elem;
+		if (maxpages == 0) {
+			rc = -EFAULT;
+			goto out;
+		}
+		copied = iov_iter_get_pages(from, pages,
+					    length,
+					    maxpages, &offset);
+		if (copied <= 0) {
+			rc = -EFAULT;
+			goto out;
+		}
+
+		iov_iter_advance(from, copied);
+
+		length -= copied;
+		size += copied;
+		while (copied) {
+			use = min_t(int, copied, PAGE_SIZE - offset);
+
+			sg_set_page(&ctx->sg_plaintext_data[num_elem],
+				    pages[i], use, offset);
+			sg_unmark_end(&ctx->sg_plaintext_data[num_elem]);
+			sk_mem_charge(sk, use);
+
+			offset = 0;
+			copied -= use;
+
+			++i;
+			++num_elem;
+		}
+	}
+
+out:
+	ctx->sg_plaintext_size = size;
+	ctx->sg_plaintext_num_elem = num_elem;
+	return rc;
+}
+
+static int memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+			     int bytes)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct scatterlist *sg = ctx->sg_plaintext_data;
+	int copy, i, rc = 0;
+
+	for (i = tls_ctx->pending_open_record_frags;
+	     i < ctx->sg_plaintext_num_elem; ++i) {
+		copy = sg[i].length;
+		if (copy_from_iter(
+				page_address(sg_page(&sg[i])) + sg[i].offset,
+				copy, from) != copy) {
+			rc = -EFAULT;
+			goto out;
+		}
+		bytes -= copy;
+
+		++tls_ctx->pending_open_record_frags;
+
+		if (!bytes)
+			break;
+	}
+
+out:
+	return rc;
+}
+
+int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	int ret = 0;
+	int required_size;
+	long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+	bool eor = !(msg->msg_flags & MSG_MORE);
+	size_t try_to_copy, copied = 0;
+	unsigned char record_type = TLS_RECORD_TYPE_DATA;
+	int record_room;
+	bool full_record;
+	int orig_size;
+
+	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
+		return -ENOTSUPP;
+
+	lock_sock(sk);
+
+	if (tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo))
+		goto send_end;
+
+	if (unlikely(msg->msg_controllen)) {
+		ret = tls_proccess_cmsg(sk, msg, &record_type);
+		if (ret)
+			goto send_end;
+	}
+
+	while (msg_data_left(msg)) {
+		if (sk->sk_err) {
+			ret = sk->sk_err;
+			goto send_end;
+		}
+
+		orig_size = ctx->sg_plaintext_size;
+		full_record = false;
+		try_to_copy = msg_data_left(msg);
+		record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size;
+		if (try_to_copy >= record_room) {
+			try_to_copy = record_room;
+			full_record = true;
+		}
+
+		required_size = ctx->sg_plaintext_size + try_to_copy +
+				tls_ctx->overhead_size;
+
+		if (!sk_stream_memory_free(sk))
+			goto wait_for_sndbuf;
+alloc_encrypted:
+		ret = alloc_encrypted_sg(sk, required_size);
+		if (ret) {
+			if (ret != -ENOSPC)
+				goto wait_for_memory;
+
+			/* Adjust try_to_copy according to the amount that was
+			 * actually allocated. The difference is due
+			 * to max sg elements limit
+			 */
+			try_to_copy -= required_size - ctx->sg_encrypted_size;
+			full_record = true;
+		}
+
+		if (full_record || eor) {
+			ret = zerocopy_from_iter(sk, &msg->msg_iter,
+						 try_to_copy);
+			if (ret)
+				goto fallback_to_reg_send;
+
+			copied += try_to_copy;
+			ret = tls_push_record(sk, msg->msg_flags, record_type);
+			if (!ret)
+				continue;
+			if (ret == -EAGAIN)
+				goto send_end;
+
+			copied -= try_to_copy;
+fallback_to_reg_send:
+			iov_iter_revert(&msg->msg_iter,
+					ctx->sg_plaintext_size - orig_size);
+			trim_sg(sk, ctx->sg_plaintext_data,
+				&ctx->sg_plaintext_num_elem,
+				&ctx->sg_plaintext_size,
+				orig_size);
+		}
+
+		required_size = ctx->sg_plaintext_size + try_to_copy;
+alloc_plaintext:
+		ret = alloc_plaintext_sg(sk, required_size);
+		if (ret) {
+			if (ret != -ENOSPC)
+				goto wait_for_memory;
+
+			/* Adjust try_to_copy according to the amount that was
+			 * actually allocated. The difference is due
+			 * to max sg elements limit
+			 */
+			try_to_copy -= required_size - ctx->sg_plaintext_size;
+			full_record = true;
+
+			trim_sg(sk, ctx->sg_encrypted_data,
+				&ctx->sg_encrypted_num_elem,
+				&ctx->sg_encrypted_size,
+				ctx->sg_plaintext_size +
+				tls_ctx->overhead_size);
+		}
+
+		ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy);
+		if (ret)
+			goto trim_sgl;
+
+		copied += try_to_copy;
+		if (full_record || eor) {
+push_record:
+			ret = tls_push_record(sk, msg->msg_flags, record_type);
+			if (ret) {
+				if (ret == -ENOMEM)
+					goto wait_for_memory;
+
+				goto send_end;
+			}
+		}
+
+		continue;
+
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		ret = sk_stream_wait_memory(sk, &timeo);
+		if (ret) {
+trim_sgl:
+			trim_both_sgl(sk, orig_size);
+			goto send_end;
+		}
+
+		if (tls_is_pending_closed_record(tls_ctx))
+			goto push_record;
+
+		if (ctx->sg_encrypted_size < required_size)
+			goto alloc_encrypted;
+
+		goto alloc_plaintext;
+	}
+
+send_end:
+	ret = sk_stream_error(sk, msg->msg_flags, ret);
+
+	release_sock(sk);
+	return copied ? copied : ret;
+}
+
+int tls_sw_sendpage(struct sock *sk, struct page *page,
+		    int offset, size_t size, int flags)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	int ret = 0;
+	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+	bool eor;
+	size_t orig_size = size;
+	unsigned char record_type = TLS_RECORD_TYPE_DATA;
+	struct scatterlist *sg;
+	bool full_record;
+	int record_room;
+
+	if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
+		      MSG_SENDPAGE_NOTLAST))
+		return -ENOTSUPP;
+
+	/* No MSG_EOR from splice, only look at MSG_MORE */
+	eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST));
+
+	lock_sock(sk);
+
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+	if (tls_complete_pending_work(sk, tls_ctx, flags, &timeo))
+		goto sendpage_end;
+
+	/* Call the sk_stream functions to manage the sndbuf mem. */
+	while (size > 0) {
+		size_t copy, required_size;
+
+		if (sk->sk_err) {
+			ret = sk->sk_err;
+			goto sendpage_end;
+		}
+
+		full_record = false;
+		record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size;
+		copy = size;
+		if (copy >= record_room) {
+			copy = record_room;
+			full_record = true;
+		}
+		required_size = ctx->sg_plaintext_size + copy +
+			      tls_ctx->overhead_size;
+
+		if (!sk_stream_memory_free(sk))
+			goto wait_for_sndbuf;
+alloc_payload:
+		ret = alloc_encrypted_sg(sk, required_size);
+		if (ret) {
+			if (ret != -ENOSPC)
+				goto wait_for_memory;
+
+			/* Adjust copy according to the amount that was
+			 * actually allocated. The difference is due
+			 * to max sg elements limit
+			 */
+			copy -= required_size - ctx->sg_plaintext_size;
+			full_record = true;
+		}
+
+		get_page(page);
+		sg = ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem;
+		sg_set_page(sg, page, copy, offset);
+		ctx->sg_plaintext_num_elem++;
+
+		sk_mem_charge(sk, copy);
+		offset += copy;
+		size -= copy;
+		ctx->sg_plaintext_size += copy;
+		tls_ctx->pending_open_record_frags = ctx->sg_plaintext_num_elem;
+
+		if (full_record || eor ||
+		    ctx->sg_plaintext_num_elem ==
+		    ARRAY_SIZE(ctx->sg_plaintext_data)) {
+push_record:
+			ret = tls_push_record(sk, flags, record_type);
+			if (ret) {
+				if (ret == -ENOMEM)
+					goto wait_for_memory;
+
+				goto sendpage_end;
+			}
+		}
+		continue;
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		ret = sk_stream_wait_memory(sk, &timeo);
+		if (ret) {
+			trim_both_sgl(sk, ctx->sg_plaintext_size);
+			goto sendpage_end;
+		}
+
+		if (tls_is_pending_closed_record(tls_ctx))
+			goto push_record;
+
+		goto alloc_payload;
+	}
+
+sendpage_end:
+	if (orig_size > size)
+		ret = orig_size - size;
+	else
+		ret = sk_stream_error(sk, flags, ret);
+
+	release_sock(sk);
+	return ret;
+}
+
+void tls_sw_free_resources(struct sock *sk)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+
+	if (ctx->aead_send)
+		crypto_free_aead(ctx->aead_send);
+
+	tls_free_both_sg(sk);
+
+	kfree(ctx);
+}
+
+int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
+{
+	char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
+	struct tls_crypto_info *crypto_info;
+	struct tls12_crypto_info_aes_gcm_128 *gcm_128_info;
+	struct tls_sw_context *sw_ctx;
+	u16 nonce_size, tag_size, iv_size, rec_seq_size;
+	char *iv, *rec_seq;
+	int rc = 0;
+
+	if (!ctx) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	if (ctx->priv_ctx) {
+		rc = -EEXIST;
+		goto out;
+	}
+
+	sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL);
+	if (!sw_ctx) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	ctx->priv_ctx = (struct tls_offload_context *)sw_ctx;
+	ctx->free_resources = tls_sw_free_resources;
+
+	crypto_info = &ctx->crypto_send;
+	switch (crypto_info->cipher_type) {
+	case TLS_CIPHER_AES_GCM_128: {
+		nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
+		tag_size = TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+		iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
+		iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv;
+		rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE;
+		rec_seq =
+		 ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq;
+		gcm_128_info =
+			(struct tls12_crypto_info_aes_gcm_128 *)crypto_info;
+		break;
+	}
+	default:
+		rc = -EINVAL;
+		goto out;
+	}
+
+	ctx->prepend_size = TLS_HEADER_SIZE + nonce_size;
+	ctx->tag_size = tag_size;
+	ctx->overhead_size = ctx->prepend_size + ctx->tag_size;
+	ctx->iv_size = iv_size;
+	ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+			  GFP_KERNEL);
+	if (!ctx->iv) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	memcpy(ctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+	memcpy(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
+	ctx->rec_seq_size = rec_seq_size;
+	ctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
+	if (!ctx->rec_seq) {
+		rc = -ENOMEM;
+		goto free_iv;
+	}
+	memcpy(ctx->rec_seq, rec_seq, rec_seq_size);
+
+	sg_init_table(sw_ctx->sg_encrypted_data,
+		      ARRAY_SIZE(sw_ctx->sg_encrypted_data));
+	sg_init_table(sw_ctx->sg_plaintext_data,
+		      ARRAY_SIZE(sw_ctx->sg_plaintext_data));
+
+	sg_init_table(sw_ctx->sg_aead_in, 2);
+	sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space,
+		   sizeof(sw_ctx->aad_space));
+	sg_unmark_end(&sw_ctx->sg_aead_in[1]);
+	sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data);
+	sg_init_table(sw_ctx->sg_aead_out, 2);
+	sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space,
+		   sizeof(sw_ctx->aad_space));
+	sg_unmark_end(&sw_ctx->sg_aead_out[1]);
+	sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data);
+
+	if (!sw_ctx->aead_send) {
+		sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0);
+		if (IS_ERR(sw_ctx->aead_send)) {
+			rc = PTR_ERR(sw_ctx->aead_send);
+			sw_ctx->aead_send = NULL;
+			goto free_rec_seq;
+		}
+	}
+
+	ctx->push_pending_record = tls_sw_push_pending_record;
+
+	memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+
+	rc = crypto_aead_setkey(sw_ctx->aead_send, keyval,
+				TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+	if (rc)
+		goto free_aead;
+
+	rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tag_size);
+	if (!rc)
+		goto out;
+
+free_aead:
+	crypto_free_aead(sw_ctx->aead_send);
+	sw_ctx->aead_send = NULL;
+free_rec_seq:
+	kfree(ctx->rec_seq);
+	ctx->rec_seq = NULL;
+free_iv:
+	kfree(ctx->iv);
+	ctx->iv = NULL;
+out:
+	return rc;
+}
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v3 net-next 4/4] tls: Documentation
       [not found] <cover.1497465295.git.davejwatson@fb.com>
                   ` (2 preceding siblings ...)
  2017-06-14 18:37 ` [PATCH v3 net-next 3/4] tls: kernel TLS support Dave Watson
@ 2017-06-14 18:37 ` Dave Watson
  3 siblings, 0 replies; 20+ messages in thread
From: Dave Watson @ 2017-06-14 18:37 UTC (permalink / raw)
  To: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet
  Cc: Alexei Starovoitov, nmav, fridolin.pokorny

Add documentation for the tcp ULP tls interface.

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Dave Watson <davejwatson@fb.com>
---
 Documentation/networking/tls.txt | 149 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 Documentation/networking/tls.txt

diff --git a/Documentation/networking/tls.txt b/Documentation/networking/tls.txt
new file mode 100644
index 0000000..77ed006
--- /dev/null
+++ b/Documentation/networking/tls.txt
@@ -0,0 +1,149 @@
+Overview
+========
+
+Transport Layer Security (TLS) is an Upper Layer Protocol (ULP) that runs over
+TCP. TLS provides end-to-end data integrity and confidentiality.
+
+User interface
+==============
+
+Creating a TLS connection
+-------------------------
+
+First create a new TCP socket and set the TLS ULP.
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+  setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
+
+Setting the TLS ULP allows us to set/get TLS socket options. Currently
+only symmetric encryption is handled in the kernel.  After the TLS
+handshake is complete, we have all the parameters required to move the
+data-path to the kernel. There are separate socket options for moving
+the transmit and the receive paths into the kernel.
+
+  /* From linux/tls.h */
+  struct tls_crypto_info {
+          unsigned short version;
+          unsigned short cipher_type;
+  };
+
+  struct tls12_crypto_info_aes_gcm_128 {
+          struct tls_crypto_info info;
+          unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE];
+          unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
+          unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE];
+          unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE];
+  };
+
+
+  struct tls12_crypto_info_aes_gcm_128 crypto_info;
+
+  crypto_info.info.version = TLS_1_2_VERSION;
+  crypto_info.info.cipher_type = TLS_CIPHER_AES_GCM_128;
+  memcpy(crypto_info.iv, iv_write, TLS_CIPHER_AES_GCM_128_IV_SIZE);
+  memcpy(crypto_info.rec_seq, seq_number_write,
+					TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
+  memcpy(crypto_info.key, cipher_key_write, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+  memcpy(crypto_info.salt, implicit_iv_write, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+
+  setsockopt(sock, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));
+
+Sending TLS application data
+----------------------------
+
+After setting the TLS_TX socket option, all application data sent over this
+socket is encrypted using TLS with the parameters provided in the socket
+option. For example, we can send an encrypted hello world record as follows:
+
+  const char *msg = "hello world\n";
+  send(sock, msg, strlen(msg), 0);
+
+Data passed to send() is encrypted directly from the userspace buffer
+into the encrypted kernel send buffer when possible.
+
+The sendfile system call will send the file's data over TLS records of the
+maximum length (2^14 bytes).
+
+  file = open(filename, O_RDONLY);
+  fstat(file, &stat);
+  sendfile(sock, file, &offset, stat.st_size);
+
+TLS records are created and sent after each send() call, unless
+MSG_MORE is passed.  MSG_MORE will delay creation of a record until a
+send() without MSG_MORE, or until the maximum record size is reached.
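+
+For example, the following pair of calls (a sketch; part1/part2 stand in
+for application buffers) produces a single record:
+
+  send(sock, part1, part1_len, MSG_MORE);
+  send(sock, part2, part2_len, 0);  /* record is created and sent here */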
+
+The kernel will need to allocate a buffer for the encrypted data.
+This buffer is allocated at the time send() is called, such that
+either the entire send() call will return -ENOMEM (or block waiting
+for memory), or the encryption will always succeed.  If send() returns
+-ENOMEM and some data was left on the socket buffer from a previous
+call using MSG_MORE, the MSG_MORE data is left on the socket buffer.
+
+Sending TLS control messages
+----------------------------
+
+Other than application data, TLS has control messages such as alert
+messages (record type 21) and handshake messages (record type 22).
+These messages can be sent over the socket by providing the TLS record type
+via a CMSG. For example, the following function sends @data of @length bytes
+using a record of type @record_type.
+
+  /* Send a TLS control message using record_type */
+  static int ktls_send_ctrl_message(int sock, unsigned char record_type,
+                                  void *data, size_t length)
+  {
+        struct msghdr msg = {0};
+        int cmsg_len = sizeof(record_type);
+        struct cmsghdr *cmsg;
+        char buf[CMSG_SPACE(cmsg_len)];
+        struct iovec msg_iov;   /* Vector of data to send/receive into.  */
+
+        msg.msg_control = buf;
+        msg.msg_controllen = sizeof(buf);
+        cmsg = CMSG_FIRSTHDR(&msg);
+        cmsg->cmsg_level = SOL_TLS;
+        cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
+        cmsg->cmsg_len = CMSG_LEN(cmsg_len);
+        *CMSG_DATA(cmsg) = record_type;
+        msg.msg_controllen = cmsg->cmsg_len;
+
+        msg_iov.iov_base = data;
+        msg_iov.iov_len = length;
+        msg.msg_iov = &msg_iov;
+        msg.msg_iovlen = 1;
+
+        return sendmsg(sock, &msg, 0);
+  }
+
+Control message data should be provided unencrypted, and will be
+encrypted by the kernel.
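+
+For example, a close_notify alert (record type 21; the two alert bytes are
+level 1, "warning", and description 0, "close_notify") could be sent with
+the function above:
+
+  unsigned char alert[2] = {1, 0};
+
+  ktls_send_ctrl_message(sock, 21, alert, sizeof(alert));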
+
+Integrating into a userspace TLS library
+----------------------------------------
+
+At a high level, the kernel TLS ULP is a replacement for the record
+layer of a userspace TLS library.
+
+A patchset to OpenSSL to use ktls as the record layer is here:
+
+https://github.com/Mellanox/tls-openssl
+
+An example of calling send() directly after a handshake using
+gnutls is here.  Since the tool doesn't implement a full record
+layer, control messages are not supported:
+
+https://github.com/Mellanox/tls-af_ktls_tool
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 3/4] tls: kernel TLS support
  2017-06-14 18:37 ` [PATCH v3 net-next 3/4] tls: kernel TLS support Dave Watson
@ 2017-06-16 20:56   ` Stephen Hemminger
  2017-06-16 20:58   ` Stephen Hemminger
  2017-07-11  6:29   ` Steffen Klassert
  2 siblings, 0 replies; 20+ messages in thread
From: Stephen Hemminger @ 2017-06-16 20:56 UTC (permalink / raw)
  To: Dave Watson
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, nmav, fridolin.pokorny

On Wed, 14 Jun 2017 11:37:39 -0700
Dave Watson <davejwatson@fb.com> wrote:

> +
> +static inline struct tls_context *tls_get_ctx(const struct sock *sk)
> +{
> +	struct inet_connection_sock *icsk = inet_csk(sk);
> +
> +	return icsk->icsk_ulp_data;
> +}
> +
> +static inline struct tls_sw_context *tls_sw_ctx(
> +		const struct tls_context *tls_ctx)
> +{
> +	return (struct tls_sw_context *)tls_ctx->priv_ctx;
> +}
> +
> +static inline struct tls_offload_context *tls_offload_ctx(
> +		const struct tls_context *tls_ctx)
> +{
> +	return (struct tls_offload_context *)tls_ctx->priv_ctx;
> +}
> +

Since priv_ctx is void *, casts here are unnecessary.
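
For illustration, the first helper without the cast (a void * converts
implicitly in C):

	static inline struct tls_sw_context *tls_sw_ctx(
			const struct tls_context *tls_ctx)
	{
		return tls_ctx->priv_ctx;
	}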

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 3/4] tls: kernel TLS support
  2017-06-14 18:37 ` [PATCH v3 net-next 3/4] tls: kernel TLS support Dave Watson
  2017-06-16 20:56   ` Stephen Hemminger
@ 2017-06-16 20:58   ` Stephen Hemminger
  2017-06-17  0:35     ` Dave Watson
  2017-07-11  6:29   ` Steffen Klassert
  2 siblings, 1 reply; 20+ messages in thread
From: Stephen Hemminger @ 2017-06-16 20:58 UTC (permalink / raw)
  To: Dave Watson
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, nmav, fridolin.pokorny

On Wed, 14 Jun 2017 11:37:39 -0700
Dave Watson <davejwatson@fb.com> wrote:

> --- /dev/null
> +++ b/net/tls/Kconfig
> @@ -0,0 +1,12 @@
> +#
> +# TLS configuration
> +#
> +config TLS
> +	tristate "Transport Layer Security support"
> +	depends on NET
> +	default m
> +	---help---
> +	Enable kernel support for TLS protocol. This allows symmetric
> +	encryption handling of the TLS protocol to be done in-kernel.
> +
> +	If unsure, say M.

I understand that this will be useful to lots of people and most distributions
will enable it. But the de facto policy in kernel configuration has been that
new features in the kernel default to being disabled.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 1/4] tcp: ULP infrastructure
  2017-06-14 18:37 ` [PATCH v3 net-next 1/4] tcp: ULP infrastructure Dave Watson
@ 2017-06-17  0:14   ` Christoph Paasch
  2017-07-29 20:19     ` Tom Herbert
  2017-06-25  2:42   ` Levin, Alexander (Sasha Levin)
  2017-07-29 20:12   ` Tom Herbert
  2 siblings, 1 reply; 20+ messages in thread
From: Christoph Paasch @ 2017-06-17  0:14 UTC (permalink / raw)
  To: Dave Watson
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, nmav, fridolin.pokorny

Hello,

On 14/06/17 - 11:37:14, Dave Watson wrote:
> Add the infrastructure for attaching Upper Layer Protocols (ULPs) over TCP
> sockets. Based on a similar infrastructure in tcp_cong.  The idea is that any
> ULP can add its own logic by changing the TCP proto_ops structure to its own
> methods.
> 
> Example usage:
> 
> setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
> 
> modules will call:
> tcp_register_ulp(&tcp_tls_ulp_ops);
> 
> to register/unregister their ulp, with an init function and name.
> 
> A list of registered ulps will be returned by tcp_get_available_ulp, which is
> hooked up to /proc.  Example:
> 
> $ cat /proc/sys/net/ipv4/tcp_available_ulp
> tls
> 
> There is currently no functionality to remove or chain ULPs, but
> it should be possible to add these in the future if needed.
> 
> Signed-off-by: Boris Pismenny <borisp@mellanox.com>
> Signed-off-by: Dave Watson <davejwatson@fb.com>
> ---
>  include/net/inet_connection_sock.h |   4 ++
>  include/net/tcp.h                  |  25 +++++++
>  include/uapi/linux/tcp.h           |   1 +
>  net/ipv4/Makefile                  |   2 +-
>  net/ipv4/sysctl_net_ipv4.c         |  25 +++++++
>  net/ipv4/tcp.c                     |  28 ++++++++
>  net/ipv4/tcp_ipv4.c                |   2 +
>  net/ipv4/tcp_ulp.c                 | 134 +++++++++++++++++++++++++++++++++++++
>  8 files changed, 220 insertions(+), 1 deletion(-)
>  create mode 100644 net/ipv4/tcp_ulp.c

I know I'm pretty late to the game (and maybe this has already been
discussed but I couldn't find anything in the archives), but I am wondering
what the take is on potential races between setsockopt() and other system calls.

For example, one might race the setsockopt() with a sendmsg(), and the sendmsg
might end up blocking on the lock_sock in tcp_sendmsg, waiting for
tcp_set_ulp() to finish changing sk_prot. When the setsockopt() finishes, we
are then inside tcp_sendmsg() coming directly from sendmsg(), while we
should have been in the ULP's sendmsg.

It seems like TLS-ULP is resilient to this (or at least, won't cause a panic),
but there might be more exotic users of ULP in the future, that change other
callbacks and then things might go wrong.


Thoughts?


Thanks,
Christoph

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 3/4] tls: kernel TLS support
  2017-06-16 20:58   ` Stephen Hemminger
@ 2017-06-17  0:35     ` Dave Watson
  0 siblings, 0 replies; 20+ messages in thread
From: Dave Watson @ 2017-06-17  0:35 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, nmav, fridolin.pokorny

On 06/16/17 01:58 PM, Stephen Hemminger wrote:
> On Wed, 14 Jun 2017 11:37:39 -0700
> Dave Watson <davejwatson@fb.com> wrote:
> 
> > --- /dev/null
> > +++ b/net/tls/Kconfig
> > @@ -0,0 +1,12 @@
> > +#
> > +# TLS configuration
> > +#
> > +config TLS
> > +	tristate "Transport Layer Security support"
> > +	depends on NET
> > +	default m
> > +	---help---
> > +	Enable kernel support for TLS protocol. This allows symmetric
> > +	encryption handling of the TLS protocol to be done in-kernel.
> > +
> > +	If unsure, say M.
> 
> I understand that this will be useful to lots of people and most distributions
> will enable it. But the defacto policy in kernel configuration has been that
> new features in kernel default to being disabled.

Sure, will send a patch to switch to default n.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 1/4] tcp: ULP infrastructure
  2017-06-14 18:37 ` [PATCH v3 net-next 1/4] tcp: ULP infrastructure Dave Watson
  2017-06-17  0:14   ` Christoph Paasch
@ 2017-06-25  2:42   ` Levin, Alexander (Sasha Levin)
  2017-06-26 14:30     ` Dave Watson
  2017-07-29 20:12   ` Tom Herbert
  2 siblings, 1 reply; 20+ messages in thread
From: Levin, Alexander (Sasha Levin) @ 2017-06-25  2:42 UTC (permalink / raw)
  To: Dave Watson
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, nmav, fridolin.pokorny

On Wed, Jun 14, 2017 at 11:37:14AM -0700, Dave Watson wrote:
>Add the infrastructure for attaching Upper Layer Protocols (ULPs) over TCP
>sockets. Based on a similar infrastructure in tcp_cong.  The idea is that any
>ULP can add its own logic by changing the TCP proto_ops structure to its own
>methods.
>
>Example usage:
>
>setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
>
>modules will call:
>tcp_register_ulp(&tcp_tls_ulp_ops);
>
>to register/unregister their ulp, with an init function and name.
>
>A list of registered ulps will be returned by tcp_get_available_ulp, which is
>hooked up to /proc.  Example:
>
>$ cat /proc/sys/net/ipv4/tcp_available_ulp
>tls
>
>There is currently no functionality to remove or chain ULPs, but
>it should be possible to add these in the future if needed.
>
>Signed-off-by: Boris Pismenny <borisp@mellanox.com>
>Signed-off-by: Dave Watson <davejwatson@fb.com>

Hey Dave,

I'm seeing the following while fuzzing, which was bisected to this commit:

==================================================================
BUG: KASAN: null-ptr-deref in copy_to_user include/linux/uaccess.h:168 [inline]
BUG: KASAN: null-ptr-deref in do_tcp_getsockopt.isra.33+0x24f/0x1e30 net/ipv4/tcp.c:3057
Read of size 4 at addr 0000000000000020 by task syz-executor1/15452

CPU: 0 PID: 15452 Comm: syz-executor1 Not tainted 4.12.0-rc6-next-20170623+ #173
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1ubuntu1 04/01/2014
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x11d/0x1e5 lib/dump_stack.c:52
 kasan_report_error mm/kasan/report.c:349 [inline]
 kasan_report+0x15e/0x370 mm/kasan/report.c:408
 check_memory_region_inline mm/kasan/kasan.c:260 [inline]
 check_memory_region+0x14b/0x1a0 mm/kasan/kasan.c:267
 kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272
 copy_to_user include/linux/uaccess.h:168 [inline]
 do_tcp_getsockopt.isra.33+0x24f/0x1e30 net/ipv4/tcp.c:3057
 tcp_getsockopt+0xb0/0xd0 net/ipv4/tcp.c:3194
 sock_common_getsockopt+0x95/0xd0 net/core/sock.c:2863
 SYSC_getsockopt net/socket.c:1869 [inline]
 SyS_getsockopt+0x180/0x360 net/socket.c:1851
 do_syscall_64+0x267/0x740 arch/x86/entry/common.c:284
 entry_SYSCALL64_slow_path+0x25/0x25
RIP: 0033:0x451759
RSP: 002b:00007f5dc2b1fc08 EFLAGS: 00000216 ORIG_RAX: 0000000000000037
RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 0000000000451759
RDX: 000000000000001f RSI: 0000000000000006 RDI: 0000000000000005
RBP: 0000000000000c30 R08: 00000000207bf000 R09: 0000000000000000
R10: 0000000020000ffc R11: 0000000000000216 R12: 00000000004b824b
R13: 00000000ffffffff R14: 0000000000000005 R15: 0000000000000006
==================================================================
Disabling lock debugging due to kernel taint
Kernel panic - not syncing: panic_on_warn set ...

CPU: 0 PID: 15452 Comm: syz-executor1 Tainted: G    B           4.12.0-rc6-next-20170623+ #173
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1ubuntu1 04/01/2014
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x11d/0x1e5 lib/dump_stack.c:52
 panic+0x1bc/0x3ad kernel/panic.c:180
 kasan_end_report+0x47/0x4f mm/kasan/report.c:176
 kasan_report_error mm/kasan/report.c:356 [inline]
 kasan_report+0x167/0x370 mm/kasan/report.c:408
 check_memory_region_inline mm/kasan/kasan.c:260 [inline]
 check_memory_region+0x14b/0x1a0 mm/kasan/kasan.c:267
 kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272
 copy_to_user include/linux/uaccess.h:168 [inline]
 do_tcp_getsockopt.isra.33+0x24f/0x1e30 net/ipv4/tcp.c:3057
 tcp_getsockopt+0xb0/0xd0 net/ipv4/tcp.c:3194
 sock_common_getsockopt+0x95/0xd0 net/core/sock.c:2863
 SYSC_getsockopt net/socket.c:1869 [inline]
 SyS_getsockopt+0x180/0x360 net/socket.c:1851
 do_syscall_64+0x267/0x740 arch/x86/entry/common.c:284
 entry_SYSCALL64_slow_path+0x25/0x25
RIP: 0033:0x451759
RSP: 002b:00007f5dc2b1fc08 EFLAGS: 00000216 ORIG_RAX: 0000000000000037
RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 0000000000451759
RDX: 000000000000001f RSI: 0000000000000006 RDI: 0000000000000005
RBP: 0000000000000c30 R08: 00000000207bf000 R09: 0000000000000000
R10: 0000000020000ffc R11: 0000000000000216 R12: 00000000004b824b
R13: 00000000ffffffff R14: 0000000000000005 R15: 0000000000000006
Dumping ftrace buffer:
   (ftrace buffer empty)
Kernel Offset: 0x24800000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
Rebooting in 86400 seconds..

-- 

Thanks,
Sasha

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 1/4] tcp: ULP infrastructure
  2017-06-25  2:42   ` Levin, Alexander (Sasha Levin)
@ 2017-06-26 14:30     ` Dave Watson
  2017-06-26 15:07       ` Levin, Alexander (Sasha Levin)
  0 siblings, 1 reply; 20+ messages in thread
From: Dave Watson @ 2017-06-26 14:30 UTC (permalink / raw)
  To: Levin, Alexander (Sasha Levin)
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, nmav, fridolin.pokorny

On 06/25/17 02:42 AM, Levin, Alexander (Sasha Levin) wrote:
> On Wed, Jun 14, 2017 at 11:37:14AM -0700, Dave Watson wrote:
> >Add the infrastructure for attaching Upper Layer Protocols (ULPs) over TCP
> >sockets. Based on a similar infrastructure in tcp_cong.  The idea is that any
> >ULP can add its own logic by changing the TCP proto_ops structure to its own
> >methods.
> >
> >Example usage:
> >
> >setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
> >
> >modules will call:
> >tcp_register_ulp(&tcp_tls_ulp_ops);
> >
> >to register/unregister their ulp, with an init function and name.
> >
> >A list of registered ulps will be returned by tcp_get_available_ulp, which is
> >hooked up to /proc.  Example:
> >
> >$ cat /proc/sys/net/ipv4/tcp_available_ulp
> >tls
> >
> >There is currently no functionality to remove or chain ULPs, but
> >it should be possible to add these in the future if needed.
> >
> >Signed-off-by: Boris Pismenny <borisp@mellanox.com>
> >Signed-off-by: Dave Watson <davejwatson@fb.com>
> 
> Hey Dave,
> 
> I'm seeing the following while fuzzing, which was bisected to this commit:
> 
> ==================================================================
> BUG: KASAN: null-ptr-deref in copy_to_user include/linux/uaccess.h:168 [inline]
> BUG: KASAN: null-ptr-deref in do_tcp_getsockopt.isra.33+0x24f/0x1e30 net/ipv4/tcp.c:3057
> Read of size 4 at addr 0000000000000020 by task syz-executor1/15452

At a glance, this looks like it was fixed already by

https://www.mail-archive.com/netdev@vger.kernel.org/msg175226.html

Can you recheck with that patch, or verify that you already have it?
Thanks.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 1/4] tcp: ULP infrastructure
  2017-06-26 14:30     ` Dave Watson
@ 2017-06-26 15:07       ` Levin, Alexander (Sasha Levin)
  0 siblings, 0 replies; 20+ messages in thread
From: Levin, Alexander (Sasha Levin) @ 2017-06-26 15:07 UTC (permalink / raw)
  To: Dave Watson
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, nmav, fridolin.pokorny

On Mon, Jun 26, 2017 at 07:30:19AM -0700, Dave Watson wrote:
>On 06/25/17 02:42 AM, Levin, Alexander (Sasha Levin) wrote:
>> On Wed, Jun 14, 2017 at 11:37:14AM -0700, Dave Watson wrote:
>> >Add the infrastructure for attaching Upper Layer Protocols (ULPs) over TCP
>> >sockets. Based on a similar infrastructure in tcp_cong.  The idea is that any
>> >ULP can add its own logic by changing the TCP proto_ops structure to its own
>> >methods.
>> >
>> >Example usage:
>> >
>> >setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
>> >
>> >modules will call:
>> >tcp_register_ulp(&tcp_tls_ulp_ops);
>> >
>> >to register/unregister their ulp, with an init function and name.
>> >
>> >A list of registered ulps will be returned by tcp_get_available_ulp, which is
>> >hooked up to /proc.  Example:
>> >
>> >$ cat /proc/sys/net/ipv4/tcp_available_ulp
>> >tls
>> >
>> >There is currently no functionality to remove or chain ULPs, but
>> >it should be possible to add these in the future if needed.
>> >
>> >Signed-off-by: Boris Pismenny <borisp@mellanox.com>
>> >Signed-off-by: Dave Watson <davejwatson@fb.com>
>>
>> Hey Dave,
>>
>> I'm seeing the following while fuzzing, which was bisected to this commit:
>>
>> ==================================================================
>> BUG: KASAN: null-ptr-deref in copy_to_user include/linux/uaccess.h:168 [inline]
>> BUG: KASAN: null-ptr-deref in do_tcp_getsockopt.isra.33+0x24f/0x1e30 net/ipv4/tcp.c:3057
>> Read of size 4 at addr 0000000000000020 by task syz-executor1/15452
>
>At a glance, this looks like it was fixed already by
>
>https://www.mail-archive.com/netdev@vger.kernel.org/msg175226.html
>
>Can you recheck with that patch, or verify that you already have it?
>Thanks.

I've already tried this patch; it doesn't fix the issue I've reported.

-- 

Thanks,
Sasha

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 3/4] tls: kernel TLS support
  2017-06-14 18:37 ` [PATCH v3 net-next 3/4] tls: kernel TLS support Dave Watson
  2017-06-16 20:56   ` Stephen Hemminger
  2017-06-16 20:58   ` Stephen Hemminger
@ 2017-07-11  6:29   ` Steffen Klassert
  2017-07-11 18:53     ` Dave Watson
  2 siblings, 1 reply; 20+ messages in thread
From: Steffen Klassert @ 2017-07-11  6:29 UTC (permalink / raw)
  To: Dave Watson
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, nmav, fridolin.pokorny

Sorry for replying to old mail...

On Wed, Jun 14, 2017 at 11:37:39AM -0700, Dave Watson wrote:
> +static int tls_do_encryption(struct tls_context *tls_ctx,
> +			     struct tls_sw_context *ctx, size_t data_len,
> +			     gfp_t flags)
> +{
> +	unsigned int req_size = sizeof(struct aead_request) +
> +		crypto_aead_reqsize(ctx->aead_send);
> +	struct aead_request *aead_req;
> +	int rc;
> +
> +	aead_req = kmalloc(req_size, flags);
> +	if (!aead_req)
> +		return -ENOMEM;
> +
> +	ctx->sg_encrypted_data[0].offset += tls_ctx->prepend_size;
> +	ctx->sg_encrypted_data[0].length -= tls_ctx->prepend_size;
> +
> +	aead_request_set_tfm(aead_req, ctx->aead_send);
> +	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
> +	aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out,
> +			       data_len, tls_ctx->iv);
> +	rc = crypto_aead_encrypt(aead_req);
> +
> +	ctx->sg_encrypted_data[0].offset -= tls_ctx->prepend_size;
> +	ctx->sg_encrypted_data[0].length += tls_ctx->prepend_size;
> +
> +	kfree(aead_req);
> +	return rc;
> +}

...

> +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
> +{

...

> +
> +	if (!sw_ctx->aead_send) {
> +		sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0);
> +		if (IS_ERR(sw_ctx->aead_send)) {
> +			rc = PTR_ERR(sw_ctx->aead_send);
> +			sw_ctx->aead_send = NULL;
> +			goto free_rec_seq;
> +		}
> +	}
> +

When I look at how you allocate the aead transformation, it seems
that you should either register an asynchronous callback with
aead_request_set_callback(), or request a synchronous algorithm.

Otherwise you will crash on an asynchronous crypto return, no?

Also, it seems that you have your scatterlists per crypto
transformation instead of per crypto request. Is this intentional?

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 3/4] tls: kernel TLS support
  2017-07-11  6:29   ` Steffen Klassert
@ 2017-07-11 18:53     ` Dave Watson
  2017-07-11 20:24       ` Eric Biggers
  2017-07-12  7:20       ` Steffen Klassert
  0 siblings, 2 replies; 20+ messages in thread
From: Dave Watson @ 2017-07-11 18:53 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, nmav, fridolin.pokorny

On 07/11/17 08:29 AM, Steffen Klassert wrote:
> Sorry for replying to old mail...
> > +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
> > +{
> 
> ...
> 
> > +
> > +	if (!sw_ctx->aead_send) {
> > +		sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0);
> > +		if (IS_ERR(sw_ctx->aead_send)) {
> > +			rc = PTR_ERR(sw_ctx->aead_send);
> > +			sw_ctx->aead_send = NULL;
> > +			goto free_rec_seq;
> > +		}
> > +	}
> > +
> 
> When I look at how you allocate the aead transformation, it seems
> that you should either register an asynchronous callback with
> aead_request_set_callback(), or request a synchronous algorithm.
> 
> Otherwise you will crash on an asynchronous crypto return, no?

The intention is for it to be synchronous, and gather directly from
userspace buffers.  It looks like calling
crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC) is the correct way
to request synchronous algorithms only?

> Also, it seems that you have your scatterlists per crypto
> transformation instead of per crypto request. Is this intentional?

We hold the socket lock and only one crypto op can happen at a time,
so we reuse the scatterlists.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 3/4] tls: kernel TLS support
  2017-07-11 18:53     ` Dave Watson
@ 2017-07-11 20:24       ` Eric Biggers
  2017-07-12  7:20       ` Steffen Klassert
  1 sibling, 0 replies; 20+ messages in thread
From: Eric Biggers @ 2017-07-11 20:24 UTC (permalink / raw)
  To: Dave Watson
  Cc: Steffen Klassert, Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny,
	Liran Liss, Matan Barak, David Miller, netdev, Tom Herbert,
	herbert, linux-crypto, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, nmav, fridolin.pokorny

On Tue, Jul 11, 2017 at 11:53:11AM -0700, Dave Watson wrote:
> On 07/11/17 08:29 AM, Steffen Klassert wrote:
> > Sorry for replying to old mail...
> > > +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
> > > +{
> > 
> > ...
> > 
> > > +
> > > +	if (!sw_ctx->aead_send) {
> > > +		sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0);
> > > +		if (IS_ERR(sw_ctx->aead_send)) {
> > > +			rc = PTR_ERR(sw_ctx->aead_send);
> > > +			sw_ctx->aead_send = NULL;
> > > +			goto free_rec_seq;
> > > +		}
> > > +	}
> > > +
> > 
> > When I look at how you allocate the aead transformation, it seems
> > that you should either register an asynchronous callback with
> > aead_request_set_callback(), or request a synchronous algorithm.
> > 
> > Otherwise you will crash on an asynchronous crypto return, no?
> 
> The intention is for it to be synchronous, and gather directly from
> userspace buffers.  It looks like calling
> crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC) is the correct way
> to request synchronous algorithms only?
> 

Yes, that means the CRYPTO_ALG_ASYNC bit is required to be 0, i.e. the algorithm
must be synchronous.  Currently it's requesting either a synchronous or
asynchronous algorithm, and it will crash if it gets an async one.

Also I think even with a synchronous algorithm, tls_do_encryption() still needs
to call aead_request_set_callback(), passing NULL for the callback and data, so
that the request flags are initialized.
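
For reference, a sketch of both points against the call sites quoted above:

	sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
	...
	aead_request_set_callback(aead_req, 0, NULL, NULL);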

Eric

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 3/4] tls: kernel TLS support
  2017-07-11 18:53     ` Dave Watson
  2017-07-11 20:24       ` Eric Biggers
@ 2017-07-12  7:20       ` Steffen Klassert
  2017-07-12 18:34         ` Dave Watson
  1 sibling, 1 reply; 20+ messages in thread
From: Steffen Klassert @ 2017-07-12  7:20 UTC (permalink / raw)
  To: Dave Watson
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, nmav, fridolin.pokorny

On Tue, Jul 11, 2017 at 11:53:11AM -0700, Dave Watson wrote:
> On 07/11/17 08:29 AM, Steffen Klassert wrote:
> > Sorry for replying to old mail...
> > > +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
> > > +{
> > 
> > ...
> > 
> > > +
> > > +	if (!sw_ctx->aead_send) {
> > > +		sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0);
> > > +		if (IS_ERR(sw_ctx->aead_send)) {
> > > +			rc = PTR_ERR(sw_ctx->aead_send);
> > > +			sw_ctx->aead_send = NULL;
> > > +			goto free_rec_seq;
> > > +		}
> > > +	}
> > > +
> > 
> > When I look at how you allocate the aead transformation, it seems
> > that you should either register an asynchronous callback with
> > aead_request_set_callback(), or request a synchronous algorithm.
> > 
> > Otherwise you will crash on an asynchronous crypto return, no?
> 
> The intention is for it to be synchronous, and gather directly from
> userspace buffers.  It looks like calling
> crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC) is the correct way
> to request synchronous algorithms only?

Yes, but then you lose the aes-ni based algorithms because they are
asynchronous. If you want to have good crypto performance, it is
better to implement the asynchronous callbacks.

> 
> > Also, it seems that you have your scatterlists per crypto
> > transformation instead of per crypto request. Is this intentional?
> 
> We hold the socket lock and only one crypto op can happen at a time,
> so we reuse the scatterlists.

This is OK as long as the crypto happens synchronously. But as said above,
I think this is not what you want.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 3/4] tls: kernel TLS support
  2017-07-12  7:20       ` Steffen Klassert
@ 2017-07-12 18:34         ` Dave Watson
  0 siblings, 0 replies; 20+ messages in thread
From: Dave Watson @ 2017-07-12 18:34 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, netdev, Tom Herbert, herbert,
	linux-crypto, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, nmav, fridolin.pokorny

On 07/12/17 09:20 AM, Steffen Klassert wrote:
> On Tue, Jul 11, 2017 at 11:53:11AM -0700, Dave Watson wrote:
> > On 07/11/17 08:29 AM, Steffen Klassert wrote:
> > > Sorry for replying to old mail...
> > > > +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
> > > > +{
> > > 
> > > ...
> > > 
> > > > +
> > > > +	if (!sw_ctx->aead_send) {
> > > > +		sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0);
> > > > +		if (IS_ERR(sw_ctx->aead_send)) {
> > > > +			rc = PTR_ERR(sw_ctx->aead_send);
> > > > +			sw_ctx->aead_send = NULL;
> > > > +			goto free_rec_seq;
> > > > +		}
> > > > +	}
> > > > +
> > > 
> > > When I look at how you allocate the aead transformation, it seems
> > > that you should either register an asynchronous callback with
> > > aead_request_set_callback(), or request a synchronous algorithm.
> > > 
> > > Otherwise you will crash on an asynchronous crypto return, no?
> > 
> > The intention is for it to be synchronous, and gather directly from
> > userspace buffers.  It looks like calling
> > crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC) is the correct way
> > to request synchronous algorithms only?
> 
> Yes, but then you lose the aes-ni based algorithms because they are
> asynchronous. If you want to have good crypto performance, it is
> better to implement the asynchronous callbacks.

Right, the trick is we want both aesni, and to guarantee that we are
done using the input buffers before sendmsg() returns.  For now I can
set a callback, and wait on a completion.  The initial use case of
userspace openssl integration shouldn't hit the aesni async case
anyway (!irq_fpu_usable()).
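
A minimal sketch of that callback-plus-completion approach (the
tls_crypto_wait/tls_crypto_done names are made up for illustration):

	struct tls_crypto_wait {
		struct completion comp;
		int rc;
	};

	static void tls_crypto_done(struct crypto_async_request *req, int err)
	{
		struct tls_crypto_wait *w = req->data;

		if (err == -EINPROGRESS)
			return;	/* backlogged request was just started */
		w->rc = err;
		complete(&w->comp);
	}

	...
		struct tls_crypto_wait wait;

		init_completion(&wait.comp);
		aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
					  tls_crypto_done, &wait);
		rc = crypto_aead_encrypt(aead_req);
		if (rc == -EINPROGRESS || rc == -EBUSY) {
			wait_for_completion(&wait.comp);
			rc = wait.rc;
		}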

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 1/4] tcp: ULP infrastructure
  2017-06-14 18:37 ` [PATCH v3 net-next 1/4] tcp: ULP infrastructure Dave Watson
  2017-06-17  0:14   ` Christoph Paasch
  2017-06-25  2:42   ` Levin, Alexander (Sasha Levin)
@ 2017-07-29 20:12   ` Tom Herbert
  2017-07-31 22:16     ` Dave Watson
  2 siblings, 1 reply; 20+ messages in thread
From: Tom Herbert @ 2017-07-29 20:12 UTC (permalink / raw)
  To: Dave Watson
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, Linux Kernel Network Developers,
	Herbert Xu, Linux Crypto Mailing List, Hannes Frederic Sowa,
	Eric Dumazet, Alexei Starovoitov, Nikos Mavrogiannopoulos,
	Fridolín Pokorný

On Wed, Jun 14, 2017 at 11:37 AM, Dave Watson <davejwatson@fb.com> wrote:
> Add the infrastructure for attaching Upper Layer Protocols (ULPs) over TCP
> sockets. Based on a similar infrastructure in tcp_cong.  The idea is that any
> ULP can add its own logic by changing the TCP proto_ops structure to its own
> methods.
>
> Example usage:
>
> setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
>
Hi Dave,

Thanks for this work. I think this functionality is going to have many uses!

One question: is there a good reason why the ULP infrastructure should
just be for TCP sockets? For example, I'd really like to be able to do
something like:

setsockopt(sock, SOL_SOCKET, SO_ULP, &ulp_param, sizeof(ulp_param));

Where ulp_param is a structure containing the ULP name as well as some
ULP specific parameters that are passed to init_ulp. ulp_init could
determine whether the socket family is appropriate for the ULP being
requested.
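
A hypothetical shape for ulp_param, just to make the idea concrete (none
of these names exist anywhere yet):

	struct ulp_param {
		char		ulp_name[ULP_NAME_MAX];	/* e.g. "tls" */
		__u32		ulp_flags;
		unsigned char	ulp_data[];		/* passed to init_ulp */
	};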

Thanks,
Tom

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 1/4] tcp: ULP infrastructure
  2017-06-17  0:14   ` Christoph Paasch
@ 2017-07-29 20:19     ` Tom Herbert
  0 siblings, 0 replies; 20+ messages in thread
From: Tom Herbert @ 2017-07-29 20:19 UTC (permalink / raw)
  To: Christoph Paasch
  Cc: Dave Watson, Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny,
	Liran Liss, Matan Barak, David Miller,
	Linux Kernel Network Developers, Herbert Xu,
	Linux Crypto Mailing List, Hannes Frederic Sowa, Eric Dumazet,
	Alexei Starovoitov, Nikos Mavrogiannopoulos,
	Fridolín Pokorný

On Fri, Jun 16, 2017 at 5:14 PM, Christoph Paasch
<christoph.paasch@gmail.com> wrote:
> Hello,
>
> On 14/06/17 - 11:37:14, Dave Watson wrote:
>> Add the infrastructure for attaching Upper Layer Protocols (ULPs) over TCP
>> sockets. Based on a similar infrastructure in tcp_cong.  The idea is that any
>> ULP can add its own logic by changing the TCP proto_ops structure to its own
>> methods.
>>
>> Example usage:
>>
>> setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
>>
>> modules will call:
>> tcp_register_ulp(&tcp_tls_ulp_ops);
>>
>> to register/unregister their ulp, with an init function and name.
>>
>> A list of registered ulps will be returned by tcp_get_available_ulp, which is
>> hooked up to /proc.  Example:
>>
>> $ cat /proc/sys/net/ipv4/tcp_available_ulp
>> tls
>>
>> There is currently no functionality to remove or chain ULPs, but
>> it should be possible to add these in the future if needed.
>>
>> Signed-off-by: Boris Pismenny <borisp@mellanox.com>
>> Signed-off-by: Dave Watson <davejwatson@fb.com>
>> ---
>>  include/net/inet_connection_sock.h |   4 ++
>>  include/net/tcp.h                  |  25 +++++++
>>  include/uapi/linux/tcp.h           |   1 +
>>  net/ipv4/Makefile                  |   2 +-
>>  net/ipv4/sysctl_net_ipv4.c         |  25 +++++++
>>  net/ipv4/tcp.c                     |  28 ++++++++
>>  net/ipv4/tcp_ipv4.c                |   2 +
>>  net/ipv4/tcp_ulp.c                 | 134 +++++++++++++++++++++++++++++++++++++
>>  8 files changed, 220 insertions(+), 1 deletion(-)
>>  create mode 100644 net/ipv4/tcp_ulp.c
>
> I know I'm pretty late to the game (and maybe this has already been
> discussed but I couldn't find anything in the archives), but I am wondering
> what the take is on potential races of the setsockopt() vs other system-calls.
>
> For example one might race the setsockopt() with a sendmsg() and the sendmsg
> might end up blocking on the lock_sock in tcp_sendmsg, waiting for
> tcp_set_ulp() to finish changing sk_prot. When the setsockopt() finishes, we
> are then inside tcp_sendmsg() coming directly from sendmsg(), while we
> should have been in the ULP's sendmsg.
>
> It seems like TLS-ULP is resilient to this (or at least, won't cause a panic),
> but there might be more exotic users of ULP in the future, that change other
> callbacks and then things might go wrong.

Christoph,

I noticed this also. I think the easiest answer would be to just
assume the caller understands the race condition and can synchronize
itself. Other than that, we'd probably have to wake up everyone blocking
on the socket with something like EAGAIN so they're forced to retry
the operation. But even that might not easily yield an obvious point
at which the socket can be safely changed.

Tom

>
>
> Thoughts?
>
>
> Thanks,
> Christoph
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 1/4] tcp: ULP infrastructure
  2017-07-29 20:12   ` Tom Herbert
@ 2017-07-31 22:16     ` Dave Watson
  2017-08-01 18:27       ` Tom Herbert
  0 siblings, 1 reply; 20+ messages in thread
From: Dave Watson @ 2017-07-31 22:16 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, Linux Kernel Network Developers,
	Herbert Xu, Linux Crypto Mailing List, Hannes Frederic Sowa,
	Eric Dumazet, Alexei Starovoitov, Nikos Mavrogiannopoulos,
	Fridolín Pokorný

On 07/29/17 01:12 PM, Tom Herbert wrote:
> On Wed, Jun 14, 2017 at 11:37 AM, Dave Watson <davejwatson@fb.com> wrote:
> > Add the infrastructure for attaching Upper Layer Protocols (ULPs) over TCP
> > sockets. Based on a similar infrastructure in tcp_cong.  The idea is that any
> > ULP can add its own logic by changing the TCP proto_ops structure to its own
> > methods.
> >
> > Example usage:
> >
> > setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
> >
> One question: is there a good reason why the ULP infrastructure should
> just be for TCP sockets? For example, I'd really like to be able to do
> something like:
> 
> setsockopt(sock, SOL_SOCKET, SO_ULP, &ulp_param, sizeof(ulp_param));
> 
> Where ulp_param is a structure containing the ULP name as well as some
> ULP specific parameters that are passed to init_ulp. ulp_init could
> determine whether the socket family is appropriate for the ULP being
> requested.

Using SOL_SOCKET instead seems reasonable to me.  I can see how
ulp_params could have some use, perhaps at a slight loss in clarity.
TLS needs its own setsockopts anyway though, for renegotiation, for
example.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 net-next 1/4] tcp: ULP infrastructure
  2017-07-31 22:16     ` Dave Watson
@ 2017-08-01 18:27       ` Tom Herbert
  0 siblings, 0 replies; 20+ messages in thread
From: Tom Herbert @ 2017-08-01 18:27 UTC (permalink / raw)
  To: Dave Watson
  Cc: Ilya Lesokhin, Aviad Yehezkel, Boris Pismenny, Liran Liss,
	Matan Barak, David Miller, Linux Kernel Network Developers,
	Herbert Xu, Linux Crypto Mailing List, Hannes Frederic Sowa,
	Eric Dumazet, Alexei Starovoitov, Nikos Mavrogiannopoulos,
	Fridolín Pokorný

On Mon, Jul 31, 2017 at 3:16 PM, Dave Watson <davejwatson@fb.com> wrote:
> On 07/29/17 01:12 PM, Tom Herbert wrote:
>> On Wed, Jun 14, 2017 at 11:37 AM, Dave Watson <davejwatson@fb.com> wrote:
>> > Add the infrastructure for attaching Upper Layer Protocols (ULPs) over TCP
>> > sockets. Based on a similar infrastructure in tcp_cong.  The idea is that any
>> > ULP can add its own logic by changing the TCP proto_ops structure to its own
>> > methods.
>> >
>> > Example usage:
>> >
>> > setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
>> >
>> One question: is there a good reason why the ULP infrastructure should
>> just be for TCP sockets? For example, I'd really like to be able to do
>> something like:
>>
>> setsockopt(sock, SOL_SOCKET, SO_ULP, &ulp_param, sizeof(ulp_param));
>>
>> Where ulp_param is a structure containing the ULP name as well as some
>> ULP specific parameters that are passed to init_ulp. ulp_init could
>> determine whether the socket family is appropriate for the ULP being
>> requested.
>
> Using SOL_SOCKET instead seems reasonable to me.  I can see how
> ulp_params could have some use, perhaps at a slight loss in clarity.
> TLS needs its own setsockopts anyway though, for renegotiation, for
> example.

I'll post the changes shortly. The reason to include parameters with
the setsockopt is so that we can push the ULP and start operations in
one shot.

Tom

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2017-08-01 18:27 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <cover.1497465295.git.davejwatson@fb.com>
2017-06-14 18:37 ` [PATCH v3 net-next 1/4] tcp: ULP infrastructure Dave Watson
2017-06-17  0:14   ` Christoph Paasch
2017-07-29 20:19     ` Tom Herbert
2017-06-25  2:42   ` Levin, Alexander (Sasha Levin)
2017-06-26 14:30     ` Dave Watson
2017-06-26 15:07       ` Levin, Alexander (Sasha Levin)
2017-07-29 20:12   ` Tom Herbert
2017-07-31 22:16     ` Dave Watson
2017-08-01 18:27       ` Tom Herbert
2017-06-14 18:37 ` [PATCH v3 net-next 2/4] tcp: export do_tcp_sendpages and tcp_rate_check_app_limited functions Dave Watson
2017-06-14 18:37 ` [PATCH v3 net-next 3/4] tls: kernel TLS support Dave Watson
2017-06-16 20:56   ` Stephen Hemminger
2017-06-16 20:58   ` Stephen Hemminger
2017-06-17  0:35     ` Dave Watson
2017-07-11  6:29   ` Steffen Klassert
2017-07-11 18:53     ` Dave Watson
2017-07-11 20:24       ` Eric Biggers
2017-07-12  7:20       ` Steffen Klassert
2017-07-12 18:34         ` Dave Watson
2017-06-14 18:37 ` [PATCH v3 net-next 4/4] tls: Documentation Dave Watson
