From: Peter Krystad Override the bind() and connect() methods of the MPTCP socket so they may act on the subflow socket and use the .sk_rx_dst_set() handler in the subflow proto to capture when the responding SYN-ACK is received. Signed-off-by: Peter Krystad --- include/net/mptcp.h | 46 +++++++++++- net/mptcp/Makefile | 2 +- net/mptcp/protocol.c | 151 ++++++++++++++++++++++++++++++++++++---- net/mptcp/subflow.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 376 insertions(+), 15 deletions(-) create mode 100644 net/mptcp/subflow.c diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 6b4ae84ddf38..981d782cab9a 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -37,7 +37,10 @@ struct mptcp_sock { /* inet_connection_sock must be the first member */ struct inet_connection_sock sk; - struct socket *subflow; + u64 local_key; + u64 remote_key; + struct socket *connection_list; /* @@ needs to be a list */ + struct socket *subflow; /* outgoing connect, listener or !mp_capable */ }; static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) @@ -45,6 +48,42 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) return (struct mptcp_sock *)sk; } +/* MPTCP subflow sock structure */ +struct subflow_sock { + /* tcp_sock must be the first member */ + struct tcp_sock sk; + u64 local_key; + u64 remote_key; + bool request_mptcp; // send MP_CAPABLE + bool checksum; + bool version; + bool mp_capable; // remote is MPTCP capable + bool fourth_ack; // send initial DSS + struct sock *conn; // parent mptcp_sock +}; + +static inline struct subflow_sock *subflow_sk(const struct sock *sk) +{ + return (struct subflow_sock *)sk; +} + +struct subflow_request_sock { + struct tcp_request_sock sk; + u8 mp_capable : 1, + mp_join : 1, + checksum : 1, + backup : 1, + version : 4; + u64 local_key; + u64 remote_key; +}; + +static inline +struct subflow_request_sock *subflow_rsk(const struct request_sock *rsk) +{ + return (struct subflow_request_sock *)rsk; +} + #ifdef CONFIG_MPTCP void mptcp_parse_option(const unsigned char *ptr, int opsize, @@ -53,6 +92,11 @@ unsigned int mptcp_syn_options(struct sock *sk, u64 *local_key); unsigned int mptcp_synack_options(struct request_sock *req, u64 *local_key, u64 *remote_key); +void mptcp_finish_connect(struct sock *sk, int mp_capable); + +int mptcp_subflow_init(void); +void mptcp_subflow_exit(void); + void mptcp_get_options(const struct sk_buff *skb, struct tcp_options_received *options); diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 2bd18e3b9fda..3f0e7163fe80 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_MPTCP) += mptcp.o -mptcp-y := protocol.o options.o +mptcp-y := protocol.o subflow.o options.o diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 393d214fe531..368854740333 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -20,11 +20,13 @@ #include #include #include +#include #include static int mptcp_connect(struct sock *sk, struct sockaddr *saddr, int len) { struct mptcp_sock *msk = mptcp_sk(sk); + struct subflow_sock *subflow = subflow_sk(msk->subflow->sk); int err; saddr->sa_family = AF_INET; @@ -32,18 +34,28 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *saddr, int len) pr_debug("msk=%p, subflow=%p", msk, msk->subflow->sk); err = kernel_connect(msk->subflow, saddr, len, 0); - - sk->sk_state = TCP_ESTABLISHED; - + pr_debug("mp_capable=%d", subflow->mp_capable); + if (!err) { + msk->remote_key = subflow->remote_key; + msk->local_key = subflow->local_key; + msk->connection_list = msk->subflow; + msk->subflow = NULL; + } return err; } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *subflow = msk->subflow; + struct socket *subflow; - pr_debug("subflow=%p", subflow->sk); + if (msk->connection_list) { + subflow = msk->connection_list; + pr_debug("conn_list->subflow=%p", subflow->sk); + } else { + subflow = msk->subflow; + pr_debug("subflow=%p", subflow->sk); + } return sock_sendmsg(subflow, msg); } @@ -52,9 +64,15 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *subflow = msk->subflow; + struct socket *subflow; - pr_debug("subflow=%p", subflow->sk); + if (msk->connection_list) { + subflow = msk->connection_list; + pr_debug("conn_list->subflow=%p", subflow->sk); + } else { + subflow = msk->subflow; + pr_debug("subflow=%p", subflow->sk); + } return sock_recvmsg(subflow, msg, flags); } @@ -67,11 +85,19 @@ static int mptcp_init_sock(struct sock *sk) pr_debug("msk=%p", msk); - err = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, + err = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, IPPROTO_SUBFLOW, &sf); if (!err) { - pr_debug("subflow=%p", sf->sk); + struct subflow_sock *subflow = subflow_sk(sf->sk); + + pr_debug("subflow=%p", subflow); + msk->local_key = 1234567887654321; // @@ fixed for now msk->subflow = sf; + subflow->conn = sk; + subflow->request_mptcp = 1; // @@ if MPTCP enabled + subflow->checksum = 1; // @@ if checksum enabled + subflow->version = 0; + subflow->local_key = msk->local_key; } return err; @@ -85,6 +111,60 @@ static void mptcp_close(struct sock *sk, long timeout) pr_debug("subflow=%p", msk->subflow->sk); sock_release(msk->subflow); } + + if (msk->connection_list) { + pr_debug("conn_list->subflow=%p", msk->connection_list->sk); + sock_release(msk->connection_list); + } +} + +static int mptcp_get_port(struct sock *sk, unsigned short snum) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sock *subflow = msk->subflow->sk; + + pr_debug("msk=%p, subflow=%p", sk, subflow); + + return inet_csk_get_port(subflow, snum); +} + +int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *subflow = msk->subflow; + + pr_debug("msk=%p, subflow=%p", msk, subflow->sk); + + return inet_bind(subflow, uaddr, addr_len); +} + +void mptcp_finish_connect(struct sock *sk, int mp_capable) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct subflow_sock *subflow = subflow_sk(msk->subflow->sk); + + pr_debug("msk=%p", msk); + + if (mp_capable) { + msk->remote_key = subflow->remote_key; + msk->local_key = subflow->local_key; + msk->connection_list = msk->subflow; + msk->subflow = NULL; + } + sk->sk_state = TCP_ESTABLISHED; +} + +int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + int err; + + pr_debug("msk=%p, subflow=%p", msk, msk->subflow->sk); + + err = inet_stream_connect(msk->subflow, uaddr, addr_len, flags); + + return err; } static struct proto mptcp_prot = { @@ -99,35 +179,80 @@ static struct proto mptcp_prot = { .recvmsg = mptcp_recvmsg, .hash = inet_hash, .unhash = inet_unhash, - .get_port = inet_csk_get_port, + .get_port = mptcp_get_port, .obj_size = sizeof(struct mptcp_sock), .no_autobind = 1, }; +const struct proto_ops mptcp_stream_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = mptcp_bind, + .connect = mptcp_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet_getname, + .poll = tcp_poll, + .ioctl = inet_ioctl, + .listen = inet_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, + .splice_read = tcp_splice_read, + .read_sock = tcp_read_sock, + .peek_len = tcp_peek_len, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, + .compat_ioctl = inet_compat_ioctl, +#endif +}; + static struct inet_protosw mptcp_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_prot, - .ops = &inet_stream_ops, + .ops = &mptcp_stream_ops, + .flags = INET_PROTOSW_ICSK, }; static int __init mptcp_init(void) { int err; - err = proto_register(&mptcp_prot, 1); + mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; + + err = mptcp_subflow_init(); if (err) - return err; + goto subflow_failed; + + err = proto_register(&mptcp_prot, 1); + if (err) { + goto proto_failed; + } inet_register_protosw(&mptcp_protosw); return 0; + +proto_failed: + mptcp_subflow_exit(); + +subflow_failed: + return err; } static void __exit mptcp_exit(void) { inet_unregister_protosw(&mptcp_protosw); proto_unregister(&mptcp_prot); + + mptcp_subflow_exit(); } module_init(mptcp_init); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c new file mode 100644 index 000000000000..5e5fdcb3175f --- /dev/null +++ b/net/mptcp/subflow.c @@ -0,0 +1,192 @@ +/* + * Multipath TCP + * + * Copyright (c) 2017, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int subflow_connect(struct sock *sk, struct sockaddr *saddr, int len) +{ + struct subflow_sock *subflow = subflow_sk(sk); + + saddr->sa_family = AF_INET; // @@ presume IPv4 for now + + pr_debug("subflow=%p", subflow); + + return tcp_v4_connect(sk, saddr, len); +} + +static int subflow_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) +{ + struct subflow_sock *subflow = subflow_sk(sk); + + pr_debug("subflow=%p", subflow); + + return tcp_sendmsg(sk, msg, len); +} + +static int subflow_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct subflow_sock *subflow = subflow_sk(sk); + + pr_debug("subflow=%p", subflow); + + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); +} + +static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) +{ + struct subflow_sock *subflow = subflow_sk(sk); + + inet_sk_rx_dst_set(sk, skb); + + pr_debug("subflow=%p", subflow); + + if (subflow->conn) { + pr_debug("remote_key=%llu", subflow->remote_key); + mptcp_finish_connect(subflow->conn, subflow->mp_capable); + subflow->conn = NULL; + } +} + +const struct inet_connection_sock_af_ops subflow_specific = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = subflow_finish_connect, + .conn_request = tcp_v4_conn_request, + .syn_recv_sock = tcp_v4_syn_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .addr2sockaddr = inet_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in), +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ip_setsockopt, + .compat_getsockopt = compat_ip_getsockopt, +#endif + .mtu_reduced = tcp_v4_mtu_reduced, +}; + +static int subflow_init_sock(struct sock *sk) +{ + struct subflow_sock *subflow = subflow_sk(sk); + struct tcp_sock *tsk = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int err; + + pr_debug("subflow=%p", subflow); + + err = tcp_v4_init_sock(sk); + if (!err) { // @@ AND mptcp is enabled + tsk->is_mptcp = 1; + icsk->icsk_af_ops = &subflow_specific; + } + + return err; +} + +static void subflow_close(struct sock *sk, long timeout) +{ + pr_debug("subflow=%p", sk); + + tcp_close(sk, timeout); +} + +static void subflow_destroy(struct sock *sk) +{ + pr_debug("subflow=%p", sk); + + tcp_v4_destroy_sock(sk); +} + +static struct proto subflow_prot = { + .name = "SUBFLOW", + .owner = THIS_MODULE, + .close = subflow_close, + .connect = subflow_connect, + .disconnect = tcp_disconnect, + .accept = inet_csk_accept, + .ioctl = tcp_ioctl, + .init = subflow_init_sock, + .destroy = subflow_destroy, + .shutdown = tcp_shutdown, + .keepalive = tcp_set_keepalive, + .recvmsg = subflow_recvmsg, + .sendmsg = subflow_sendmsg, + .sendpage = tcp_sendpage, + .backlog_rcv = tcp_v4_do_rcv, + .release_cb = tcp_release_cb, + .hash = inet_hash, + .unhash = inet_unhash, + .get_port = inet_csk_get_port, + .enter_memory_pressure = tcp_enter_memory_pressure, + .stream_memory_free = tcp_stream_memory_free, + .sockets_allocated = &tcp_sockets_allocated, + .orphan_count = &tcp_orphan_count, + .memory_allocated = &tcp_memory_allocated, + .memory_pressure = &tcp_memory_pressure, + .sysctl_mem = sysctl_tcp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), + .max_header = MAX_TCP_HEADER, + .obj_size = sizeof(struct subflow_sock), + .slab_flags = SLAB_TYPESAFE_BY_RCU, + + .no_autobind = true, +}; + +static struct inet_protosw subflow_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_SUBFLOW, + .prot = &subflow_prot, + .ops = &inet_stream_ops, + .flags = INET_PROTOSW_ICSK, +}; + +int mptcp_subflow_init(void) +{ + int err = -ENOMEM; + + /* TODO: Register path manager callbacks. */ + + subflow_prot.twsk_prot = tcp_prot.twsk_prot; + subflow_prot.h.hashinfo = tcp_prot.h.hashinfo; + err = proto_register(&subflow_prot, 1); + if (err) + goto fail; + + inet_register_protosw(&subflow_protosw); + + return 0; + +fail: + return err; +} + +void mptcp_subflow_exit(void) +{ + inet_unregister_protosw(&subflow_protosw); + proto_unregister(&subflow_prot); +} + +MODULE_LICENSE("GPL"); -- 2.16.3