From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andy Zhou Subject: Re: [net-next 02/10] udp: Expand UDP tunnel common APIs Date: Thu, 24 Jul 2014 13:54:09 -0700 Message-ID: References: <1406024393-6778-1-git-send-email-azhou@nicira.com> <1406024393-6778-3-git-send-email-azhou@nicira.com> Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Cc: David Miller , Linux Netdev List To: Tom Herbert Return-path: Received: from na3sys009aog113.obsmtp.com ([74.125.149.209]:53767 "HELO na3sys009aog113.obsmtp.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with SMTP id S934176AbaGXUyM (ORCPT ); Thu, 24 Jul 2014 16:54:12 -0400 Received: by mail-oa0-f52.google.com with SMTP id o6so4416151oag.39 for ; Thu, 24 Jul 2014 13:54:09 -0700 (PDT) In-Reply-To: Sender: netdev-owner@vger.kernel.org List-ID: On Thu, Jul 24, 2014 at 1:47 PM, Tom Herbert wrote: > On Thu, Jul 24, 2014 at 1:23 PM, Andy Zhou wrote: >> The general layering I see is tunnel_user (i.e. OVS) -> tunnel_driver >> (i.e. vxlan) -> udp_tunnel. >> > Simpler and more efficient if you stick with UDP->UDP_encap_handler as > the most general model for RX. I believe this is the case now. I don't plan to change this. Just not exposing the higher layer callback to the udp_tunnel layer. > >> The two receive functions are from two separate layers above >> udp_tunnel. I can restructure the APIs to make it >> cleaner. >> > The only necessary function for opening the UDP encap port is the UDP > receive handler (encap receive). If you want to implement more > indirection within your handler then it should be pretty easy to > create another layer of API for that purpose. > Yes, this is the direction I am going towards. >> On Wed, Jul 23, 2014 at 12:57 PM, Tom Herbert wrote: >>> On Tue, Jul 22, 2014 at 3:19 AM, Andy Zhou wrote: >>>> Added create_udp_tunnel_socket(), packet receive and transmit, and >>>> other related common functions for UDP tunnels. 
>>>> >>>> Per net open UDP tunnel ports are tracked in this common layer to >>>> prevent sharing of a single port with more than one UDP tunnel. >>>> >>>> Signed-off-by: Andy Zhou >>>> --- >>>> include/net/udp_tunnel.h | 57 +++++++++- >>>> net/ipv4/udp_tunnel.c | 257 +++++++++++++++++++++++++++++++++++++++++++++- >>>> 2 files changed, 312 insertions(+), 2 deletions(-) >>>> >>>> diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h >>>> index 3f34c65..b5e815a 100644 >>>> --- a/include/net/udp_tunnel.h >>>> +++ b/include/net/udp_tunnel.h >>>> @@ -1,7 +1,10 @@ >>>> #ifndef __NET_UDP_TUNNEL_H >>>> #define __NET_UDP_TUNNEL_H >>>> >>>> -#define UDP_TUNNEL_TYPE_VXLAN 0x01 >>>> +#include >>>> + >>>> +#define UDP_TUNNEL_TYPE_VXLAN 0x01 >>>> +#define UDP_TUNNEL_TYPE_GENEVE 0x02 >>>> >>>> struct udp_port_cfg { >>>> u8 family; >>>> @@ -28,7 +31,59 @@ struct udp_port_cfg { >>>> use_udp6_rx_checksums:1; >>>> }; >>>> >>>> +struct udp_tunnel_sock; >>>> + >>>> +typedef void (udp_tunnel_rcv_t)(struct udp_tunnel_sock *uts, >>>> + struct sk_buff *skb, ...); >>>> + >>>> +typedef int (udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); >>>> + >>>> +struct udp_tunnel_socket_cfg { >>>> + u8 tunnel_type; >>>> + struct udp_port_cfg port; >>>> + udp_tunnel_rcv_t *rcv; >>>> + udp_tunnel_encap_rcv_t *encap_rcv; >>> >>> Why do you need two receive functions or udp_tunnel_rcv_t? >>> >>>> + void *data; >>> >>> Similarly, why is this needed when we already have sk_user_data? 
>>> >>>> +}; >>>> + >>>> +struct udp_tunnel_sock { >>>> + u8 tunnel_type; >>>> + struct hlist_node hlist; >>>> + udp_tunnel_rcv_t *rcv; >>>> + void *data; >>>> + struct socket *sock; >>>> +}; >>>> + >>>> int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, >>>> struct socket **sockp); >>>> >>>> +struct udp_tunnel_sock *create_udp_tunnel_socket(struct net *net, size_t size, >>>> + struct udp_tunnel_socket_cfg >>>> + *socket_cfg); >>>> + >>>> +struct udp_tunnel_sock *udp_tunnel_find_sock(struct net *net, __be16 port); >>>> + >>>> +int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt, >>>> + struct sk_buff *skb, __be32 src, __be32 dst, >>>> + __u8 tos, __u8 ttl, __be16 df, __be16 src_port, >>>> + __be16 dst_port, bool xnet); >>>> + >>>> +#if IS_ENABLED(CONFIG_IPV6) >>>> +int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst, >>>> + struct sk_buff *skb, struct net_device *dev, >>>> + struct in6_addr *saddr, struct in6_addr *daddr, >>>> + __u8 prio, __u8 ttl, __be16 src_port, __be16 dst_port); >>>> + >>>> +#endif >>>> + >>>> +void udp_tunnel_sock_release(struct udp_tunnel_sock *uts); >>>> +void udp_tunnel_get_rx_port(struct net_device *dev); >>>> + >>>> +static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, >>>> + bool udp_csum) >>>> +{ >>>> + int type = udp_csum ? 
SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; >>>> + >>>> + return iptunnel_handle_offloads(skb, udp_csum, type); >>>> +} >>>> #endif >>>> diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c >>>> index 61ec1a6..3c14b16 100644 >>>> --- a/net/ipv4/udp_tunnel.c >>>> +++ b/net/ipv4/udp_tunnel.c >>>> @@ -7,6 +7,23 @@ >>>> #include >>>> #include >>>> #include >>>> +#include >>>> +#if IS_ENABLED(CONFIG_IPV6) >>>> +#include >>>> +#include >>>> +#include >>>> +#include >>>> +#endif >>>> + >>>> +#define PORT_HASH_BITS 8 >>>> +#define PORT_HASH_SIZE (1 << PORT_HASH_BITS) >>>> + >>>> +static int udp_tunnel_net_id; >>>> + >>>> +struct udp_tunnel_net { >>>> + struct hlist_head sock_list[PORT_HASH_SIZE]; >>>> + spinlock_t sock_lock; /* Protecting the sock_list */ >>>> +}; >>>> >>>> int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, >>>> struct socket **sockp) >>>> @@ -82,7 +99,6 @@ int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, >>>> return -EPFNOSUPPORT; >>>> } >>>> >>>> - >>>> *sockp = sock; >>>> >>>> return 0; >>>> @@ -97,4 +113,243 @@ error: >>>> } >>>> EXPORT_SYMBOL(udp_sock_create); >>>> >>>> + >>>> +/* Socket hash table head */ >>>> +static inline struct hlist_head *uts_head(struct net *net, const __be16 port) >>>> +{ >>>> + struct udp_tunnel_net *utn = net_generic(net, udp_tunnel_net_id); >>>> + >>>> + return &utn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; >>>> +} >>>> + >>>> +static int handle_offloads(struct sk_buff *skb) >>>> +{ >>>> + if (skb_is_gso(skb)) { >>>> + int err = skb_unclone(skb, GFP_ATOMIC); >>>> + >>>> + if (unlikely(err)) >>>> + return err; >>>> + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; >>>> + } else { >>>> + if (skb->ip_summed != CHECKSUM_PARTIAL) >>>> + skb->ip_summed = CHECKSUM_NONE; >>>> + } >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +struct udp_tunnel_sock *create_udp_tunnel_socket(struct net *net, size_t size, >>>> + struct udp_tunnel_socket_cfg >>>> + *cfg) >>>> +{ >>>> + struct udp_tunnel_net 
*utn = net_generic(net, udp_tunnel_net_id); >>>> + struct udp_tunnel_sock *uts; >>>> + struct socket *sock; >>>> + struct sock *sk; >>>> + const __be16 port = cfg->port.local_udp_port; >>>> + const int ipv6 = (cfg->port.family == AF_INET6); >>>> + int err; >>>> + >>>> + uts = kzalloc(size, GFP_KERNEL); >>>> + if (!uts) >>>> + return ERR_PTR(-ENOMEM); >>>> + >>>> + err = udp_sock_create(net, &cfg->port, &sock); >>>> + if (err < 0) { >>>> + kfree(uts); >>>> + return NULL; >>>> + } >>>> + >>>> + /* Disable multicast loopback */ >>>> + inet_sk(sock->sk)->mc_loop = 0; >>>> + >>>> + uts->sock = sock; >>>> + sk = sock->sk; >>>> + uts->rcv = cfg->rcv; >>>> + uts->data = cfg->data; >>>> + rcu_assign_sk_user_data(sock->sk, uts); >>>> + >>>> + spin_lock(&utn->sock_lock); >>>> + hlist_add_head_rcu(&uts->hlist, uts_head(net, port)); >>>> + spin_unlock(&utn->sock_lock); >>>> + >>>> + udp_sk(sk)->encap_type = 1; >>>> + udp_sk(sk)->encap_rcv = cfg->encap_rcv; >>>> + >>>> +#if IS_ENABLED(CONFIG_IPV6) >>>> + if (ipv6) >>>> + ipv6_stub->udpv6_encap_enable(); >>>> + else >>>> +#endif >>>> + udp_encap_enable(); >>>> + >>>> + return uts; >>>> +} >>>> +EXPORT_SYMBOL_GPL(create_udp_tunnel_socket); >>>> + >>>> +int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt, >>>> + struct sk_buff *skb, __be32 src, __be32 dst, >>>> + __u8 tos, __u8 ttl, __be16 df, __be16 src_port, >>>> + __be16 dst_port, bool xnet) >>>> +{ >>>> + struct udphdr *uh; >>>> + >>>> + __skb_push(skb, sizeof(*uh)); >>>> + skb_reset_transport_header(skb); >>>> + uh = udp_hdr(skb); >>>> + >>>> + uh->dest = dst_port; >>>> + uh->source = src_port; >>>> + uh->len = htons(skb->len); >>>> + >>>> + udp_set_csum(sock->sk->sk_no_check_tx, skb, src, dst, skb->len); >>>> + >>>> + return iptunnel_xmit(sock->sk, rt, skb, src, dst, IPPROTO_UDP, >>>> + tos, ttl, df, xnet); >>>> +} >>>> +EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); >>>> + >>>> +#if IS_ENABLED(CONFIG_IPV6) >>>> +int udp_tunnel6_xmit_skb(struct socket *sock, struct 
dst_entry *dst, >>>> + struct sk_buff *skb, struct net_device *dev, >>>> + struct in6_addr *saddr, struct in6_addr *daddr, >>>> + __u8 prio, __u8 ttl, __be16 src_port, __be16 dst_port) >>>> +{ >>>> + struct udphdr *uh; >>>> + struct ipv6hdr *ip6h; >>>> + int err; >>>> + >>>> + __skb_push(skb, sizeof(*uh)); >>>> + skb_reset_transport_header(skb); >>>> + uh = udp_hdr(skb); >>>> + >>>> + uh->dest = dst_port; >>>> + uh->source = src_port; >>>> + >>>> + uh->len = htons(skb->len); >>>> + uh->check = 0; >>>> + >>>> + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); >>>> + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED >>>> + | IPSKB_REROUTED); >>>> + skb_dst_set(skb, dst); >>>> + >>>> + if (!skb_is_gso(skb) && !(dst->dev->features & NETIF_F_IPV6_CSUM)) { >>>> + __wsum csum = skb_checksum(skb, 0, skb->len, 0); >>>> + >>>> + skb->ip_summed = CHECKSUM_UNNECESSARY; >>>> + uh->check = csum_ipv6_magic(saddr, daddr, skb->len, >>>> + IPPROTO_UDP, csum); >>>> + if (uh->check == 0) >>>> + uh->check = CSUM_MANGLED_0; >>>> + } else { >>>> + skb->ip_summed = CHECKSUM_PARTIAL; >>>> + skb->csum_start = skb_transport_header(skb) - skb->head; >>>> + skb->csum_offset = offsetof(struct udphdr, check); >>>> + uh->check = ~csum_ipv6_magic(saddr, daddr, >>>> + skb->len, IPPROTO_UDP, 0); >>>> + } >>>> + >>>> + __skb_push(skb, sizeof(*ip6h)); >>>> + skb_reset_network_header(skb); >>>> + ip6h = ipv6_hdr(skb); >>>> + ip6h->version = 6; >>>> + ip6h->priority = prio; >>>> + ip6h->flow_lbl[0] = 0; >>>> + ip6h->flow_lbl[1] = 0; >>>> + ip6h->flow_lbl[2] = 0; >>>> + ip6h->payload_len = htons(skb->len); >>>> + ip6h->nexthdr = IPPROTO_UDP; >>>> + ip6h->hop_limit = ttl; >>>> + ip6h->daddr = *daddr; >>>> + ip6h->saddr = *saddr; >>>> + >>>> + err = handle_offloads(skb); >>>> + if (err) >>>> + return err; >>>> + >>>> + ip6tunnel_xmit(skb, dev); >>>> + return 0; >>>> +} >>>> +EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); >>>> +#endif >>>> + >>>> +struct udp_tunnel_sock 
*udp_tunnel_find_sock(struct net *net, __be16 port) >>>> +{ >>>> + struct udp_tunnel_sock *uts; >>>> + >>>> + hlist_for_each_entry_rcu(uts, uts_head(net, port), hlist) { >>>> + if (inet_sk(uts->sock->sk)->inet_sport == port) >>>> + return uts; >>>> + } >>>> + >>>> + return NULL; >>>> +} >>>> +EXPORT_SYMBOL_GPL(udp_tunnel_find_sock); >>>> + >>>> +void udp_tunnel_sock_release(struct udp_tunnel_sock *uts) >>>> +{ >>>> + struct sock *sk = uts->sock->sk; >>>> + struct net *net = sock_net(sk); >>>> + struct udp_tunnel_net *utn = net_generic(net, udp_tunnel_net_id); >>>> + >>>> + spin_lock(&utn->sock_lock); >>>> + hlist_del_rcu(&uts->hlist); >>>> + rcu_assign_sk_user_data(uts->sock->sk, NULL); >>>> + spin_unlock(&utn->sock_lock); >>>> +} >>>> +EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); >>>> + >>>> +/* Calls the ndo_add_tunnel_port of the caller in order to >>>> + * supply the listening VXLAN udp ports. Callers are expected >>>> + * to implement the ndo_add_tunnle_port. >>>> + */ >>>> +void udp_tunnel_get_rx_port(struct net_device *dev) >>>> +{ >>>> + struct udp_tunnel_sock *uts; >>>> + struct net *net = dev_net(dev); >>>> + struct udp_tunnel_net *utn = net_generic(net, udp_tunnel_net_id); >>>> + sa_family_t sa_family; >>>> + __be16 port; >>>> + unsigned int i; >>>> + >>>> + spin_lock(&utn->sock_lock); >>>> + for (i = 0; i < PORT_HASH_SIZE; ++i) { >>>> + hlist_for_each_entry_rcu(uts, &utn->sock_list[i], hlist) { >>>> + port = inet_sk(uts->sock->sk)->inet_sport; >>>> + sa_family = uts->sock->sk->sk_family; >>>> + dev->netdev_ops->ndo_add_udp_tunnel_port(dev, >>>> + sa_family, port, uts->tunnel_type); >>>> + } >>>> + } >>>> + spin_unlock(&utn->sock_lock); >>>> +} >>>> +EXPORT_SYMBOL_GPL(udp_tunnel_get_rx_port); >>>> + >>>> +static int __net_init udp_tunnel_init_net(struct net *net) >>>> +{ >>>> + struct udp_tunnel_net *utn = net_generic(net, udp_tunnel_net_id); >>>> + unsigned int h; >>>> + >>>> + spin_lock_init(&utn->sock_lock); >>>> + >>>> + for (h = 0; h < 
PORT_HASH_SIZE; h++) >>>> + INIT_HLIST_HEAD(&utn->sock_list[h]); >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +static struct pernet_operations udp_tunnel_net_ops = { >>>> + .init = udp_tunnel_init_net, >>>> + .exit = NULL, >>>> + .id = &udp_tunnel_net_id, >>>> + .size = sizeof(struct udp_tunnel_net), >>>> +}; >>>> + >>>> +static int __init udp_tunnel_init(void) >>>> +{ >>>> + return register_pernet_subsys(&udp_tunnel_net_ops); >>>> +} >>>> +late_initcall(udp_tunnel_init); >>>> + >>>> MODULE_LICENSE("GPL"); >>>> -- >>>> 1.7.9.5 >>>> >>>> -- >>>> To unsubscribe from this list: send the line "unsubscribe netdev" in >>>> the body of a message to majordomo@vger.kernel.org >>>> More majordomo info at http://vger.kernel.org/majordomo-info.html