[PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel

* [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
@ 2018-04-02  0:47 Md. Islam
  2018-04-02 16:51 ` Stephen Hemminger
                   ` (3 more replies)
  0 siblings, 4 replies; 20+ messages in thread
From: Md. Islam @ 2018-04-02  0:47 UTC (permalink / raw)
  To: netdev, David Miller, David Ahern, stephen, agaceph,
	Pavel Emelyanov, Eric Dumazet, alexei.starovoitov, brouer

[-- Attachment #1: Type: text/plain, Size: 10604 bytes --]

This patch implements IPv4 forwarding on xdp_buff. I added a new
config option XDP_ROUTER. Kernel would forward packets through fast
path when this option is enabled. But it would require driver support.
Currently it only works with veth. Here I have modified veth such that
it outputs xdp_buff. I created a testbed in Mininet. The Mininet
script (topology.py) is attached. Here the topology is:

h1 -----r1-----h2 (r1 acts as a router)

This patch improves the throughput from 53.8Gb/s to 60Gb/s on my
machine. Median RTT also improved from around .055 ms to around .035
ms.

Then I disabled hyperthreading and cpu frequency scaling in order to
utilize CPU cache (DPDK also utilizes CPU cache to improve
forwarding). This further improves per-packet forwarding latency from
around 400ns to 200 ns. More specifically, header parsing and fib
lookup only takes around 82 ns. This shows that this could be used to
implement linerate packet forwarding in kernel.

The patch has been generated on 4.15.0+. Please let me know your
feedback and suggestions. Please feel free to let me know if this
approach make sense.

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 944ec3c..8474eef 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -328,6 +328,18 @@ config VETH
       When one end receives the packet it appears on its pair and vice
       versa.

+config XDP_ROUTER
+    bool "XDP router for veth"
+    depends on IP_ADVANCED_ROUTER
+    depends on VETH
+    default y
+    ---help---
+      This option will enable IP forwarding on incoming xdp_buff.
+      Currently it is only supported by veth. Say y or n.
+
+      Currently veth uses slow path for packet forwarding. This option
+      forwards packets as soon as it is received (as XDP generic).
+
 config VIRTIO_NET
     tristate "Virtio network driver"
     depends on VIRTIO
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a69ad39..76112f9 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -111,6 +111,29 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb,
struct net_device *dev)
         goto drop;
     }

+#ifdef CONFIG_XDP_ROUTER
+
+    /* if IP forwarding is enabled on the receiver, create xdp_buff
+     * from skb and call xdp_router_forward()
+     */
+    if (is_forwarding_enabled(rcv)) {
+        struct xdp_buff *xdp = kmalloc(sizeof(*xdp), GFP_KERNEL);
+
+        xdp->data = skb->data;
+        xdp->data_end = skb->data + (skb->len - skb->data_len);
+        xdp->data_meta = skb;
+        prefetch_xdp(xdp);
+        if (likely(xdp_router_forward(rcv, xdp) == NET_RX_SUCCESS)) {
+            struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+
+            u64_stats_update_begin(&stats->syncp);
+            stats->bytes += length;
+            stats->packets++;
+            u64_stats_update_end(&stats->syncp);
+            goto success;
+        }
+    }
+#endif
     if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
         struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);

@@ -122,6 +145,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb,
struct net_device *dev)
 drop:
         atomic64_inc(&priv->dropped);
     }
+success:
     rcu_read_unlock();
     return NETDEV_TX_OK;
 }
@@ -276,6 +300,62 @@ static void veth_set_rx_headroom(struct
net_device *dev, int new_hr)
     rcu_read_unlock();
 }

+#ifdef CONFIG_XDP_ROUTER
+int veth_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+{
+    struct veth_priv *priv = netdev_priv(dev);
+    struct net_device *rcv;
+    struct ethhdr *ethh;
+    struct sk_buff *skb;
+    int length = xdp->data_end - xdp->data;
+
+    rcu_read_lock();
+    rcv = rcu_dereference(priv->peer);
+    if (unlikely(!rcv)) {
+        kfree(xdp);
+        goto drop;
+    }
+
+    /* Update MAC address and checksum */
+    ethh = eth_hdr_xdp(xdp);
+    ether_addr_copy(ethh->h_source, dev->dev_addr);
+    ether_addr_copy(ethh->h_dest, rcv->dev_addr);
+
+    /* if IP forwarding is enabled on the receiver,
+     * call xdp_router_forward()
+     */
+    if (is_forwarding_enabled(rcv)) {
+        prefetch_xdp(xdp);
+        if (likely(xdp_router_forward(rcv, xdp) == NET_RX_SUCCESS)) {
+            struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+
+            u64_stats_update_begin(&stats->syncp);
+            stats->bytes += length;
+            stats->packets++;
+            u64_stats_update_end(&stats->syncp);
+            goto success;
+        }
+    }
+
+    /* Local deliver */
+    skb = (struct sk_buff *)xdp->data_meta;
+    if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
+        struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+
+        u64_stats_update_begin(&stats->syncp);
+        stats->bytes += length;
+        stats->packets++;
+        u64_stats_update_end(&stats->syncp);
+    } else {
+drop:
+        atomic64_inc(&priv->dropped);
+    }
+success:
+    rcu_read_unlock();
+    return NETDEV_TX_OK;
+}
+#endif
+
 static const struct net_device_ops veth_netdev_ops = {
     .ndo_init            = veth_dev_init,
     .ndo_open            = veth_open,
@@ -290,6 +370,9 @@ static const struct net_device_ops veth_netdev_ops = {
     .ndo_get_iflink        = veth_get_iflink,
     .ndo_features_check    = passthru_features_check,
     .ndo_set_rx_headroom    = veth_set_rx_headroom,
+#ifdef CONFIG_XDP_ROUTER
+    .ndo_xdp_xmit        = veth_xdp_xmit,
+#endif
 };

 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
diff --git a/include/linux/ip.h b/include/linux/ip.h
index 492bc65..025a3ec 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -19,6 +19,29 @@

 #include <linux/skbuff.h>
 #include <uapi/linux/ip.h>
+#include <linux/filter.h>
+
+#ifdef CONFIG_XDP_ROUTER
+
+#define MIN_PACKET_SIZE 55
+
+static inline struct iphdr *ip_hdr_xdp(const struct xdp_buff *xdp)
+{
+    return (struct iphdr *)(xdp->data+ETH_HLEN);
+}
+
+static inline struct ethhdr *eth_hdr_xdp(const struct xdp_buff *xdp)
+{
+    return (struct ethhdr *)(xdp->data);
+}
+
+static inline bool is_xdp_forwardable(const struct xdp_buff *xdp)
+{
+    return xdp->data_end - xdp->data >= MIN_PACKET_SIZE;
+}
+
+#endif
+

 static inline struct iphdr *ip_hdr(const struct sk_buff *skb)
 {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4c77f39..e3bf002 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3290,6 +3290,12 @@ static inline void dev_consume_skb_any(struct
sk_buff *skb)
     __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
 }

+#ifdef CONFIG_XDP_ROUTER
+bool is_forwarding_enabled(struct net_device *dev);
+int xdp_router_forward(struct net_device *dev, struct xdp_buff *xdp);
+void prefetch_xdp(struct xdp_buff *xdp);
+#endif
+
 void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
 int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 int netif_rx(struct sk_buff *skb);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f805243..623b2de 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -369,6 +369,12 @@ int fib_sync_down_dev(struct net_device *dev,
unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);

+#ifdef CONFIG_XDP_ROUTER
+int ip_route_lookup(__be32 daddr, __be32 saddr,
+                   u8 tos, struct net_device *dev,
+                   struct fib_result *res);
+#endif
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
                const struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index dda9d7b..9d92352 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4090,6 +4090,65 @@ int do_xdp_generic(struct bpf_prog *xdp_prog,
struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(do_xdp_generic);

+#ifdef CONFIG_XDP_ROUTER
+
+bool is_forwarding_enabled(struct net_device *dev)
+{
+    struct in_device *in_dev;
+
+    /* verify forwarding is enabled on this interface */
+    in_dev = __in_dev_get_rcu(dev);
+    if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+        return false;
+
+    return true;
+}
+EXPORT_SYMBOL_GPL(is_forwarding_enabled);
+
+int xdp_router_forward(struct net_device *dev, struct xdp_buff *xdp)
+{
+        int err;
+        struct fib_result res;
+        struct iphdr *iph;
+        struct net_device *rcv;
+
+        if (unlikely(xdp->data_end - xdp->data < MIN_PACKET_SIZE))
+            return NET_RX_DROP;
+
+        iph = (struct iphdr *)(xdp->data + ETH_HLEN);
+
+        /*currently only supports IPv4
+         */
+        if (unlikely(iph->version != 4))
+            return NET_RX_DROP;
+
+        err = ip_route_lookup(iph->daddr, iph->saddr,
+                      iph->tos, dev, &res);
+        if (unlikely(err))
+            return NET_RX_DROP;
+
+        rcv = FIB_RES_DEV(res);
+        if (likely(rcv)) {
+            if (likely(rcv->netdev_ops->ndo_xdp_xmit(rcv, xdp) ==
+                       NETDEV_TX_OK))
+                return NET_RX_SUCCESS;
+        }
+
+        return NET_RX_DROP;
+}
+EXPORT_SYMBOL_GPL(xdp_router_forward);
+
+inline void prefetch_xdp(struct xdp_buff *xdp)
+{
+        prefetch(xdp);
+        /* prefetch version, tos, saddr and daddr of IP header */
+        prefetch(xdp->data + ETH_HLEN);
+        prefetch(xdp->data + ETH_HLEN + 12);
+}
+EXPORT_SYMBOL_GPL(prefetch_xdp);
+
+#endif
+
 static int netif_rx_internal(struct sk_buff *skb)
 {
     int ret;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 49cc1c1..2333205 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1866,6 +1866,35 @@ static int ip_mkroute_input(struct sk_buff *skb,
     return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
 }

+#ifdef CONFIG_XDP_ROUTER
+
+int ip_route_lookup(__be32 daddr, __be32 saddr,
+            u8 tos, struct net_device *dev,
+            struct fib_result *res)
+{
+    struct flowi4    fl4;
+    int        err;
+    struct net    *net = dev_net(dev);
+
+    fl4.flowi4_oif = 0;
+    fl4.flowi4_iif = dev->ifindex;
+    fl4.flowi4_mark = 0;
+    fl4.flowi4_tos = tos & IPTOS_RT_MASK;
+    fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+    fl4.flowi4_flags = 0;
+    fl4.daddr = daddr;
+    fl4.saddr = saddr;
+
+    err = fib_lookup(net, &fl4, res, 0);
+
+    if (unlikely(err != 0 || res->type != RTN_UNICAST))
+        return -EINVAL;
+
+    return 0;
+}
+EXPORT_SYMBOL_GPL(ip_route_lookup);
+#endif
+
 /*
  *    NOTE. We drop all the packets that has local source
  *    addresses, because every properly looped back packet

Many thanks
Tamim

[-- Attachment #2: topology.py --]
[-- Type: text/x-python, Size: 3289 bytes --]

#!/usr/bin/python

"""

##############################################################################
# Topology with one router and two hosts with static routes
#
#       172.16.101.0/24         172.16.102.0/24   
#  h1 ------------------- r1 ------------------ h2
#    .1                .2   .3               .1   
#
##############################################################################

Here r1 acts as a Linux router. There are two veth-pairs in our topology as following.

h1-eth0-------r1-eth2       r1-eth3-----h2eth0    

Packets received on r1-eth2 is being transmitted to r1-eth3 using XDP fast path. Packets are generated on h1 towards h2 using iperf.

"""


from mininet.topo import Topo
from mininet.net import Mininet
from mininet.link import TCLink
from mininet.node import Node, CPULimitedHost
from mininet.log import setLogLevel, info
from mininet.util import custom, waitListening
from mininet.cli import CLI
import sys
import time

class LinuxRouter( Node ):
    "A Node with IP forwarding enabled."

    def config( self, **params ):
        super( LinuxRouter, self).config( **params )
        # Enable forwarding on the router
        self.cmd( 'sysctl net.ipv4.ip_forward=1' )

    def terminate( self ):
        self.cmd( 'sysctl net.ipv4.ip_forward=0' )
        super( LinuxRouter, self ).terminate()

class NetworkTopo( Topo ):
    def build( self, **_opts ):	
        h1 = self.addHost( 'h1', ip='172.16.101.1/24', defaultRoute='via 172.16.101.2' )
        h2 = self.addHost( 'h2', ip='172.16.102.1/24', defaultRoute='via 172.16.102.3' )
	r1 = self.addNode( 'r1', cls=LinuxRouter, ip='172.16.101.2/24' )

        self.addLink( h1, r1, intfName2='r1-eth2', params2={ 'ip' : '172.16.101.2/24' })
        self.addLink( h2, r1, intfName2='r1-eth3', params2={ 'ip' : '172.16.102.3/24' })

def main(cli=0):
    "Test linux router"
    topo = NetworkTopo()

    net = Mininet( topo=topo, controller = None )
    net.start()

#   testing using port 45678, TCP window size 20MB and 10 connection. 
    res = net['h2'].cmd('iperf -s -p 45678 -w 20MB &')
#Anyhing that blocks shouldn't be used in cmd(). Use popen() instead. It will create a new process. Now monitor the output of the process
    proc = net['h1'].popen('iperf -c 172.16.102.1 -p 45678 -t 30  -w 20MB -P 10')


#    print res #Don't uncomment this. Strange things happen :-(

#Parse the res to find out the PID of iperf server. The program can crash if res isn't formatted properly. Try again
    pid = res.split(" ")
    iperf_s_pid = pid[1]
    iperf_c_pid = int(pid[1]) + 1

    print pid[1], int(pid[1]) + 1

#Pin iperf server and client to core 0 and 1 respectively. Note that throughput you get depends on other applications running on a CPU. So if you get bad throughput, try restatring. Even though you close some applications, the process can sit in the backgroud
    net['h2'].cmd('sudo taskset -pc 0 {0}'.format(iperf_s_pid))
    net['h1'].cmd('sudo taskset -pc 1 {0}'.format(iperf_c_pid)) 

    for line in iter(proc.stdout.readline, b''):
	print line

    net['h2'].cmd('sudo kill -9 {0}'.format(iperf_s_pid))
    net['h1'].cmd('sudo kill -9 {0}'.format(iperf_c_pid))

    CLI( net )
    net.stop()

if __name__ == '__main__':
    args = sys.argv
    setLogLevel( 'info' )
    main()

^ permalink raw reply related	[flat|nested] 20+ messages in thread