* [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
@ 2018-04-02  0:47 Md. Islam
  2018-04-02 16:51 ` Stephen Hemminger
                   ` (3 more replies)
  0 siblings, 4 replies; 20+ messages in thread
From: Md. Islam @ 2018-04-02  0:47 UTC (permalink / raw)
  To: netdev, David Miller, David Ahern, stephen, agaceph,
	Pavel Emelyanov, Eric Dumazet, alexei.starovoitov, brouer

[-- Attachment #1: Type: text/plain, Size: 10604 bytes --]

This patch implements IPv4 forwarding on xdp_buff. I added a new
config option XDP_ROUTER. The kernel forwards packets through a fast
path when this option is enabled, but this requires driver support.
Currently it only works with veth. Here I have modified veth such that
it outputs xdp_buff. I created a testbed in Mininet. The Mininet
script (topology.py) is attached. Here the topology is:

h1 -----r1-----h2 (r1 acts as a router)

This patch improves the throughput from 53.8Gb/s to 60Gb/s on my
machine. Median RTT also improved from around .055 ms to around .035
ms.

Then I disabled hyperthreading and cpu frequency scaling in order to
utilize CPU cache (DPDK also utilizes CPU cache to improve
forwarding). This further improves per-packet forwarding latency from
around 400ns to 200 ns. More specifically, header parsing and fib
lookup only takes around 82 ns. This shows that this could be used to
implement linerate packet forwarding in kernel.

The patch has been generated on 4.15.0+. Please let me know your
feedback and suggestions. Please feel free to let me know if this
approach makes sense.

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 944ec3c..8474eef 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -328,6 +328,18 @@ config VETH
       When one end receives the packet it appears on its pair and vice
       versa.

+config XDP_ROUTER
+    bool "XDP router for veth"
+    depends on IP_ADVANCED_ROUTER
+    depends on VETH
+    default y
+    ---help---
+      This option will enable IP forwarding on incoming xdp_buff.
+      Currently it is only supported by veth. Say y or n.
+
+      Currently veth uses the slow path for packet forwarding. This option
+      forwards packets as soon as they are received (as with generic XDP).
+
 config VIRTIO_NET
     tristate "Virtio network driver"
     depends on VIRTIO
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a69ad39..76112f9 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -111,6 +111,29 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb,
struct net_device *dev)
         goto drop;
     }

+#ifdef CONFIG_XDP_ROUTER
+
+    /* if IP forwarding is enabled on the receiver, create xdp_buff
+     * from skb and call xdp_router_forward()
+     */
+    if (is_forwarding_enabled(rcv)) {
+        struct xdp_buff *xdp = kmalloc(sizeof(*xdp), GFP_KERNEL);
+
+        xdp->data = skb->data;
+        xdp->data_end = skb->data + (skb->len - skb->data_len);
+        xdp->data_meta = skb;
+        prefetch_xdp(xdp);
+        if (likely(xdp_router_forward(rcv, xdp) == NET_RX_SUCCESS)) {
+            struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+
+            u64_stats_update_begin(&stats->syncp);
+            stats->bytes += length;
+            stats->packets++;
+            u64_stats_update_end(&stats->syncp);
+            goto success;
+        }
+    }
+#endif
     if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
         struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);

@@ -122,6 +145,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb,
struct net_device *dev)
 drop:
         atomic64_inc(&priv->dropped);
     }
+success:
     rcu_read_unlock();
     return NETDEV_TX_OK;
 }
@@ -276,6 +300,62 @@ static void veth_set_rx_headroom(struct
net_device *dev, int new_hr)
     rcu_read_unlock();
 }

+#ifdef CONFIG_XDP_ROUTER
+int veth_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+{
+    struct veth_priv *priv = netdev_priv(dev);
+    struct net_device *rcv;
+    struct ethhdr *ethh;
+    struct sk_buff *skb;
+    int length = xdp->data_end - xdp->data;
+
+    rcu_read_lock();
+    rcv = rcu_dereference(priv->peer);
+    if (unlikely(!rcv)) {
+        kfree(xdp);
+        goto drop;
+    }
+
+    /* Update MAC address and checksum */
+    ethh = eth_hdr_xdp(xdp);
+    ether_addr_copy(ethh->h_source, dev->dev_addr);
+    ether_addr_copy(ethh->h_dest, rcv->dev_addr);
+
+    /* if IP forwarding is enabled on the receiver,
+     * call xdp_router_forward()
+     */
+    if (is_forwarding_enabled(rcv)) {
+        prefetch_xdp(xdp);
+        if (likely(xdp_router_forward(rcv, xdp) == NET_RX_SUCCESS)) {
+            struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+
+            u64_stats_update_begin(&stats->syncp);
+            stats->bytes += length;
+            stats->packets++;
+            u64_stats_update_end(&stats->syncp);
+            goto success;
+        }
+    }
+
+    /* Local deliver */
+    skb = (struct sk_buff *)xdp->data_meta;
+    if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
+        struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+
+        u64_stats_update_begin(&stats->syncp);
+        stats->bytes += length;
+        stats->packets++;
+        u64_stats_update_end(&stats->syncp);
+    } else {
+drop:
+        atomic64_inc(&priv->dropped);
+    }
+success:
+    rcu_read_unlock();
+    return NETDEV_TX_OK;
+}
+#endif
+
 static const struct net_device_ops veth_netdev_ops = {
     .ndo_init            = veth_dev_init,
     .ndo_open            = veth_open,
@@ -290,6 +370,9 @@ static const struct net_device_ops veth_netdev_ops = {
     .ndo_get_iflink        = veth_get_iflink,
     .ndo_features_check    = passthru_features_check,
     .ndo_set_rx_headroom    = veth_set_rx_headroom,
+#ifdef CONFIG_XDP_ROUTER
+    .ndo_xdp_xmit        = veth_xdp_xmit,
+#endif
 };

 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
diff --git a/include/linux/ip.h b/include/linux/ip.h
index 492bc65..025a3ec 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -19,6 +19,29 @@

 #include <linux/skbuff.h>
 #include <uapi/linux/ip.h>
+#include <linux/filter.h>
+
+#ifdef CONFIG_XDP_ROUTER
+
+#define MIN_PACKET_SIZE 55
+
+static inline struct iphdr *ip_hdr_xdp(const struct xdp_buff *xdp)
+{
+    return (struct iphdr *)(xdp->data+ETH_HLEN);
+}
+
+static inline struct ethhdr *eth_hdr_xdp(const struct xdp_buff *xdp)
+{
+    return (struct ethhdr *)(xdp->data);
+}
+
+static inline bool is_xdp_forwardable(const struct xdp_buff *xdp)
+{
+    return xdp->data_end - xdp->data >= MIN_PACKET_SIZE;
+}
+
+#endif
+

 static inline struct iphdr *ip_hdr(const struct sk_buff *skb)
 {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4c77f39..e3bf002 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3290,6 +3290,12 @@ static inline void dev_consume_skb_any(struct
sk_buff *skb)
     __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
 }

+#ifdef CONFIG_XDP_ROUTER
+bool is_forwarding_enabled(struct net_device *dev);
+int xdp_router_forward(struct net_device *dev, struct xdp_buff *xdp);
+void prefetch_xdp(struct xdp_buff *xdp);
+#endif
+
 void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
 int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 int netif_rx(struct sk_buff *skb);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f805243..623b2de 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -369,6 +369,12 @@ int fib_sync_down_dev(struct net_device *dev,
unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);

+#ifdef CONFIG_XDP_ROUTER
+int ip_route_lookup(__be32 daddr, __be32 saddr,
+                   u8 tos, struct net_device *dev,
+                   struct fib_result *res);
+#endif
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
                const struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index dda9d7b..9d92352 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4090,6 +4090,65 @@ int do_xdp_generic(struct bpf_prog *xdp_prog,
struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(do_xdp_generic);

+#ifdef CONFIG_XDP_ROUTER
+
+bool is_forwarding_enabled(struct net_device *dev)
+{
+    struct in_device *in_dev;
+
+    /* verify forwarding is enabled on this interface */
+    in_dev = __in_dev_get_rcu(dev);
+    if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+        return false;
+
+    return true;
+}
+EXPORT_SYMBOL_GPL(is_forwarding_enabled);
+
+int xdp_router_forward(struct net_device *dev, struct xdp_buff *xdp)
+{
+        int err;
+        struct fib_result res;
+        struct iphdr *iph;
+        struct net_device *rcv;
+
+        if (unlikely(xdp->data_end - xdp->data < MIN_PACKET_SIZE))
+            return NET_RX_DROP;
+
+        iph = (struct iphdr *)(xdp->data + ETH_HLEN);
+
+        /*currently only supports IPv4
+         */
+        if (unlikely(iph->version != 4))
+            return NET_RX_DROP;
+
+        err = ip_route_lookup(iph->daddr, iph->saddr,
+                      iph->tos, dev, &res);
+        if (unlikely(err))
+            return NET_RX_DROP;
+
+        rcv = FIB_RES_DEV(res);
+        if (likely(rcv)) {
+            if (likely(rcv->netdev_ops->ndo_xdp_xmit(rcv, xdp) ==
+                       NETDEV_TX_OK))
+                return NET_RX_SUCCESS;
+        }
+
+        return NET_RX_DROP;
+}
+EXPORT_SYMBOL_GPL(xdp_router_forward);
+
+inline void prefetch_xdp(struct xdp_buff *xdp)
+{
+        prefetch(xdp);
+        /* prefetch version, tos, saddr and daddr of IP header */
+        prefetch(xdp->data + ETH_HLEN);
+        prefetch(xdp->data + ETH_HLEN + 12);
+}
+EXPORT_SYMBOL_GPL(prefetch_xdp);
+
+#endif
+
 static int netif_rx_internal(struct sk_buff *skb)
 {
     int ret;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 49cc1c1..2333205 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1866,6 +1866,35 @@ static int ip_mkroute_input(struct sk_buff *skb,
     return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
 }

+#ifdef CONFIG_XDP_ROUTER
+
+int ip_route_lookup(__be32 daddr, __be32 saddr,
+            u8 tos, struct net_device *dev,
+            struct fib_result *res)
+{
+    struct flowi4    fl4;
+    int        err;
+    struct net    *net = dev_net(dev);
+
+    fl4.flowi4_oif = 0;
+    fl4.flowi4_iif = dev->ifindex;
+    fl4.flowi4_mark = 0;
+    fl4.flowi4_tos = tos & IPTOS_RT_MASK;
+    fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+    fl4.flowi4_flags = 0;
+    fl4.daddr = daddr;
+    fl4.saddr = saddr;
+
+    err = fib_lookup(net, &fl4, res, 0);
+
+    if (unlikely(err != 0 || res->type != RTN_UNICAST))
+        return -EINVAL;
+
+    return 0;
+}
+EXPORT_SYMBOL_GPL(ip_route_lookup);
+#endif
+
 /*
  *    NOTE. We drop all the packets that has local source
  *    addresses, because every properly looped back packet

Many thanks
Tamim

[-- Attachment #2: topology.py --]
[-- Type: text/x-python, Size: 3289 bytes --]

#!/usr/bin/python

"""

##############################################################################
# Topology with one router and two hosts with static routes
#
#       172.16.101.0/24         172.16.102.0/24   
#  h1 ------------------- r1 ------------------ h2
#    .1                .2   .3               .1   
#
##############################################################################

Here r1 acts as a Linux router. There are two veth pairs in the topology, as follows:

h1-eth0-------r1-eth2       r1-eth3-----h2-eth0

Packets received on r1-eth2 are transmitted to r1-eth3 via the XDP fast path. Packets are generated on h1 towards h2 using iperf.

"""


from mininet.topo import Topo
from mininet.net import Mininet
from mininet.link import TCLink
from mininet.node import Node, CPULimitedHost
from mininet.log import setLogLevel, info
from mininet.util import custom, waitListening
from mininet.cli import CLI
import sys
import time

class LinuxRouter( Node ):
    "A Node with IP forwarding enabled."

    def config( self, **params ):
        super( LinuxRouter, self).config( **params )
        # Enable forwarding on the router
        self.cmd( 'sysctl net.ipv4.ip_forward=1' )

    def terminate( self ):
        self.cmd( 'sysctl net.ipv4.ip_forward=0' )
        super( LinuxRouter, self ).terminate()

class NetworkTopo( Topo ):
    def build( self, **_opts ):
        h1 = self.addHost( 'h1', ip='172.16.101.1/24', defaultRoute='via 172.16.101.2' )
        h2 = self.addHost( 'h2', ip='172.16.102.1/24', defaultRoute='via 172.16.102.3' )
        r1 = self.addNode( 'r1', cls=LinuxRouter, ip='172.16.101.2/24' )

        self.addLink( h1, r1, intfName2='r1-eth2', params2={ 'ip' : '172.16.101.2/24' })
        self.addLink( h2, r1, intfName2='r1-eth3', params2={ 'ip' : '172.16.102.3/24' })

def main(cli=0):
    "Test linux router"
    topo = NetworkTopo()

    net = Mininet( topo=topo, controller = None )
    net.start()

    # Test using port 45678, a 20MB TCP window and 10 connections.
    res = net['h2'].cmd('iperf -s -p 45678 -w 20MB &')
    # Anything that blocks shouldn't be run via cmd(). Use popen() instead: it
    # creates a new process whose output is monitored below.
    proc = net['h1'].popen('iperf -c 172.16.102.1 -p 45678 -t 30  -w 20MB -P 10')


    # print res  # Don't uncomment this; strange things happen :-(

    # Parse res (the shell's background-job output) to find the PID of the
    # iperf server; the client is assumed to get the next PID. This can fail
    # if res isn't formatted as expected, in which case just retry.
    pid = res.split(" ")
    iperf_s_pid = pid[1]
    iperf_c_pid = int(pid[1]) + 1

    print pid[1], int(pid[1]) + 1

    # Pin the iperf server and client to cores 0 and 1 respectively. Note that
    # the throughput you get depends on whatever else is running on those CPUs,
    # so if you get bad throughput, try restarting; even after you close some
    # applications, their processes can keep sitting in the background.
    net['h2'].cmd('sudo taskset -pc 0 {0}'.format(iperf_s_pid))
    net['h1'].cmd('sudo taskset -pc 1 {0}'.format(iperf_c_pid))

    for line in iter(proc.stdout.readline, b''):
        print line

    net['h2'].cmd('sudo kill -9 {0}'.format(iperf_s_pid))
    net['h1'].cmd('sudo kill -9 {0}'.format(iperf_c_pid))

    CLI( net )
    net.stop()

if __name__ == '__main__':
    args = sys.argv
    setLogLevel( 'info' )
    main()

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-02  0:47 [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel Md. Islam
@ 2018-04-02 16:51 ` Stephen Hemminger
  2018-04-02 18:03 ` John Fastabend
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 20+ messages in thread
From: Stephen Hemminger @ 2018-04-02 16:51 UTC (permalink / raw)
  To: Md. Islam
  Cc: netdev, David Miller, David Ahern, agaceph, Pavel Emelyanov,
	Eric Dumazet, alexei.starovoitov, brouer

On Sun, 1 Apr 2018 20:47:28 -0400
"Md. Islam" <mislam4@kent.edu> wrote:

> This patch implements IPv4 forwarding on xdp_buff. I added a new
> config option XDP_ROUTER. The kernel forwards packets through a fast
> path when this option is enabled, but this requires driver support.
> Currently it only works with veth. Here I have modified veth such that
> it outputs xdp_buff. I created a testbed in Mininet. The Mininet
> script (topology.py) is attached. Here the topology is:

Having XDP routing would be great.

The solution you have chosen, changing each driver, does not
scale well: it requires lots of changes and is not ready for
upstream.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-02  0:47 [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel Md. Islam
  2018-04-02 16:51 ` Stephen Hemminger
@ 2018-04-02 18:03 ` John Fastabend
  2018-04-02 18:09   ` David Ahern
  2018-04-04  1:16 ` David Ahern
  2018-04-04  6:16 ` Jesper Dangaard Brouer
  3 siblings, 1 reply; 20+ messages in thread
From: John Fastabend @ 2018-04-02 18:03 UTC (permalink / raw)
  To: Md. Islam, netdev, David Miller, David Ahern, stephen, agaceph,
	Pavel Emelyanov, Eric Dumazet, alexei.starovoitov, brouer

On 04/01/2018 05:47 PM, Md. Islam wrote:
> This patch implements IPv4 forwarding on xdp_buff. I added a new
> config option XDP_ROUTER. The kernel forwards packets through a fast
> path when this option is enabled, but this requires driver support.
> Currently it only works with veth. Here I have modified veth such that
> it outputs xdp_buff. I created a testbed in Mininet. The Mininet
> script (topology.py) is attached. Here the topology is:
> 
> h1 -----r1-----h2 (r1 acts as a router)
> 
> This patch improves the throughput from 53.8Gb/s to 60Gb/s on my
> machine. Median RTT also improved from around .055 ms to around .035
> ms.
> 
> Then I disabled hyperthreading and cpu frequency scaling in order to
> utilize CPU cache (DPDK also utilizes CPU cache to improve
> forwarding). This further improves per-packet forwarding latency from
> around 400ns to 200 ns. More specifically, header parsing and fib
> lookup only takes around 82 ns. This shows that this could be used to
> implement linerate packet forwarding in kernel.
> 
> The patch has been generated on 4.15.0+. Please let me know your
> feedback and suggestions. Please feel free to let me know if this
> approach makes sense.

Makes sense, although let's try to avoid hard-coding routing into
XDP xmit routines. See details below.


> +#ifdef CONFIG_XDP_ROUTER
> +int veth_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
> +{

This is nice, but instead of building a new CONFIG_XDP_ROUTER,
just enable standard XDP for veth plus a new helper call to
do the routing. Then it will be immediately usable from any
XDP-enabled device.

> +    struct veth_priv *priv = netdev_priv(dev);
> +    struct net_device *rcv;
> +    struct ethhdr *ethh;
> +    struct sk_buff *skb;
> +    int length = xdp->data_end - xdp->data;
> +
> +    rcu_read_lock();
> +    rcv = rcu_dereference(priv->peer);
> +    if (unlikely(!rcv)) {
> +        kfree(xdp);
> +        goto drop;
> +    }
> +
> +    /* Update MAC address and checksum */
> +    ethh = eth_hdr_xdp(xdp);
> +    ether_addr_copy(ethh->h_source, dev->dev_addr);
> +    ether_addr_copy(ethh->h_dest, rcv->dev_addr);
> +
> +    /* if IP forwarding is enabled on the receiver,
> +     * call xdp_router_forward()
> +     */
> +    if (is_forwarding_enabled(rcv)) {
> +        prefetch_xdp(xdp);
> +        if (likely(xdp_router_forward(rcv, xdp) == NET_RX_SUCCESS)) {
> +            struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
> +
> +            u64_stats_update_begin(&stats->syncp);
> +            stats->bytes += length;
> +            stats->packets++;
> +            u64_stats_update_end(&stats->syncp);
> +            goto success;
> +        }
> +    }
> +
> +    /* Local deliver */
> +    skb = (struct sk_buff *)xdp->data_meta;
> +    if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
> +        struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
> +
> +        u64_stats_update_begin(&stats->syncp);
> +        stats->bytes += length;
> +        stats->packets++;
> +        u64_stats_update_end(&stats->syncp);
> +    } else {
> +drop:
> +        atomic64_inc(&priv->dropped);
> +    }
> +success:
> +    rcu_read_unlock();
> +    return NETDEV_TX_OK;
> +}
> +#endif
> +
>  static const struct net_device_ops veth_netdev_ops = {
>      .ndo_init            = veth_dev_init,
>      .ndo_open            = veth_open,
> @@ -290,6 +370,9 @@ static const struct net_device_ops veth_netdev_ops = {
>      .ndo_get_iflink        = veth_get_iflink,
>      .ndo_features_check    = passthru_features_check,
>      .ndo_set_rx_headroom    = veth_set_rx_headroom,
> +#ifdef CONFIG_XDP_ROUTER
> +    .ndo_xdp_xmit        = veth_xdp_xmit,
> +#endif
>  };
> 

[...]

> +#ifdef CONFIG_XDP_ROUTER
> +int ip_route_lookup(__be32 daddr, __be32 saddr,
> +                   u8 tos, struct net_device *dev,
> +                   struct fib_result *res);
> +#endif
> +

Can the above be a normal BPF helper that returns an
ifindex? Then something roughly like this pattern would
work for all drivers with redirect support,


     route_ifindex = ip_route_lookup(__daddr, ....)
     if (!route_ifindex)
           return do_foo()
     return xdp_redirect(route_ifindex);
     
So my suggestion is,

  1. enable veth xdp (including redirect support)
  2. add a helper to lookup route from routing table

Alternatively you can skip step (2) and encode the routing
table in BPF directly. Maybe we need a more efficient data
structure but that should also work.

Thanks,
John
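
(As a rough illustration of the second alternative above, encoding the
routing table directly in BPF: the sketch below keeps a small LPM trie
map from destination prefix to egress ifindex and redirects on a hit.
The map layout, the names, and the bpf_helpers.h/bpf_endian.h
convenience headers from the kernel samples/selftests are assumptions,
not code from any posted patch.)

/* Rough sketch only: a small "routing table" kept in an LPM trie map,
 * consulted from XDP, with a redirect on a hit.
 */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"

struct route_key {
    __u32 prefixlen;             /* LPM trie keys start with prefixlen */
    __u32 daddr;                 /* destination, network byte order */
};

struct bpf_map_def SEC("maps") fib_map = {
    .type        = BPF_MAP_TYPE_LPM_TRIE,
    .key_size    = sizeof(struct route_key),
    .value_size  = sizeof(__u32),         /* egress ifindex */
    .max_entries = 1024,
    .map_flags   = BPF_F_NO_PREALLOC,     /* required for LPM tries */
};

SEC("xdp")
int xdp_fwd(struct xdp_md *ctx)
{
    void *data_end = (void *)(long)ctx->data_end;
    void *data = (void *)(long)ctx->data;
    struct ethhdr *eth = data;
    struct iphdr *iph = data + sizeof(*eth);
    struct route_key key;
    __u32 *ifindex;

    if ((void *)(iph + 1) > data_end)
        return XDP_PASS;
    if (eth->h_proto != bpf_htons(ETH_P_IP))
        return XDP_PASS;

    key.prefixlen = 32;
    key.daddr = iph->daddr;
    ifindex = bpf_map_lookup_elem(&fib_map, &key);
    if (!ifindex)
        return XDP_PASS;         /* no route known: fall back to the stack */

    /* A real program would also rewrite the MAC addresses and
     * decrement the TTL before redirecting.
     */
    return bpf_redirect(*ifindex, 0);
}

char _license[] SEC("license") = "GPL";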

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-02 18:03 ` John Fastabend
@ 2018-04-02 18:09   ` David Ahern
  2018-04-02 18:16     ` Alexei Starovoitov
  0 siblings, 1 reply; 20+ messages in thread
From: David Ahern @ 2018-04-02 18:09 UTC (permalink / raw)
  To: John Fastabend, Md. Islam, netdev, David Miller, stephen,
	agaceph, Pavel Emelyanov, Eric Dumazet, alexei.starovoitov,
	brouer

On 4/2/18 12:03 PM, John Fastabend wrote:
> 
> Can the above be a normal BPF helper that returns an
> ifindex? Then something roughly like this pattern would
> work for all drivers with redirect support,
> 
> 
>      route_ifindex = ip_route_lookup(__daddr, ....)
>      if (!route_ifindex)
>            return do_foo()
>      return xdp_redirect(route_ifindex);
>      
> So my suggestion is,
> 
>   1. enable veth xdp (including redirect support)
>   2. add a helper to lookup route from routing table
> 
> Alternatively you can skip step (2) and encode the routing
> table in BPF directly. Maybe we need a more efficient data
> structure but that should also work.
> 

That's what I have here:

https://github.com/dsahern/linux/commit/bab42f158c0925339f7519df7fb2cde8eac33aa8

And Jesper has done some measurements showing a 400% improvement in
throughput.

I have not had time to come back to the xdp forwarding set. It needs to
handle vlan devices at a minimum before I send an RFC.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-02 18:09   ` David Ahern
@ 2018-04-02 18:16     ` Alexei Starovoitov
  2018-04-03 15:07       ` David Ahern
  0 siblings, 1 reply; 20+ messages in thread
From: Alexei Starovoitov @ 2018-04-02 18:16 UTC (permalink / raw)
  To: David Ahern
  Cc: John Fastabend, Md. Islam, netdev, David Miller, stephen,
	agaceph, Pavel Emelyanov, Eric Dumazet, brouer

On Mon, Apr 02, 2018 at 12:09:44PM -0600, David Ahern wrote:
> On 4/2/18 12:03 PM, John Fastabend wrote:
> > 
> > Can the above be a normal BPF helper that returns an
> > ifindex? Then something roughly like this pattern would
> > work for all drivers with redirect support,
> > 
> > 
> >      route_ifindex = ip_route_lookup(__daddr, ....)
> >      if (!route_ifindex)
> >            return do_foo()
> >      return xdp_redirect(route_ifindex);
> >      
> > So my suggestion is,
> > 
> >   1. enable veth xdp (including redirect support)
> >   2. add a helper to lookup route from routing table
> > 
> > Alternatively you can skip step (2) and encode the routing
> > table in BPF directly. Maybe we need a more efficient data
> > structure but that should also work.
> > 
> 
> That's what I have here:
> 
> https://github.com/dsahern/linux/commit/bab42f158c0925339f7519df7fb2cde8eac33aa8

was wondering what's up with the delay and when are you going to
submit them officially...
The use case came up several times.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-02 18:16     ` Alexei Starovoitov
@ 2018-04-03 15:07       ` David Ahern
  2018-04-03 16:41         ` John Fastabend
  0 siblings, 1 reply; 20+ messages in thread
From: David Ahern @ 2018-04-03 15:07 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: John Fastabend, Md. Islam, netdev, David Miller, stephen,
	agaceph, Pavel Emelyanov, Eric Dumazet, brouer

On 4/2/18 12:16 PM, Alexei Starovoitov wrote:
> On Mon, Apr 02, 2018 at 12:09:44PM -0600, David Ahern wrote:
>> On 4/2/18 12:03 PM, John Fastabend wrote:
>>>
>>> Can the above be a normal BPF helper that returns an
>>> ifindex? Then something roughly like this pattern would
>>> work for all drivers with redirect support,
>>>
>>>
>>>      route_ifindex = ip_route_lookup(__daddr, ....)
>>>      if (!route_ifindex)
>>>            return do_foo()
>>>      return xdp_redirect(route_ifindex);
>>>      
>>> So my suggestion is,
>>>
>>>   1. enable veth xdp (including redirect support)
>>>   2. add a helper to lookup route from routing table
>>>
>>> Alternatively you can skip step (2) and encode the routing
>>> table in BPF directly. Maybe we need a more efficient data
>>> structure but that should also work.
>>>
>>
>> That's what I have here:
>>
>> https://github.com/dsahern/linux/commit/bab42f158c0925339f7519df7fb2cde8eac33aa8
> 
> was wondering what's up with the delay and when are you going to
> submit them officially...
> The use case came up several times.
> 

I need to find time to come back to that set. As I recall there are a number
of outstanding issues:

1. you and Daniel had comments about the bpf_func_proto declarations

2. Jesper had concerns about xdp redirect to any netdev. e.g., How does
the lookup know the egress netdev supports xdp? Right now you can try
and the packet is dropped if it is not supported.

3. VLAN devices. I suspect these will affect the final bpf function
prototype. It would be awkward to have 1 forwarding API for non-vlan
devices and a second for vlan devices, hence the need to resolve this
before it goes in.

4. What about other stacked devices - bonds and bridges - will those
just work with the bpf helper? VRF is already handled of course. ;-)

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-03 15:07       ` David Ahern
@ 2018-04-03 16:41         ` John Fastabend
  2018-04-03 16:45           ` David Miller
  2018-04-03 17:00           ` David Ahern
  0 siblings, 2 replies; 20+ messages in thread
From: John Fastabend @ 2018-04-03 16:41 UTC (permalink / raw)
  To: David Ahern, Alexei Starovoitov
  Cc: Md. Islam, netdev, David Miller, stephen, agaceph,
	Pavel Emelyanov, Eric Dumazet, brouer

On 04/03/2018 08:07 AM, David Ahern wrote:
> On 4/2/18 12:16 PM, Alexei Starovoitov wrote:
>> On Mon, Apr 02, 2018 at 12:09:44PM -0600, David Ahern wrote:
>>> On 4/2/18 12:03 PM, John Fastabend wrote:
>>>>
>>>> Can the above be a normal BPF helper that returns an
>>>> ifindex? Then something roughly like this pattern would
>>>> work for all drivers with redirect support,
>>>>
>>>>
>>>>      route_ifindex = ip_route_lookup(__daddr, ....)
>>>>      if (!route_ifindex)
>>>>            return do_foo()
>>>>      return xdp_redirect(route_ifindex);
>>>>      
>>>> So my suggestion is,
>>>>
>>>>   1. enable veth xdp (including redirect support)
>>>>   2. add a helper to lookup route from routing table
>>>>
>>>> Alternatively you can skip step (2) and encode the routing
>>>> table in BPF directly. Maybe we need a more efficient data
>>>> structure but that should also work.
>>>>
>>>
>>> That's what I have here:
>>>
>>> https://github.com/dsahern/linux/commit/bab42f158c0925339f7519df7fb2cde8eac33aa8
>>
>> was wondering what's up with the delay and when are you going to
>> submit them officially...
>> The use case came up several times.
>>
> 
> I need to find time to come back to that set. As I recall there are a number
> of outstanding issues:
> 
> 1. you and Daniel had comments about the bpf_func_proto declarations
> 
> 2. Jesper had concerns about xdp redirect to any netdev. e.g., How does
> the lookup know the egress netdev supports xdp? Right now you can try
> and the packet is dropped if it is not supported.
> 

There should probably be a tracepoint there if not already. Otherwise
I think the orchestration/loader layer should be ensuring that xdp
support is sufficient. I don't think we need anything specific in the
XDP/BPF code to handle unsupported devices.

> 3. VLAN devices. I suspect these will affect the final bpf function
> prototype. It would be awkward to have 1 forwarding API for non-vlan
> devices and a second for vlan devices, hence the need to resolve this
> before it goes in.
> 

Interesting. Do we need stacked XDP? I could imagine having 802.1Q
simply call the lower dev XDP xmit routine. Possibly adding the 8021q
header first.

Or alternatively a new dev type could let users query things like
vlan-id from the dev rather than automatically doing the tagging. I
suspect though if you forward to a vlan device automatically adding
the tag is the correct behavior.
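
(As a rough illustration of that idea, the hypothetical sketch below
pushes the 802.1Q tag into the xdp_buff headroom and hands the frame to
the real device, assuming the per-buffer ndo_xdp_xmit signature used in
this thread; it is not an actual 8021q patch and it omits most error
handling.)

/* Rough sketch only: tag the frame and pass it to the real device. */
#include <linux/if_vlan.h>
#include <linux/netdevice.h>
#include <linux/string.h>

static int vlan_dev_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
{
    struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
    struct net_device *real_dev = vlan->real_dev;
    struct vlan_ethhdr *vhdr;

    if (!real_dev->netdev_ops->ndo_xdp_xmit)
        return -EOPNOTSUPP;

    /* need 4 bytes of headroom for the 802.1Q tag */
    if (xdp->data - xdp->data_hard_start < VLAN_HLEN)
        return -ENOSPC;

    /* move the MAC addresses down and insert the tag behind them */
    xdp->data -= VLAN_HLEN;
    memmove(xdp->data, xdp->data + VLAN_HLEN, 2 * ETH_ALEN);
    vhdr = (struct vlan_ethhdr *)xdp->data;
    vhdr->h_vlan_proto = vlan->vlan_proto;
    vhdr->h_vlan_TCI = htons(vlan->vlan_id);

    return real_dev->netdev_ops->ndo_xdp_xmit(real_dev, xdp);
}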


> 4. What about other stacked devices - bonds and bridges - will those
> just work with the bpf helper? VRF is already handled of course. ;-)
> 

So if we simply handle this like other stacked devices and call the
lower devs xdp_xmit routine we should get reasonable behavior. For
bonds and bridges I guess some generalization is needed though because
everything at the moment is skb centric. I don't think its necessary
in the first series though. It can be added later.

.John

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-03 16:41         ` John Fastabend
@ 2018-04-03 16:45           ` David Miller
  2018-04-03 17:00           ` David Ahern
  1 sibling, 0 replies; 20+ messages in thread
From: David Miller @ 2018-04-03 16:45 UTC (permalink / raw)
  To: john.fastabend
  Cc: dsahern, alexei.starovoitov, mislam4, netdev, stephen, agaceph,
	xemul, edumazet, brouer

From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 3 Apr 2018 09:41:08 -0700

> On 04/03/2018 08:07 AM, David Ahern wrote:
>> 4. What about other stacked devices - bonds and bridges - will those
>> just work with the bpf helper? VRF is already handled of course. ;-)
> 
> So if we simply handle this like other stacked devices and call the
> lower devs xdp_xmit routine we should get reasonable behavior. For
> bonds and bridges I guess some generalization is needed though because
> everything at the moment is skb centric. I don't think its necessary
> in the first series though. It can be added later.

I wonder if we need some feature bit gating this passthrough, just
like the various ->vlan_features and ->hw_enc_features.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-03 16:41         ` John Fastabend
  2018-04-03 16:45           ` David Miller
@ 2018-04-03 17:00           ` David Ahern
  2018-04-03 17:06             ` Alexei Starovoitov
  2018-04-03 18:21             ` Jesper Dangaard Brouer
  1 sibling, 2 replies; 20+ messages in thread
From: David Ahern @ 2018-04-03 17:00 UTC (permalink / raw)
  To: John Fastabend, Alexei Starovoitov
  Cc: Md. Islam, netdev, David Miller, stephen, agaceph,
	Pavel Emelyanov, Eric Dumazet, brouer

On 4/3/18 10:41 AM, John Fastabend wrote:
> On 04/03/2018 08:07 AM, David Ahern wrote:
>> On 4/2/18 12:16 PM, Alexei Starovoitov wrote:
>>> On Mon, Apr 02, 2018 at 12:09:44PM -0600, David Ahern wrote:
>>>> On 4/2/18 12:03 PM, John Fastabend wrote:
>>>>>
>>>>> Can the above be a normal BPF helper that returns an
>>>>> ifindex? Then something roughly like this pattern would
>>>>> work for all drivers with redirect support,
>>>>>
>>>>>
>>>>>      route_ifindex = ip_route_lookup(__daddr, ....)
>>>>>      if (!route_ifindex)
>>>>>            return do_foo()
>>>>>      return xdp_redirect(route_ifindex);
>>>>>      
>>>>> So my suggestion is,
>>>>>
>>>>>   1. enable veth xdp (including redirect support)
>>>>>   2. add a helper to lookup route from routing table
>>>>>
>>>>> Alternatively you can skip step (2) and encode the routing
>>>>> table in BPF directly. Maybe we need a more efficient data
>>>>> structure but that should also work.
>>>>>
>>>>
>>>> That's what I have here:
>>>>
>>>> https://github.com/dsahern/linux/commit/bab42f158c0925339f7519df7fb2cde8eac33aa8
>>>
>>> was wondering what's up with the delay and when are you going to
>>> submit them officially...
>>> The use case came up several times.
>>>
>>
>> I need to find time to come back to that set. As I recall there are a number
>> of outstanding issues:
>>
>> 1. you and Daniel had comments about the bpf_func_proto declarations
>>
>> 2. Jesper had concerns about xdp redirect to any netdev. e.g., How does
>> the lookup know the egress netdev supports xdp? Right now you can try
>> and the packet is dropped if it is not supported.
>>
> 
> There should probably be a tracepoint there if not already. Otherwise
> I think the orchestration/loader layer should be ensuring that xdp
> support is sufficient. I don't think we need anything specific in the
> XDP/BPF code to handle unsupported devices.

ok. I am fine with starting with that.

> 
>> 3. VLAN devices. I suspect these will affect the final bpf function
>> prototype. It would be awkward to have 1 forwarding API for non-vlan
>> devices and a second for vlan devices, hence the need to resolve this
>> before it goes in.
>>
> 
> Interesting. Do we need stacked XDP? I could imagine having 802.1Q
> simply call the lower dev XDP xmit routine. Possibly adding the 8021q
> header first.
> 
> Or alternatively a new dev type could let users query things like
> vlan-id from the dev rather than automatically doing the tagging. I
> suspect though if you forward to a vlan device automatically adding
> the tag is the correct behavior.
> 
> 
>> 4. What about other stacked devices - bonds and bridges - will those
>> just work with the bpf helper? VRF is already handled of course. ;-)
>>
> 
> So if we simply handle this like other stacked devices and call the
> lower devs xdp_xmit routine we should get reasonable behavior. For
> bonds and bridges I guess some generalization is needed though because
> everything at the moment is skb centric. I don't think its necessary
> in the first series though. It can be added later.
> 

For 3 and 4 above I was referring to the route lookup part of it; sorry
for not being clear.

For example, eth1 is enslaved to bond1 which is in VRF red. The lookup
needs to go to the table associated with the VRF. That is not known by
just looking at eth1. The code exists to walk the upper layers and do
the effective translations, just need to cover those cases.

The VLAN part of it is a bit more difficult - ingress device for the
lookup should be eth1.100 for example not eth1, and then if eth1.100 is
enslaved to a VRF, ...

None of it is that complex, just need to walk through the various use
cases and make sure bpf_ipv4_fwd_lookup and bpf_ipv6_fwd_lookup can do
the right thing for these common use cases.

Handling lwt and mpls for example, can certainly be a follow on.
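
(As a rough illustration of resolving the lookup device for these
stacked cases, the hypothetical helper below maps a tagged frame to the
VLAN upper of the ingress device and then to its L3 master (VRF), if
any. The function name and calling convention are made up for this
sketch; it assumes it runs under rcu_read_lock().)

#include <linux/errno.h>
#include <linux/if_vlan.h>
#include <net/l3mdev.h>

static int fwd_lookup_ifindex(struct net_device *dev,
                              __be16 vlan_proto, u16 vlan_id)
{
    struct net_device *lookup_dev = dev;
    int master;

    /* A tagged frame is logically received on the VLAN upper of the
     * physical device, e.g. eth1.100 rather than eth1.
     */
    if (vlan_id) {
        lookup_dev = __vlan_find_dev_deep_rcu(dev, vlan_proto, vlan_id);
        if (!lookup_dev)
            return -ENODEV;
    }

    /* If that device is enslaved to an L3 master (VRF), scope the FIB
     * lookup to the master so the VRF's table is consulted.
     */
    master = l3mdev_master_ifindex_rcu(lookup_dev);
    if (master)
        return master;

    return lookup_dev->ifindex;
}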

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-03 17:00           ` David Ahern
@ 2018-04-03 17:06             ` Alexei Starovoitov
  2018-04-03 17:14               ` David Ahern
  2018-04-03 18:21             ` Jesper Dangaard Brouer
  1 sibling, 1 reply; 20+ messages in thread
From: Alexei Starovoitov @ 2018-04-03 17:06 UTC (permalink / raw)
  To: David Ahern
  Cc: John Fastabend, Md. Islam, netdev, David Miller, stephen,
	agaceph, Pavel Emelyanov, Eric Dumazet, brouer

On Tue, Apr 03, 2018 at 11:00:20AM -0600, David Ahern wrote:
> On 4/3/18 10:41 AM, John Fastabend wrote:
> > On 04/03/2018 08:07 AM, David Ahern wrote:
> >> On 4/2/18 12:16 PM, Alexei Starovoitov wrote:
> >>> On Mon, Apr 02, 2018 at 12:09:44PM -0600, David Ahern wrote:
> >>>> On 4/2/18 12:03 PM, John Fastabend wrote:
> >>>>>
> >>>>> Can the above be a normal BPF helper that returns an
> >>>>> ifindex? Then something roughly like this pattern would
> >>>>> work for all drivers with redirect support,
> >>>>>
> >>>>>
> >>>>>      route_ifindex = ip_route_lookup(__daddr, ....)
> >>>>>      if (!route_ifindex)
> >>>>>            return do_foo()
> >>>>>      return xdp_redirect(route_ifindex);
> >>>>>      
> >>>>> So my suggestion is,
> >>>>>
> >>>>>   1. enable veth xdp (including redirect support)
> >>>>>   2. add a helper to lookup route from routing table
> >>>>>
> >>>>> Alternatively you can skip step (2) and encode the routing
> >>>>> table in BPF directly. Maybe we need a more efficient data
> >>>>> structure but that should also work.
> >>>>>
> >>>>
> >>>> That's what I have here:
> >>>>
> >>>> https://github.com/dsahern/linux/commit/bab42f158c0925339f7519df7fb2cde8eac33aa8
> >>>
> >>> was wondering what's up with the delay and when are you going to
> >>> submit them officially...
> >>> The use case came up several times.
> >>>
> >>
> >> I need to find time to come back to that set. As I recall there are a number
> >> of outstanding issues:
> >>
> >> 1. you and Daniel had comments about the bpf_func_proto declarations
> >>
> >> 2. Jesper had concerns about xdp redirect to any netdev. e.g., How does
> >> the lookup know the egress netdev supports xdp? Right now you can try
> >> and the packet is dropped if it is not supported.
> >>
> > 
> > There should probably be a tracepoint there if not already. Otherwise
> > I think the orchestration/loader layer should be ensuring that xdp
> > support is sufficient. I don't think we need anything specific in the
> > XDP/BPF code to handle unsupported devices.
> 
> ok. I am fine with starting with that.
> 
> > 
> >> 3. VLAN devices. I suspect these will affect the final bpf function
> >> prototype. It would be awkward to have 1 forwarding API for non-vlan
> >> devices and a second for vlan devices, hence the need to resolve this
> >> before it goes in.
> >>
> > 
> > Interesting. Do we need stacked XDP? I could imagine having 802.1Q
> > simply call the lower dev XDP xmit routine. Possibly adding the 8021q
> > header first.
> > 
> > Or alternatively a new dev type could let users query things like
> > vlan-id from the dev rather than automatically doing the tagging. I
> > suspect though if you forward to a vlan device automatically adding
> > the tag is the correct behavior.
> > 
> > 
> >> 4. What about other stacked devices - bonds and bridges - will those
> >> just work with the bpf helper? VRF is already handled of course. ;-)
> >>
> > 
> > So if we simply handle this like other stacked devices and call the
> > lower devs xdp_xmit routine we should get reasonable behavior. For
> > bonds and bridges I guess some generalization is needed though because
> > everything at the moment is skb centric. I don't think its necessary
> > in the first series though. It can be added later.
> > 
> 
> For 3 and 4 above I was referring to the route lookup part of it; sorry
> for not being clear.
> 
> For example, eth1 is enslaved to bond1 which is in VRF red. The lookup
> needs to go to the table associated with the VRF. That is not known by
> just looking at eth1. The code exists to walk the upper layers and do
> the effective translations, just need to cover those cases.
> 
> The VLAN part of it is a bit more difficult - ingress device for the
> lookup should be eth1.100 for example not eth1, and then if eth1.100 is
> enslaved to a VRF, ...
> 
> None of it is that complex, just need to walk through the various use
> cases and make sure bpf_ipv4_fwd_lookup and bpf_ipv6_fwd_lookup can do
> the right thing for these common use cases.

I'm a bit lost here. Why is this a concern?
'index' as argument that bpf prog is passing into the helper.
The clsbpf program may choose to pass ifindex of the netdev
it's attached to or some other one.
In your patch you have:
+BPF_CALL_3(bpf_ipv4_fwd_lookup, int, index, const struct iphdr *, iph,
+	   struct ethhdr *, eth)
+{
+	struct flowi4 fl4 = {
+		.daddr = iph->daddr,
+		.saddr = iph->saddr,
+		.flowi4_iif = index,
+		.flowi4_tos = iph->tos & IPTOS_RT_MASK,
+		.flowi4_scope = RT_SCOPE_UNIVERSE,
+	};

Are you saying the concern is with the .flowi4_iif = index line?
In the above the only thing Daniel and myself pointed out that
passing struct iphdr * like this is not safe.
We either need size argument which would be a bit cumbersome or
extend verifier a little to specify size as part of helper proto,
so that the verifier can enforce it without having the program pass it.
imo that's the only bit missing from that patch to upstream it.

Also the helper isn't really related to XDP. It should work as-is
for clsbpf and xdp programs as far as I can tell.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-03 17:06             ` Alexei Starovoitov
@ 2018-04-03 17:14               ` David Ahern
  2018-04-03 17:37                 ` Alexei Starovoitov
  0 siblings, 1 reply; 20+ messages in thread
From: David Ahern @ 2018-04-03 17:14 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: John Fastabend, Md. Islam, netdev, David Miller, stephen,
	agaceph, Pavel Emelyanov, Eric Dumazet, brouer

On 4/3/18 11:06 AM, Alexei Starovoitov wrote:
>> For 3 and 4 above I was referring to the route lookup part of it; sorry
>> for not being clear.
>>
>> For example, eth1 is enslaved to bond1 which is in VRF red. The lookup
>> needs to go to the table associated with the VRF. That is not known by
>> just looking at eth1. The code exists to walk the upper layers and do
>> the effective translations, just need to cover those cases.
>>
>> The VLAN part of it is a bit more difficult - ingress device for the
>> lookup should be eth1.100 for example not eth1, and then if eth1.100 is
>> enslaved to a VRF, ...
>>
>> None of it is that complex, just need to walk through the various use
>> cases and make sure bpf_ipv4_fwd_lookup and bpf_ipv6_fwd_lookup can do
>> the right thing for these common use cases.
> I'm a bit lost here. Why is this a concern?
> 'index' as argument that bpf prog is passing into the helper.
> The clsbpf program may choose to pass ifindex of the netdev
> it's attached to or some other one.
> In your patch you have:
> +BPF_CALL_3(bpf_ipv4_fwd_lookup, int, index, const struct iphdr *, iph,
> +	   struct ethhdr *, eth)
> +{
> +	struct flowi4 fl4 = {
> +		.daddr = iph->daddr,
> +		.saddr = iph->saddr,
> +		.flowi4_iif = index,
> +		.flowi4_tos = iph->tos & IPTOS_RT_MASK,
> +		.flowi4_scope = RT_SCOPE_UNIVERSE,
> +	};
> 
> Are you saying the concern is with the .flowi4_iif = index line?

yes. BPF / XDP programs are installed on the bottom device ... e.g.,
eth1. The L3 lookup is not necessarily done on that device index.


> In the above the only thing Daniel and myself pointed out that
> passing struct iphdr * like this is not safe.
> We either need size argument which would be a bit cumbersome or
> extend verifier a little to specify size as part of helper proto,
> so that the verifier can enforce it without having the program pass it.
> imo that's the only bit missing from that patch to upstream it.

sure. I did not mean that item 1. was a big deal, just something that
needed to be fixed.

> 
> Also the helper isn't really related to XDP. It should work as-is
> for clsbpf and xdp programs as far as I can tell.
> 

yes.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-03 17:14               ` David Ahern
@ 2018-04-03 17:37                 ` Alexei Starovoitov
  2018-04-04  1:09                   ` David Ahern
  0 siblings, 1 reply; 20+ messages in thread
From: Alexei Starovoitov @ 2018-04-03 17:37 UTC (permalink / raw)
  To: David Ahern
  Cc: John Fastabend, Md. Islam, netdev, David Miller, stephen,
	agaceph, Pavel Emelyanov, Eric Dumazet, brouer

On Tue, Apr 03, 2018 at 11:14:00AM -0600, David Ahern wrote:
> On 4/3/18 11:06 AM, Alexei Starovoitov wrote:
> >> For 3 and 4 above I was referring to the route lookup part of it; sorry
> >> for not being clear.
> >>
> >> For example, eth1 is enslaved to bond1 which is in VRF red. The lookup
> >> needs to go to the table associated with the VRF. That is not known by
> >> just looking at eth1. The code exists to walk the upper layers and do
> >> the effective translations, just need to cover those cases.
> >>
> >> The VLAN part of it is a bit more difficult - ingress device for the
> >> lookup should be eth1.100 for example not eth1, and then if eth1.100 is
> >> enslaved to a VRF, ...
> >>
> >> None of it is that complex, just need to walk through the various use
> >> cases and make sure bpf_ipv4_fwd_lookup and bpf_ipv6_fwd_lookup can do
> >> the right thing for these common use cases.
> > I'm a bit lost here. Why is this a concern?
> > 'index' as argument that bpf prog is passing into the helper.
> > The clsbpf program may choose to pass ifindex of the netdev
> > it's attached to or some other one.
> > In your patch you have:
> > +BPF_CALL_3(bpf_ipv4_fwd_lookup, int, index, const struct iphdr *, iph,
> > +	   struct ethhdr *, eth)
> > +{
> > +	struct flowi4 fl4 = {
> > +		.daddr = iph->daddr,
> > +		.saddr = iph->saddr,
> > +		.flowi4_iif = index,
> > +		.flowi4_tos = iph->tos & IPTOS_RT_MASK,
> > +		.flowi4_scope = RT_SCOPE_UNIVERSE,
> > +	};
> > 
> > Are you saying the concern is with the .flowi4_iif = index line?
> 
> yes. BPF / XDP programs are installed on the bottom device ... e.g.,
> eth1. The L3 lookup is not necessarily done on that device index.

right, but I still don't see any problem with this helper and vlans.
If xdp program passes incorrect ifindex, it's program's mistake.
If clsbpf attached to vlan passed good ifindex, the lookup will
happen in the correct scope, but even in this case the prog
can pass whatever ifindex it wants.

> 
> > In the above the only thing Daniel and myself pointed out that
> > passing struct iphdr * like this is not safe.
> > We either need size argument which would be a bit cumbersome or
> > extend verifier a little to specify size as part of helper proto,
> > so that the verifier can enforce it without having the program pass it.
> > imo that's the only bit missing from that patch to upstream it.
> 
> sure. I did not mean that item 1. was a big deal, just something that
> needed to be fixed.
> 
> > 
> > Also the helper isn't really related to XDP. It should work as-is
> > for clsbpf and xdp programs as far as I can tell.
> > 
> 
> yes.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-03 17:00           ` David Ahern
  2018-04-03 17:06             ` Alexei Starovoitov
@ 2018-04-03 18:21             ` Jesper Dangaard Brouer
  1 sibling, 0 replies; 20+ messages in thread
From: Jesper Dangaard Brouer @ 2018-04-03 18:21 UTC (permalink / raw)
  To: David Ahern
  Cc: John Fastabend, Alexei Starovoitov, Md. Islam, netdev,
	David Miller, stephen, agaceph, Pavel Emelyanov, Eric Dumazet,
	brouer


On Tue, 3 Apr 2018 11:00:20 -0600 David Ahern <dsahern@gmail.com> wrote:

> On 4/3/18 10:41 AM, John Fastabend wrote:
> > On 04/03/2018 08:07 AM, David Ahern wrote:  
> >> On 4/2/18 12:16 PM, Alexei Starovoitov wrote:  
> >>> On Mon, Apr 02, 2018 at 12:09:44PM -0600, David Ahern wrote:  
> >>>> On 4/2/18 12:03 PM, John Fastabend wrote:  
> >>>>>
[...]
> >>>> That's what I have here:
> >>>>
> >>>> https://github.com/dsahern/linux/commit/bab42f158c0925339f7519df7fb2cde8eac33aa8  
> >>>
> >>> was wondering what's up with the delay and when are you going to
> >>> submit them officially...

I'm also eager to see these go upstream! :-)


[...]
> >>
> >> 2. Jesper had concerns about xdp redirect to any netdev. e.g., How does
> >> the lookup know the egress netdev supports xdp? Right now you can try
> >> and the packet is dropped if it is not supported.
> >>  
> > 
> > There should probably be a tracepoint there if not already. Otherwise
> > I think the orchestration/loader layer should be ensuring that xdp
> > support is sufficient. I don't think we need anything specific in the
> > XDP/BPF code to handle unsupported devices.  
> 
> ok. I am fine with starting with that.

David, this concern was not related to your code.  Your code needs to
stay generic, to be usable from both XDP and clsact.  As John writes,
the error cases can be debugged through the XDP tracepoints.

My request/concern was actually performance related.  I wanted to add a
flag to the helper, to allow for a later optimization.  Today, we have
pushed the responsibility of knowing whether an ifindex is XDP-xmit
capable onto the BPF program(mer).  This implies that the BPF prog needs
extra code/a map lookup to validate the ifindex returned from your
helper.  Later, we might get some XDP feature bits, and a flag could
specify to only return the looked-up device if XDP-xmit is enabled,
simplifying the work needed in the bpf prog.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-03 17:37                 ` Alexei Starovoitov
@ 2018-04-04  1:09                   ` David Ahern
  0 siblings, 0 replies; 20+ messages in thread
From: David Ahern @ 2018-04-04  1:09 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: John Fastabend, Md. Islam, netdev, David Miller, stephen,
	agaceph, Pavel Emelyanov, Eric Dumazet, brouer

On 4/3/18 11:37 AM, Alexei Starovoitov wrote:
> On Tue, Apr 03, 2018 at 11:14:00AM -0600, David Ahern wrote:
>> On 4/3/18 11:06 AM, Alexei Starovoitov wrote:
>>>> For 3 and 4 above I was referring to the route lookup part of it; sorry
>>>> for not being clear.
>>>>
>>>> For example, eth1 is enslaved to bond1 which is in VRF red. The lookup
>>>> needs to go to the table associated with the VRF. That is not known by
>>>> just looking at eth1. The code exists to walk the upper layers and do
>>>> the effective translations, just need to cover those cases.
>>>>
>>>> The VLAN part of it is a bit more difficult - ingress device for the
>>>> lookup should be eth1.100 for example not eth1, and then if eth1.100 is
>>>> enslaved to a VRF, ...
>>>>
>>>> None of it is that complex, just need to walk through the various use
>>>> cases and make sure bpf_ipv4_fwd_lookup and bpf_ipv6_fwd_lookup can do
>>>> the right thing for these common use cases.
>>> I'm a bit lost here. Why this is a concern?
>>> 'index' as argument that bpf prog is passing into the helper.
>>> The clsbpf program may choose to pass ifindex of the netdev
>>> it's attached to or some other one.
>>> In your patch you have:
>>> +BPF_CALL_3(bpf_ipv4_fwd_lookup, int, index, const struct iphdr *, iph,
>>> +	   struct ethhdr *, eth)
>>> +{
>>> +	struct flowi4 fl4 = {
>>> +		.daddr = iph->daddr,
>>> +		.saddr = iph->saddr,
>>> +		.flowi4_iif = index,
>>> +		.flowi4_tos = iph->tos & IPTOS_RT_MASK,
>>> +		.flowi4_scope = RT_SCOPE_UNIVERSE,
>>> +	};
>>>
>>> As you saying there is concern with .flowi4_iif = index line ?
>>
>> yes. BPF / XDP programs are installed on the bottom device ... e.g.,
>> eth1. The L3 lookup is not necessarily done on that device index.
> 
> right, but I still don't see any problem with this helper and vlans.
> If xdp program passes incorrect ifindex, it's program's mistake.
> If clsbpf attached to vlan passed good ifindex, the lookup will
> happen in the correct scope, but even in this case the prog
> can pass whatever ifindex it wants.
> 

I'll find some time to update the bpf forwarding helpers and look at
these other cases in the next few weeks.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-02  0:47 [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel Md. Islam
  2018-04-02 16:51 ` Stephen Hemminger
  2018-04-02 18:03 ` John Fastabend
@ 2018-04-04  1:16 ` David Ahern
  2018-04-04  3:15   ` Md. Islam
  2018-04-04  6:16 ` Jesper Dangaard Brouer
  3 siblings, 1 reply; 20+ messages in thread
From: David Ahern @ 2018-04-04  1:16 UTC (permalink / raw)
  To: Md. Islam, netdev, David Miller, stephen, agaceph,
	Pavel Emelyanov, Eric Dumazet, alexei.starovoitov, brouer

On 4/1/18 6:47 PM, Md. Islam wrote:
> This patch implements IPv4 forwarding on xdp_buff. I added a new
> config option XDP_ROUTER. The kernel forwards packets through a fast
> path when this option is enabled, but this requires driver support.
> Currently it only works with veth. Here I have modified veth such that
> it outputs xdp_buff. I created a testbed in Mininet. The Mininet
> script (topology.py) is attached. Here the topology is:
> 
> h1 -----r1-----h2 (r1 acts as a router)
> 
> This patch improves the throughput from 53.8Gb/s to 60Gb/s on my
> machine. Median RTT also improved from around .055 ms to around .035
> ms.
> 
> Then I disabled hyperthreading and cpu frequency scaling in order to
> utilize CPU cache (DPDK also utilizes CPU cache to improve
> forwarding). This further improves per-packet forwarding latency from
> around 400ns to 200 ns. More specifically, header parsing and fib
> lookup only takes around 82 ns. This shows that this could be used to
> implement linerate packet forwarding in kernel.
> 
> The patch has been generated on 4.15.0+. Please let me know your
> feedback and suggestions. Please feel free to let me know if this
> approach makes sense.

This patch is not really using eBPF and XDP but rather trying to
shortcircuit forwarding through a veth pair.

Have you looked at the loss in performance with this config enabled if
there is no r1? i.e., h1 {veth1}  <---> {veth2} / h2. You are adding a
lookup per-packet to the Tx path.

Have you looked at what I would consider a more interesting use case of
packets into a node and delivered to a namespace via veth?

   +--------------------------+---------------
   | Host                     | container
   |                          |
   |        +-------{ veth1 }-|-{veth2}----
   |       |                  |
   +----{ eth1 }------------------

Can xdp / bpf on eth1 be used to speed up delivery to the container?

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-04  1:16 ` David Ahern
@ 2018-04-04  3:15   ` Md. Islam
  2018-04-06  2:55     ` David Ahern
  0 siblings, 1 reply; 20+ messages in thread
From: Md. Islam @ 2018-04-04  3:15 UTC (permalink / raw)
  To: David Ahern
  Cc: netdev, David Miller, Stephen Hemminger, Anton Gary Ceph,
	Pavel Emelyanov, Eric Dumazet, alexei.starovoitov, brouer

On Tue, Apr 3, 2018 at 9:16 PM, David Ahern <dsahern@gmail.com> wrote:
> On 4/1/18 6:47 PM, Md. Islam wrote:
>> This patch implements IPv4 forwarding on xdp_buff. I added a new
>> config option XDP_ROUTER. The kernel forwards packets through a fast
>> path when this option is enabled, but this requires driver support.
>> Currently it only works with veth. Here I have modified veth such that
>> it outputs xdp_buff. I created a testbed in Mininet. The Mininet
>> script (topology.py) is attached. Here the topology is:
>>
>> h1 -----r1-----h2 (r1 acts as a router)
>>
>> This patch improves the throughput from 53.8Gb/s to 60Gb/s on my
>> machine. Median RTT also improved from around .055 ms to around .035
>> ms.
>>
>> Then I disabled hyperthreading and cpu frequency scaling in order to
>> utilize CPU cache (DPDK also utilizes CPU cache to improve
>> forwarding). This further improves per-packet forwarding latency from
>> around 400ns to 200 ns. More specifically, header parsing and fib
>> lookup only takes around 82 ns. This shows that this could be used to
>> implement linerate packet forwarding in kernel.
>>
>> The patch has been generated on 4.15.0+. Please let me know your
>> feedback and suggestions. Please feel free to let me know if this
>> approach makes sense.
>
> This patch is not really using eBPF and XDP but rather trying to
> shortcircuit forwarding through a veth pair.
>
> Have you looked at the loss in performance with this config enabled if
> there is no r1? i.e., h1 {veth1}  <---> {veth2} / h2. You are adding a
> lookup per-packet to the Tx path.

Yes, it works. If there is no r1, it falls back to dev_forward_skb()
to forward the packet. My main objective here was to implement the
router datapath. Personally I feel it should be part of the kernel
rather than an eBPF program. But I am still looking forward to your
patch and performance numbers.

>
> Have you looked at what I would consider a more interesting use case of
> packets into a node and delivered to a namespace via veth?
>
>    +--------------------------+---------------
>    | Host                     | container
>    |                          |
>    |        +-------{ veth1 }-|-{veth2}----
>    |       |                  |
>    +----{ eth1 }------------------
>
> Can xdp / bpf on eth1 be used to speed up delivery to the container?

I didn't consider that, but it sounds like an important use case. How
do we determine which namespace gets the packet?

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-02  0:47 [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel Md. Islam
                   ` (2 preceding siblings ...)
  2018-04-04  1:16 ` David Ahern
@ 2018-04-04  6:16 ` Jesper Dangaard Brouer
  2018-04-04 21:09   ` Md. Islam
  3 siblings, 1 reply; 20+ messages in thread
From: Jesper Dangaard Brouer @ 2018-04-04  6:16 UTC (permalink / raw)
  To: Md. Islam
  Cc: netdev, David Miller, David Ahern, stephen, agaceph,
	Pavel Emelyanov, Eric Dumazet, alexei.starovoitov, brouer


On Sun, 1 Apr 2018 20:47:28 -0400 "Md. Islam" <mislam4@kent.edu> wrote:

> [...] More specifically, header parsing and fib
> lookup only takes around 82 ns. This shows that this could be used to
> implement linerate packet forwarding in kernel.

I cannot resist correcting you...

You didn't specify the link speed, but assuming 10Gbit/s, the linerate
is 14.88Mpps, which is 67.2 ns between arriving packets. Thus, if the
lookup alone costs 82 ns, you cannot claim linerate performance with
these numbers.


Details:

This is calculated based on the minimum Ethernet frame size of 84 bytes
on the wire; see https://en.wikipedia.org/wiki/Ethernet_frame for why
this is the minimum size.

10*10^9/(84*8) = 14,880,952 pps
1/last*10^9    = 67.2 ns
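
(If you want to play with the numbers, a tiny stand-alone program like
the one below reproduces them; the link speed and wire size are just
the assumptions stated above, nothing from the patch.)

#include <stdio.h>

int main(void)
{
	double link_bps = 10e9;    /* assumed 10 Gbit/s link */
	double wire_bytes = 84;    /* 64B min frame + 8B preamble + 12B IFG */
	double pps = link_bps / (wire_bytes * 8);

	/* prints: 14880952 pps, 67.2 ns per packet */
	printf("%.0f pps, %.1f ns per packet\n", pps, 1e9 / pps);
	return 0;
}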

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-04  6:16 ` Jesper Dangaard Brouer
@ 2018-04-04 21:09   ` Md. Islam
  0 siblings, 0 replies; 20+ messages in thread
From: Md. Islam @ 2018-04-04 21:09 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, David Miller, David Ahern, Stephen Hemminger,
	Anton Gary Ceph, Pavel Emelyanov, Eric Dumazet,
	alexei.starovoitov

On Wed, Apr 4, 2018 at 2:16 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
>
> On Sun, 1 Apr 2018 20:47:28 -0400 "Md. Islam" <mislam4@kent.edu> wrote:
>
>> [...] More specifically, header parsing and fib
>> lookup only takes around 82 ns. This shows that this could be used to
>> implement linerate packet forwarding in kernel.
>
> I cannot resist correcting you...
>
> You didn't specify the link speed, but assuming 10Gbit/s, then the
> linerate is 14.88Mpps, which is 67.2 ns between arriving packets. Thus,
> if the lookup cost is 82 ns, thus you cannot claim linerate performance
> with these numbers.
>
>
> Details:
>
> This is calculated based on the the minimum Ethernet frame size
> 84-bytes, see https://en.wikipedia.org/wiki/Ethernet_frame for why this
> is the minimum size.
>
> 10*10^9/(84*8) = 14,880,952 pps
> 1/last*10^9    = 67.2 ns
>

Yes, it's not actually line-rate forwarding yet, but it shows the
intent towards that. Currently fib_table_lookup() does many things
that can be simplified for a dedicated router: fib_get_table() and
FIB_RES_DEV() become much simpler if IP_ROUTE_MULTIPATH and
IP_MULTIPLE_TABLES are disabled. We can increase throughput by doing
less :-) Moreover, if a network mostly carries larger packets (for
instance, a network used exclusively for video streaming), then a
40Gb NIC produces a packet roughly every 300 ns:

40*10^9/(1500*8) = 3,333,333 pps (~3.3 Mpps)
1/last*10^9      = 300 ns
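
To make that concrete, here is a minimal sketch of what I have in mind
for the trimmed-down per-packet lookup. This is not the code in the
patch: the helper name is made up, and it assumes the caller holds
rcu_read_lock() and has already validated the IP header.

#include <linux/ip.h>
#include <linux/netdevice.h>
#include <net/ip_fib.h>
#include <net/route.h>

/* With IP_MULTIPLE_TABLES and IP_ROUTE_MULTIPATH disabled there is a
 * single main table and a single nexthop, so the whole lookup shrinks
 * to one fib_table_lookup() call.
 */
static struct net_device *xdp_router_nexthop_dev(struct net_device *dev,
						 const struct iphdr *iph)
{
	struct flowi4 fl4 = {
		.daddr		= iph->daddr,
		.saddr		= iph->saddr,
		.flowi4_tos	= iph->tos,
	};
	struct fib_table *tb = fib_get_table(dev_net(dev), RT_TABLE_MAIN);
	struct fib_result res;

	if (!tb || fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF))
		return NULL;	/* no route: fall back to the slow path */

	if (res.type != RTN_UNICAST)
		return NULL;	/* local/broadcast etc. are not fast-pathed */

	return FIB_RES_DEV(res); /* single nexthop when multipath is off */
}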

> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-04  3:15   ` Md. Islam
@ 2018-04-06  2:55     ` David Ahern
  2018-04-10  4:27       ` Md. Islam
  0 siblings, 1 reply; 20+ messages in thread
From: David Ahern @ 2018-04-06  2:55 UTC (permalink / raw)
  To: Md. Islam
  Cc: netdev, David Miller, Stephen Hemminger, Anton Gary Ceph,
	Pavel Emelyanov, Eric Dumazet, alexei.starovoitov, brouer

On 4/3/18 9:15 PM, Md. Islam wrote:
>> Have you looked at what I would consider a more interesting use case of
>> packets into a node and delivered to a namespace via veth?
>>
>>    +--------------------------+---------------
>>    | Host                     | container
>>    |                          |
>>    |        +-------{ veth1 }-|-{veth2}----
>>    |       |                  |
>>    +----{ eth1 }------------------
>>
>> Can xdp / bpf on eth1 be used to speed up delivery to the container?
> 
> I didn't consider that, but it sounds like an important use case. How
> do we determine which namespace gets the packet?
> 

FIB lookups, of course. Starting with my patch set that handles
forwarding on eth1, what is needed for XDP with veth? i.e., a program
on eth1 does the FIB lookup and redirects the packet to veth1 for Tx.
veth's ndo_xdp_xmit then knows the packet needs to be forwarded to
veth2 internally, and there is no skb allocated for the packet yet.
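
Roughly, such an eth1 program could look like the sketch below. It
assumes the FIB lookup helper from that patch set; the program name,
includes and error handling are illustrative only, not a tested
implementation.

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#ifndef AF_INET
#define AF_INET 2
#endif

SEC("xdp")
int xdp_fwd_to_container(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct bpf_fib_lookup fib = {};
	struct iphdr *iph;

	if (data + sizeof(*eth) + sizeof(*iph) > data_end)
		return XDP_PASS;
	if (eth->h_proto != bpf_htons(ETH_P_IP))
		return XDP_PASS;

	iph = data + sizeof(*eth);

	fib.family   = AF_INET;
	fib.ipv4_src = iph->saddr;
	fib.ipv4_dst = iph->daddr;
	fib.ifindex  = ctx->ingress_ifindex;

	/* The FIB decides whether the destination sits behind veth1
	 * (i.e. in the container) or stays on the host.
	 */
	if (bpf_fib_lookup(ctx, &fib, sizeof(fib), 0) !=
	    BPF_FIB_LKUP_RET_SUCCESS)
		return XDP_PASS;

	/* Rewrite the MACs and hand the frame to veth1; its ndo_xdp_xmit
	 * delivers it to the peer (veth2) in the namespace.
	 */
	__builtin_memcpy(eth->h_dest, fib.dmac, ETH_ALEN);
	__builtin_memcpy(eth->h_source, fib.smac, ETH_ALEN);
	return bpf_redirect(fib.ifindex, 0);
}

char _license[] SEC("license") = "GPL";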

* Re: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel
  2018-04-06  2:55     ` David Ahern
@ 2018-04-10  4:27       ` Md. Islam
  0 siblings, 0 replies; 20+ messages in thread
From: Md. Islam @ 2018-04-10  4:27 UTC (permalink / raw)
  To: David Ahern
  Cc: netdev, David Miller, Stephen Hemminger, Anton Gary Ceph,
	Pavel Emelyanov, Eric Dumazet, alexei.starovoitov,
	Jesper Dangaard Brouer

Gotcha, I'm working on it. I've created a function that builds an
sk_buff from an xdp_buff, but I'm still getting an error when the
sk_buff is processed by TCP. I will send you the patch once I'm done.
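
The idea is roughly the sketch below. This is not the actual
work-in-progress function: it assumes the xdp_buff data was kmalloc'ed
so build_skb() can take ownership of it, and that the head/tailroom
are sufficient.

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/filter.h>	/* struct xdp_buff */

static struct sk_buff *xdp_buff_to_skb(struct xdp_buff *xdp,
				       struct net_device *dev)
{
	int headroom = xdp->data - xdp->data_hard_start;
	int len = xdp->data_end - xdp->data;
	struct sk_buff *skb;

	/* frag_size == 0: buffer came from kmalloc(), not a page frag */
	skb = build_skb(xdp->data_hard_start, 0);
	if (!skb)
		return NULL;

	skb_reserve(skb, headroom);
	skb_put(skb, len);

	/* Sets skb->dev and skb->protocol and pulls the Ethernet header,
	 * so the result can be fed to netif_receive_skb() / the IP stack.
	 */
	skb->protocol = eth_type_trans(skb, dev);

	return skb;
}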

Thanks!

On Thu, Apr 5, 2018 at 10:55 PM, David Ahern <dsahern@gmail.com> wrote:
> On 4/3/18 9:15 PM, Md. Islam wrote:
>>> Have you looked at what I would consider a more interesting use case of
>>> packets into a node and delivered to a namespace via veth?
>>>
>>>    +--------------------------+---------------
>>>    | Host                     | container
>>>    |                          |
>>>    |        +-------{ veth1 }-|-{veth2}----
>>>    |       |                  |
>>>    +----{ eth1 }------------------
>>>
>>> Can xdp / bpf on eth1 be used to speed up delivery to the container?
>>
>> I didn't consider that, but it sounds like an important use case. How
>> do we determine which namespace gets the packet?
>>
>
> FIB lookups of course. Starting with my patch set that handles
> forwarding on eth1, what is needed for XDP with veth? ie., a program on
> eth1 does the lookup and redirects the packet to veth1 for Tx.
> ndo_xdp_xmit for veth knows the packet needs to be forwarded to veth2
> internally and there is no skb allocated for the packet yet.



-- 
Tamim
PhD Candidate,
Kent State University
http://web.cs.kent.edu/~mislam4/

end of thread, other threads:[~2018-04-10  4:27 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-04-02  0:47 [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel Md. Islam
2018-04-02 16:51 ` Stephen Hemminger
2018-04-02 18:03 ` John Fastabend
2018-04-02 18:09   ` David Ahern
2018-04-02 18:16     ` Alexei Starovoitov
2018-04-03 15:07       ` David Ahern
2018-04-03 16:41         ` John Fastabend
2018-04-03 16:45           ` David Miller
2018-04-03 17:00           ` David Ahern
2018-04-03 17:06             ` Alexei Starovoitov
2018-04-03 17:14               ` David Ahern
2018-04-03 17:37                 ` Alexei Starovoitov
2018-04-04  1:09                   ` David Ahern
2018-04-03 18:21             ` Jesper Dangaard Brouer
2018-04-04  1:16 ` David Ahern
2018-04-04  3:15   ` Md. Islam
2018-04-06  2:55     ` David Ahern
2018-04-10  4:27       ` Md. Islam
2018-04-04  6:16 ` Jesper Dangaard Brouer
2018-04-04 21:09   ` Md. Islam
