All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC net-next] net/fib: Poptrie based FIB lookup
@ 2018-08-27  2:28 Md. Islam
  2018-08-27 16:24 ` Stephen Hemminger
  0 siblings, 1 reply; 5+ messages in thread
From: Md. Islam @ 2018-08-27  2:28 UTC (permalink / raw)
  To: Netdev, David Miller, David Ahern, Eric Dumazet,
	Alexey Kuznetsov, Stephen Hemminger, makita.toshiaki, panda,
	yasuhiro.ohara, john.fastabend, alexei.starovoitov

This patch implements Poptrie [1] based FIB lookup. It exhibits pretty
impressive lookup performance compared to LC-trie. This poptrie
implementation however somewhat deviates from the original
implementation [2]. I tested this patch very rigorously with several
FIB tables containing half a million routes. I got same result as
LC-trie based fib_lookup().

Poptrie is intended to work in conjunction with LC-trie (not replace
it). It is primarily designed to overcome many issues of TCAM based
router [1]. It [1] shows that the Poptrie can achieve very impressive
lookup performance on CPU. This patch will mainly be used by XDP
forwarding.

1. Asai, Hirochika, and Yasuhiro Ohara. "Poptrie: A compressed trie
with population count for fast and scalable software IP routing table
lookup." ACM SIGCOMM Computer Communication Review. 2015.

2. https://github.com/pixos/poptrie

>From c5e05ea66b06eb9313749bc8969b4c2798fcf96a Mon Sep 17 00:00:00 2001
From: tamimcse <tamim@csebuet.org>
Date: Sun, 26 Aug 2018 21:12:38 -0400
Subject: [PATCH] Implented Poptrie

Signed-off-by: tamimcse <tamim@csebuet.org>
---
 include/net/ip_fib.h   |  40 +++++++
 net/ipv4/Makefile      |   2 +-
 net/ipv4/fib_poptrie.c | 295 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/fib_trie.c    |   3 +
 4 files changed, 339 insertions(+), 1 deletion(-)
 create mode 100644 net/ipv4/fib_poptrie.c

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 81d0f21..c4374a1 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -197,6 +197,37 @@ struct fib_entry_notifier_info {
     u32 tb_id;
 };

+/*Maximum number of next-hop*/
+#define NEXT_HOP_MAX 255
+
+struct next_hops {
+    struct net_device    *netdev_arr[NEXT_HOP_MAX];
+    /*Total number of next-hops*/
+    u8 count;
+};
+
+struct poptrie_node {
+    u64 vector;
+    u64 leafvec;
+    u64 nodevec;
+    struct poptrie_node *chield_nodes;
+    u8 *leaves;
+    u8 *prefixes;
+};
+
+struct poptrie {
+    char    def_nh;
+    struct next_hops    nhs;
+    struct poptrie_node *root;
+    spinlock_t            lock;
+};
+
+void poptrie_insert(struct poptrie *pt, u32 key,
+        u8 prefix_len, struct net_device *dev);
+void poptrie_lookup(struct poptrie *pt, __be32 dest,
+        struct net_device **dev);
+
+
 struct fib_nh_notifier_info {
     struct fib_notifier_info info; /* must be first */
     struct fib_nh *fib_nh;
@@ -219,6 +250,7 @@ struct fib_table {
     int            tb_num_default;
     struct rcu_head        rcu;
     unsigned long         *tb_data;
+    struct poptrie    pt;
     unsigned long        __data[0];
 };

@@ -268,6 +300,14 @@ static inline int fib_lookup(struct net *net,
const struct flowi4 *flp,
     rcu_read_lock();

     tb = fib_get_table(net, RT_TABLE_MAIN);
+
+    /*Testing poptrie_lookup*/
+    if (tb && tb->pt.root) {
+        struct net_device *dev;
+
+        poptrie_lookup(&tb->pt, flp->daddr, &dev);
+    }
+
     if (tb)
         err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF);

diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index b379520..b1246d2 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -14,7 +14,7 @@ obj-y     := route.o inetpeer.o protocol.o \
          udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
          fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
          inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
-         metrics.o
+         metrics.o fib_poptrie.o

 obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
diff --git a/net/ipv4/fib_poptrie.c b/net/ipv4/fib_poptrie.c
new file mode 100644
index 0000000..b3a88ab
--- /dev/null
+++ b/net/ipv4/fib_poptrie.c
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation; either version
+ *   2 of the License, or (at your option) any later version.
+ *
+ * Author: MD Iftakharul Islam (Tamim) <mislam4@kent.edu>.
+ *
+ * Asai, Hirochika, and Yasuhiro Ohara. "Poptrie: A compressed trie
+ * with population count for fast and scalable software IP routing
+ * table lookup." ACM SIGCOMM Computer Communication Review. 2015.
+ *
+ */
+
+#include <net/ip_fib.h>
+
+/*Get next-hop index from next-hop*/
+static u8 get_fib_index(struct next_hops *nhs, struct net_device *dev)
+{
+    u8 i;
+
+    for (i = 0; i < nhs->count; i++) {
+        if (nhs->netdev_arr[i] == dev)
+            return i;
+    }
+    nhs->netdev_arr[nhs->count++] = dev;
+    return nhs->count - 1;
+}
+
+/*Converts next-hop index into actual next-hop*/
+static struct net_device *get_fib(struct next_hops *nhs, u8 fib_index)
+{
+    return nhs->netdev_arr[fib_index];
+}
+
+/*Extracts 6 bytes from key starting from offset*/
+static inline u32 extract(u32 key, int offset)
+{
+    if (likely(offset < 26))
+        return (key >> (26 - offset)) & 63;
+    else
+        return (key << 4) & 63;
+}
+
+/*Set FIB index and prefix length to a leaf*/
+static void set_fib_index(struct poptrie_node *node,
+        unsigned long leaf_index, char fib_index, char prefix_len)
+{
+    node->leaves[leaf_index] = fib_index;
+    node->prefixes[leaf_index] = prefix_len;
+}
+
+/*Insert a leaf at index*/
+static bool insert_leaf(struct poptrie_node *node,
+        char index, char fib_index, char prefix_len)
+{
+    int i, j;
+    char *leaves;
+    char *prefixes;
+    int size = (int)hweight64(node->leafvec);
+
+    if (index > size) {
+        pr_err("Index needs to be smaller or equal to size");
+        return false;
+    }
+
+    leaves = kcalloc(size + 1, sizeof(*leaves), GFP_ATOMIC);
+    prefixes = kcalloc(size + 1, sizeof(*prefixes), GFP_ATOMIC);
+
+    for (i = 0, j = 0; i < (size + 1); i++) {
+        if (i == index) {
+            leaves[i] = fib_index;
+            prefixes[i] = prefix_len;
+        } else {
+            leaves[i] = node->leaves[j];
+            prefixes[i] = node->prefixes[j];
+            j++;
+        }
+    }
+
+    kfree(node->leaves);
+    kfree(node->prefixes);
+    node->leaves = leaves;
+    node->prefixes = prefixes;
+    return true;
+}
+
+/*Insert a new node at index*/
+static void insert_chield_node(struct poptrie_node *node,
+        char index)
+{
+    int i, j;
+    struct poptrie_node *arr;
+    int arr_size  = (int)hweight64(node->nodevec);
+
+    arr = kcalloc(arr_size + 1, sizeof(*arr), GFP_ATOMIC);
+    for (i = 0, j = 0; i < (arr_size + 1); i++) {
+        if (i != index && j < arr_size)
+            arr[i] = node->chield_nodes[j++];
+    }
+
+    kfree(node->chield_nodes);
+    node->chield_nodes = arr;
+}
+
+void poptrie_insert(struct poptrie *pt, u32 key,
+        u8 prefix_len, struct net_device *dev)
+{
+    int offset, i;
+    u32 index;
+    u8 consecutive_leafs;
+    u64 bitmap;
+    u64 bitmap_hp;
+    int arr_size;
+    unsigned long chield_index;
+    unsigned long leaf_index, prev_leaf_index;
+    unsigned long index_hp;
+    struct poptrie_node *node;
+    u8 prev_fib_index, prev_prefix_len;
+    u8 fib_index = get_fib_index(&pt->nhs, dev);
+
+    spin_lock(&pt->lock);
+
+    if (!pt->root)
+        pt->root = kzalloc(sizeof(*pt->root), GFP_ATOMIC);
+
+    /* Default route */
+    if (prefix_len == 0) {
+        pt->def_nh = fib_index;
+        goto finish;
+    }
+
+    /*Iterate through the nodes*/
+    offset = 0;
+    node = pt->root;
+    while (prefix_len > (offset + 6)) {
+        index = extract(key, offset);
+        bitmap = 1ULL << index;
+        chield_index = hweight64(node->nodevec & (bitmap - 1));
+
+        /*No node for this index, so need to insert a node*/
+        if (!(node->nodevec & bitmap)) {
+            insert_chield_node(node, chield_index);
+            node->nodevec |= bitmap;
+        }
+        node = &node->chield_nodes[chield_index];
+        offset += 6;
+    }
+
+    /*Now need to insert a leaf*/
+
+    index = extract(key, offset);
+    bitmap = 1ULL << index;
+    consecutive_leafs = 1 << (offset + 6 - prefix_len);
+
+    if (node->vector & bitmap && node->leafvec & bitmap) {
+        /*A leaf already exist for this index, so update the existing leaf*/
+        leaf_index = hweight64(node->leafvec & (bitmap - 1));
+        arr_size = (int)hweight64(node->leafvec);
+        if (leaf_index >= arr_size)
+            goto error;
+        /*Ignore the prefix*/
+        if (node->prefixes[leaf_index] > prefix_len) {
+            goto finish;
+        } else if (node->prefixes[leaf_index] == prefix_len) {
+            set_fib_index(node, leaf_index, fib_index, prefix_len);
+        } else {
+            /*hole punching*/
+            bitmap_hp = bitmap << consecutive_leafs;
+            if (!(node->leafvec & bitmap_hp)) {
+                index_hp = hweight64(node->leafvec & (bitmap_hp - 1)) - 1;
+                if (node->prefixes[index_hp] <= prefix_len) {
+                    insert_leaf(node, index_hp, fib_index, prefix_len);
+                    node->leafvec |= bitmap_hp;
+                }
+
+                for (i = leaf_index; i < index_hp ; i++) {
+                    if (node->prefixes[i] <= prefix_len)
+                        set_fib_index(node, i, fib_index, prefix_len);
+                }
+            } else {
+                index_hp = hweight64(node->leafvec & (bitmap_hp - 1)) - 1;
+                for (i = leaf_index; i <= index_hp ; i++) {
+                    if (node->prefixes[i] <= prefix_len)
+                        set_fib_index(node, i, fib_index, prefix_len);
+                }
+            }
+        }
+    } else if (!(node->vector & bitmap)) {
+        /*No leaf for this index, so need to insert a leaf*/
+        leaf_index = hweight64(node->leafvec & (bitmap - 1));
+        insert_leaf(node, leaf_index, fib_index, prefix_len);
+        node->leafvec |= bitmap;
+    } else if (node->vector & bitmap && !(node->leafvec & bitmap)) {
+        /*There is a leaf for this index created by another
+         *  prefix with smaller length
+         */
+        prev_leaf_index = hweight64(node->leafvec & (bitmap - 1)) - 1;
+        arr_size = (int)hweight64(node->leafvec);
+        if (prev_leaf_index >= arr_size)
+            goto error;
+        if (node->prefixes[prev_leaf_index] <= prefix_len) {
+            insert_leaf(node, prev_leaf_index + 1, fib_index, prefix_len);
+            node->leafvec |= bitmap;
+        }
+
+        /*hole punching*/
+        prev_fib_index = node->leaves[prev_leaf_index];
+        prev_prefix_len = node->prefixes[prev_leaf_index];
+
+        bitmap_hp = bitmap << consecutive_leafs;
+        if (!(node->leafvec & bitmap_hp)) {
+            index_hp = hweight64(node->leafvec & (bitmap_hp - 1)) - 1;
+            if (node->prefixes[index_hp] <= prefix_len) {
+                if (prev_leaf_index < 0)
+                    goto error;
+                insert_leaf(node, index_hp + 1,
+                        prev_fib_index, prev_prefix_len);
+                node->leafvec |= bitmap_hp;
+            }
+        }
+
+        for (i = 2; i < consecutive_leafs; i++) {
+            bitmap_hp = bitmap << (i - 1);
+            if (node->leafvec & bitmap_hp) {
+                index_hp = hweight64(node->leafvec & (bitmap_hp - 1)) - 1;
+                insert_leaf(node, index_hp + 1,
+                        fib_index, prefix_len);
+                node->leafvec |= bitmap_hp;
+            }
+        }
+    }
+
+    if (consecutive_leafs > 1)
+        node->vector |= ((1ULL << consecutive_leafs) - 1) << index;
+    else
+        node->vector |= bitmap;
+
+    goto finish;
+
+error:
+    pr_err("Something is very wrong !!!!");
+finish:
+    spin_unlock(&pt->lock);
+}
+
+/*We assume that pt->root is not NULL*/
+void poptrie_lookup(struct poptrie *pt, __be32 dest, struct net_device **dev)
+{
+    register u32 index;
+    register u64 bitmap, bitmask;
+    register unsigned long leaf_index;
+    register unsigned long node_index;
+    register struct poptrie_node *node = pt->root;
+    register u8 fib_index = pt->def_nh;
+    register u8 carry = 0;
+    register u8 carry_bit = 2;
+
+    while (1) {
+        /*Extract 6 bytes from dest */
+        if (likely(carry_bit != 8)) {
+            index = ((dest & 252) >> carry_bit) | carry;
+            carry = (dest & ((1 << carry_bit) - 1)) << (6 - carry_bit);
+            carry_bit = carry_bit + 2;
+            dest = dest >> 8;
+        } else {
+            index = carry;
+            carry = 0;
+            carry_bit = 2;
+        }
+
+        /*Create a bitmap based on the the extracted value*/
+        bitmap = 1ULL << index;
+        bitmask = bitmap - 1;
+
+        /*Find corresponding leaf*/
+        if (likely(node->vector & bitmap)) {
+            leaf_index = hweight64(node->leafvec & bitmask);
+            if (!(node->leafvec & bitmap))
+                leaf_index--;
+            fib_index = node->leaves[leaf_index];
+        }
+
+        /*Find corresponding node*/
+        if (likely(node->nodevec & bitmap)) {
+            node_index = hweight64(node->nodevec & bitmask);
+            node = &node->chield_nodes[node_index];
+            continue;
+        }
+
+        *dev = get_fib(&pt->nhs, fib_index);
+        return;
+    }
+}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3dcffd3..0509a24 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1280,6 +1280,9 @@ int fib_table_insert(struct net *net, struct
fib_table *tb,
     if (err)
         goto out_fib_notif;

+    /*This should be done when Poptrie is enabled from CONFIG*/
+    poptrie_insert(&tb->pt, key, plen, fi->fib_dev);
+
     if (!plen)
         tb->tb_num_default++;

-- 
2.7.4

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH RFC net-next] net/fib: Poptrie based FIB lookup
  2018-08-27  2:28 [PATCH RFC net-next] net/fib: Poptrie based FIB lookup Md. Islam
@ 2018-08-27 16:24 ` Stephen Hemminger
  2018-08-27 16:56   ` David Ahern
  2018-08-27 22:29   ` Md. Islam
  0 siblings, 2 replies; 5+ messages in thread
From: Stephen Hemminger @ 2018-08-27 16:24 UTC (permalink / raw)
  To: Md. Islam
  Cc: Netdev, David Miller, David Ahern, Eric Dumazet,
	Alexey Kuznetsov, makita.toshiaki, panda, yasuhiro.ohara,
	john.fastabend, alexei.starovoitov

On Sun, 26 Aug 2018 22:28:48 -0400
"Md. Islam" <mislam4@kent.edu> wrote:

> This patch implements Poptrie [1] based FIB lookup. It exhibits pretty
> impressive lookup performance compared to LC-trie. This poptrie
> implementation however somewhat deviates from the original
> implementation [2]. I tested this patch very rigorously with several
> FIB tables containing half a million routes. I got same result as
> LC-trie based fib_lookup().
> 
> Poptrie is intended to work in conjunction with LC-trie (not replace
> it). It is primarily designed to overcome many issues of TCAM based
> router [1]. It [1] shows that the Poptrie can achieve very impressive
> lookup performance on CPU. This patch will mainly be used by XDP
> forwarding.
> 
> 1. Asai, Hirochika, and Yasuhiro Ohara. "Poptrie: A compressed trie
> with population count for fast and scalable software IP routing table
> lookup." ACM SIGCOMM Computer Communication Review. 2015.
> 
> 2. https://github.com/pixos/poptrie


I am glad to see more research in to lookup speed. Here are some non-technical
feedback. Looking deeper takes longer.

The license in github version is not compatiable with GPL. If you based your
code off that, you need to get approval from original copyright holder.

The code is not formatted according to current kernel coding style.
Please use checkpatch to see what the issues are.

It is preferred that a function return a value, rather than being void
and returing result by reference. Example:

> +
> +/*We assume that pt->root is not NULL*/
> +void poptrie_lookup(struct poptrie *pt, __be32 dest, struct net_device **dev)
> +{
...

> +        *dev = get_fib(&pt->nhs, fib_index);
> +        return;
> +    }

Why not?
static struct net_device *poptrie_lookup(struct poptrie *pt, __be32 dest)

Also, as Dave mentioned any implementation needs to handle multiple namespaces
and routing tables.

Could this alternative lookup be enabled via sysctl at runtime rather than kernel config?

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH RFC net-next] net/fib: Poptrie based FIB lookup
  2018-08-27 16:24 ` Stephen Hemminger
@ 2018-08-27 16:56   ` David Ahern
  2018-08-27 23:03     ` Md. Islam
  2018-08-27 22:29   ` Md. Islam
  1 sibling, 1 reply; 5+ messages in thread
From: David Ahern @ 2018-08-27 16:56 UTC (permalink / raw)
  To: Stephen Hemminger, Md. Islam
  Cc: Netdev, David Miller, Eric Dumazet, Alexey Kuznetsov,
	makita.toshiaki, panda, yasuhiro.ohara, john.fastabend,
	alexei.starovoitov

On 8/27/18 10:24 AM, Stephen Hemminger wrote:
> 
> Also, as Dave mentioned any implementation needs to handle multiple namespaces
> and routing tables.
> 
> Could this alternative lookup be enabled via sysctl at runtime rather than kernel config?
> 

I spent time a couple of years ago refactoring IPv4 fib lookups with the
intent of allowing different algorithms - for use cases like this:

https://github.com/dsahern/linux/commits/net/ipv4-fib-ops

(it is also another way to solve the API nightmare that ipv6 has become).

But the poptrie patches that have been sent so far have much bigger
problems that need to be addressed before anyone worries about how to
select poptrie vs lc-trie.

The patch does not handle errors (e.g., if attributes such as tos,
metric/priority and multipath are not allowed you need to fail the route
insert; further, what happens if someone creates > 255 netdevices?),
last patch has both fib tables populated (a no-go), does not handle
delete or dumps. In the current form, the poptrie algorithm can not be
taken for a test drive. My suggestion to make it a compile time
selection is just so people can actually try it out using current admin
tools.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH RFC net-next] net/fib: Poptrie based FIB lookup
  2018-08-27 16:24 ` Stephen Hemminger
  2018-08-27 16:56   ` David Ahern
@ 2018-08-27 22:29   ` Md. Islam
  1 sibling, 0 replies; 5+ messages in thread
From: Md. Islam @ 2018-08-27 22:29 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Netdev, David Miller, David Ahern, Eric Dumazet,
	Alexey Kuznetsov, makita.toshiaki, panda, yasuhiro.ohara,
	john fastabend, alexei.starovoitov

On Mon, Aug 27, 2018 at 12:24 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Sun, 26 Aug 2018 22:28:48 -0400
> "Md. Islam" <mislam4@kent.edu> wrote:
>
>> This patch implements Poptrie [1] based FIB lookup. It exhibits pretty
>> impressive lookup performance compared to LC-trie. This poptrie
>> implementation however somewhat deviates from the original
>> implementation [2]. I tested this patch very rigorously with several
>> FIB tables containing half a million routes. I got same result as
>> LC-trie based fib_lookup().
>>
>> Poptrie is intended to work in conjunction with LC-trie (not replace
>> it). It is primarily designed to overcome many issues of TCAM based
>> router [1]. It [1] shows that the Poptrie can achieve very impressive
>> lookup performance on CPU. This patch will mainly be used by XDP
>> forwarding.
>>
>> 1. Asai, Hirochika, and Yasuhiro Ohara. "Poptrie: A compressed trie
>> with population count for fast and scalable software IP routing table
>> lookup." ACM SIGCOMM Computer Communication Review. 2015.
>>
>> 2. https://github.com/pixos/poptrie
>
>
> I am glad to see more research in to lookup speed. Here are some non-technical
> feedback. Looking deeper takes longer.
>
> The license in github version is not compatiable with GPL. If you based your
> code off that, you need to get approval from original copyright holder.

No, I developed it from scratch. It was not developed off the original
code. To make it consistent with the paper, I name few variables as in
the paper. But nothing has been taken from the copyrighted code.
Poptrie formation and lookup is also very different from the paper.
>
> The code is not formatted according to current kernel coding style.
> Please use checkpatch to see what the issues are.
>
> It is preferred that a function return a value, rather than being void
> and returing result by reference. Example:

I will fix those in next patches. I will also add a CONFIG option so
that it can be disable/enabled.

>
>> +
>> +/*We assume that pt->root is not NULL*/
>> +void poptrie_lookup(struct poptrie *pt, __be32 dest, struct net_device **dev)
>> +{
> ...
>
>> +        *dev = get_fib(&pt->nhs, fib_index);
>> +        return;
>> +    }
>
> Why not?

pt->root will not be NULL when we call it XDP forwarding. Checking
this for every packet in a high speed router is redundant, I think.
Currently this function is being called during system startup, and
pt->root was NULL at that time. That's why I checked it before the
function is being called.

> static struct net_device *poptrie_lookup(struct poptrie *pt, __be32 dest)
>
> Also, as Dave mentioned any implementation needs to handle multiple namespaces
> and routing tables.
>

Currently it supports multiple routing tables. poptrie is an instance
of fib_table, Each fib_table has its poptrie. Supporting multiple
namespace wouldn't be difficult. Once the core functionality is
accepted added, those can be implemented incrementally.

> Could this alternative lookup be enabled via sysctl at runtime rather than kernel config?

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH RFC net-next] net/fib: Poptrie based FIB lookup
  2018-08-27 16:56   ` David Ahern
@ 2018-08-27 23:03     ` Md. Islam
  0 siblings, 0 replies; 5+ messages in thread
From: Md. Islam @ 2018-08-27 23:03 UTC (permalink / raw)
  To: David Ahern
  Cc: Stephen Hemminger, Netdev, David Miller, Eric Dumazet,
	Alexey Kuznetsov, makita.toshiaki, panda, yasuhiro.ohara,
	john fastabend, alexei.starovoitov

On Mon, Aug 27, 2018 at 12:56 PM, David Ahern <dsahern@gmail.com> wrote:
> On 8/27/18 10:24 AM, Stephen Hemminger wrote:
>>
>> Also, as Dave mentioned any implementation needs to handle multiple namespaces
>> and routing tables.
>>
>> Could this alternative lookup be enabled via sysctl at runtime rather than kernel config?
>>
>
> I spent time a couple of years ago refactoring IPv4 fib lookups with the
> intent of allowing different algorithms - for use cases like this:
>
> https://github.com/dsahern/linux/commits/net/ipv4-fib-ops
>
> (it is also another way to solve the API nightmare that ipv6 has become).
>
> But the poptrie patches that have been sent so far have much bigger
> problems that need to be addressed before anyone worries about how to
> select poptrie vs lc-trie.
>

> The patch does not handle errors (e.g., if attributes such as tos,
> metric/priority and multipath are not allowed you need to fail the route
> insert;

Poptrie is not intended to replace LC-trie for processing incoming
packets. It rather tries to provide an alternative way to do FIB
lookup in XDP forwarding. I know, its confusing that in the patch,
fib_lookup calls poptrie_lookup. This is just to show how
poptrie_lookup should be called. We shouldn't actually use
poptrie_lookup in fib_lookup.

TOS, metric/priority and multipath can  easily be incorporated by
storing fib_alias rather than netdevice, But the main objective here
is not to worry about TOS, metric/priority, and so on. Let's assume
that we want Linux to work as a TCAM/ ASIC based router. The only job
of Linux here is to forward incoming packet to a destination port ASAP
without worrying about those TOS, metric/priority, and so on.

further, what happens if someone creates > 255 netdevices?),

Most of the commercial ASIC/TCAM routers have no more than 64 ports
these days. I think, 255 netdevice is sufficient in that case. If we
need more than 255 NICs, we can accommodate that by using u16 rather
than u8.

> last patch has both fib tables populated (a no-go), does not handle
> delete or dumps. In the current form, the poptrie algorithm can not be

Yeah, we will need to implement delete/update and dumps. Those will
not be the hardest part, I think. Insertion and lookup are the main
challenge. Once everyone agree on Insertion and Lookup, those can be
implemented incrementally.

Yes, delete and dumps will be needed. This
> taken for a test drive. My suggestion to make it a compile time
> selection is just so people can actually try it out using current admin
> tools.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-08-28  2:52 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-27  2:28 [PATCH RFC net-next] net/fib: Poptrie based FIB lookup Md. Islam
2018-08-27 16:24 ` Stephen Hemminger
2018-08-27 16:56   ` David Ahern
2018-08-27 23:03     ` Md. Islam
2018-08-27 22:29   ` Md. Islam

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.