All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
@ 2012-01-16 12:13 Štefan Gula
  2012-01-16 13:19 ` Eric Dumazet
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Štefan Gula @ 2012-01-16 12:13 UTC (permalink / raw)
  To: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy
  Cc: netdev, linux-kernel

From: Stefan Gula <steweg@gmail.com>

This patch is an extension of the current Ethernet over GRE
implementation, which allows the user to create a virtual bridge (multipoint
VPN) and forward traffic based on the Ethernet MAC address information in
it. It simulates the bridge behaviour learning mechanism, but instead
of learning the port ID from which a given MAC address comes, it learns the IP
address of the peer which encapsulated the given packet. Multicast, broadcast
and unknown-unicast traffic is sent over the network as a multicast
encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
represented as one single virtual switch on the logical level and also be
represented as one multicast IPv4 address on the network level.

Signed-off-by: Stefan Gula <steweg@gmail.com>

---

Patch was tested for more than one year in real network infrastructure
consisting of 98 Linux based access-points

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
linux-3.2.1-orig/Documentation/dontdiff
linux-3.2.1-my/Documentation/dontdiff
--- linux-3.2.1-orig/Documentation/dontdiff	2012-01-16 12:32:18.000000000 +0100
+++ linux-3.2.1-my/Documentation/dontdiff	2012-01-12 20:42:45.000000000 +0100
@@ -260,5 +260,3 @@ wakeup.lds
 zImage*
 zconf.hash.c
 zoffset.h
-mkpiggy
-mdp
diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
linux-3.2.1-orig/include/net/ipip.h linux-3.2.1-my/include/net/ipip.h
--- linux-3.2.1-orig/include/net/ipip.h	2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/include/net/ipip.h	2012-01-16 11:17:01.000000000 +0100
@@ -27,6 +27,14 @@ struct ip_tunnel {
 	__u32			o_seqno;	/* The last output seqno */
 	int			hlen;		/* Precalculated GRE header length */
 	int			mlink;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#define GRETAP_BR_HASH_BITS 8
+#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS)
+	struct hlist_head	hash[GRETAP_BR_HASH_SIZE];
+	spinlock_t		hash_lock;
+	unsigned long		ageing_time;
+	struct timer_list	gc_timer;
+#endif

 	struct ip_tunnel_parm	parms;

diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
linux-3.2.1-orig/net/ipv4/Kconfig linux-3.2.1-my/net/ipv4/Kconfig
--- linux-3.2.1-orig/net/ipv4/Kconfig	2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/net/ipv4/Kconfig	2012-01-16 12:37:00.000000000 +0100
@@ -211,6 +211,15 @@ config NET_IPGRE_BROADCAST
 	  Network), but can be distributed all over the Internet. If you want
 	  to do that, say Y here and to "IP multicast routing" below.

+config NET_IPGRE_BRIDGE
+	bool "IP: Ethernet over multipoint GRE over IP"
+	depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST
+	help
+	  Allows you to use a multipoint GRE VPN as a virtual switch and
+	  interconnect several L2 endpoints over an L3 routed infrastructure.
+	  It is useful for creating multipoint L2 VPNs which can later be used
+	  inside bridge interfaces. If you want to use the GRE multipoint
+	  L2 VPN feature, say Y.
+
 config IP_MROUTE
 	bool "IP: multicast routing"
 	depends on IP_MULTICAST
diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
linux-3.2.1-orig/net/ipv4/ip_gre.c linux-3.2.1-my/net/ipv4/ip_gre.c
--- linux-3.2.1-orig/net/ipv4/ip_gre.c	2012-01-12 20:42:45.000000000 +0100
+++ linux-3.2.1-my/net/ipv4/ip_gre.c	2012-01-16 12:29:03.000000000 +0100
@@ -52,6 +52,11 @@
 #include <net/ip6_route.h>
 #endif

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+#include <linux/jhash.h>
+#include <asm/unaligned.h>
+#endif
+
 /*
    Problems & solutions
    --------------------
@@ -134,6 +139,199 @@ struct ipgre_net {
 	struct net_device *fb_tunnel_dev;
 };

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+	/*
+	 * This part of code includes codes to enable L2 ethernet
+	 * switch virtualization over IP routed infrastructure with
+	 * utilization of multicast capable endpoint using Ethernet
+	 * over GRE
+	 *
+	 * Author: Stefan Gula
+	 * Signed-off-by: Stefan Gula <steweg@gmail.com>
+	 */
+struct mac_addr {
+	unsigned char   addr[6];
+};
+
+struct ipgre_tap_bridge_entry {
+	struct hlist_node	hlist;
+	u32			raddr;
+	struct mac_addr		addr;
+	struct net_device	*dev;
+	struct rcu_head		rcu;
+	atomic_t		use_count;
+	unsigned long		ageing_timer;
+};
+
+static struct kmem_cache *ipgre_tap_bridge_cache __read_mostly;
+static u32 ipgre_salt __read_mostly;
+
+static inline int ipgre_tap_bridge_hash(const unsigned char *mac)
+{
+	u32 key = get_unaligned((u32 *)(mac + 2));
+	return jhash_1word(key, ipgre_salt) & (GRETAP_BR_HASH_SIZE - 1);
+}
+
+static inline int ipgre_tap_bridge_has_expired(const struct ip_tunnel *tunnel,
+		const struct ipgre_tap_bridge_entry *entry)
+{
+	return time_before_eq(entry->ageing_timer + tunnel->ageing_time,
+		jiffies);
+}
+
+static void ipgre_tap_bridge_rcu_free(struct rcu_head *head)
+{
+	struct ipgre_tap_bridge_entry *entry
+		= container_of(head, struct ipgre_tap_bridge_entry, rcu);
+	kmem_cache_free(ipgre_tap_bridge_cache, entry);
+}
+
+void ipgre_tap_bridge_put(struct ipgre_tap_bridge_entry *entry)
+{
+	if (atomic_dec_and_test(&entry->use_count))
+		call_rcu(&entry->rcu, ipgre_tap_bridge_rcu_free);
+}
+
+static inline void ipgre_tap_bridge_delete(struct
ipgre_tap_bridge_entry *entry)
+{
+	hlist_del_rcu(&entry->hlist);
+	ipgre_tap_bridge_put(entry);
+}
+
+static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_create(
+		struct hlist_head *head,
+		u32 source,
+		const unsigned char *addr, struct net_device *dev)
+{
+	struct ipgre_tap_bridge_entry *entry;
+	entry = kmem_cache_alloc(ipgre_tap_bridge_cache, GFP_ATOMIC);
+	if (entry) {
+		memcpy(entry->addr.addr, addr, ETH_ALEN);
+		atomic_set(&entry->use_count, 1);
+		hlist_add_head_rcu(&entry->hlist, head);
+		entry->raddr = source;
+		entry->ageing_timer = jiffies;
+		entry->dev = dev;
+	}
+	return entry;
+}
+
+struct ipgre_tap_bridge_entry *__ipgre_tap_bridge_get(struct ip_tunnel *tunnel,
+	const unsigned char *addr)
+{
+	struct hlist_node *h;
+	struct ipgre_tap_bridge_entry *entry;
+	hlist_for_each_entry_rcu(entry, h,
+		&tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist)
+	{
+		if (!compare_ether_addr(entry->addr.addr, addr)) {
+			if (unlikely(ipgre_tap_bridge_has_expired(tunnel,
+				entry)))
+				break;
+			return entry;
+		}
+	}
+
+	return NULL;
+}
+
+struct ipgre_tap_bridge_entry *ipgre_tap_bridge_get(struct ip_tunnel *tunnel,
+	const unsigned char *addr)
+{
+	struct ipgre_tap_bridge_entry *entry;
+	rcu_read_lock();
+	entry = __ipgre_tap_bridge_get(tunnel, addr);
+	if (entry && !atomic_inc_not_zero(&entry->use_count))
+		entry = NULL;
+	rcu_read_unlock();
+	return entry;
+}
+
+__be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
+	const unsigned char *addr)
+{
+	struct ipgre_tap_bridge_entry *entry;
+	entry = ipgre_tap_bridge_get(tunnel, addr);
+	if (entry == NULL)
+		return 0;
+	else
+		return entry->raddr;
+}
+
+
+static inline struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find(
+	struct hlist_head *head,
+	const unsigned char *addr)
+{
+	struct hlist_node *h;
+	struct ipgre_tap_bridge_entry *entry;
+	hlist_for_each_entry_rcu(entry, h, head, hlist) {
+		if (!compare_ether_addr(entry->addr.addr, addr))
+			return entry;
+	}
+	return NULL;
+}
+
+int ipgre_tap_bridge_init(void)
+{
+	ipgre_tap_bridge_cache = kmem_cache_create("ipgre_tap_bridge_cache",
+		sizeof(struct ipgre_tap_bridge_entry),
+		0,
+		SLAB_HWCACHE_ALIGN, NULL);
+	if (!ipgre_tap_bridge_cache)
+		return -ENOMEM;
+	get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
+	return 0;
+}
+
+void ipgre_tap_bridge_fini(void)
+{
+	kmem_cache_destroy(ipgre_tap_bridge_cache);
+}
+
+void ipgre_tap_bridge_cleanup(unsigned long _data)
+{
+	struct ip_tunnel *tunnel = (struct ip_tunnel *)_data;
+	unsigned long delay = tunnel->ageing_time;
+	unsigned long next_timer = jiffies + tunnel->ageing_time;
+	int i;
+	spin_lock_bh(&tunnel->hash_lock);
+	for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
+		struct ipgre_tap_bridge_entry *entry;
+		struct hlist_node *h, *n;
+		hlist_for_each_entry_safe(entry, h, n,
+			&tunnel->hash[i], hlist)
+		{
+			unsigned long this_timer;
+			this_timer = entry->ageing_timer + delay;
+			if (time_before_eq(this_timer, jiffies))
+				ipgre_tap_bridge_delete(entry);
+			else if (time_before(this_timer, next_timer))
+				next_timer = this_timer;
+		}
+	}
+	spin_unlock_bh(&tunnel->hash_lock);
+	mod_timer(&tunnel->gc_timer, round_jiffies(next_timer + HZ/4));
+}
+
+void ipgre_tap_bridge_flush(struct ip_tunnel *tunnel)
+{
+	int i;
+	spin_lock_bh(&tunnel->hash_lock);
+	for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
+		struct ipgre_tap_bridge_entry *entry;
+		struct hlist_node *h, *n;
+		hlist_for_each_entry_safe(entry, h, n,
+			&tunnel->hash[i], hlist)
+		{
+			ipgre_tap_bridge_delete(entry);
+		}
+	}
+	spin_unlock_bh(&tunnel->hash_lock);
+}
+
+#endif
+
 /* Tunnel hash table */

 /*
@@ -563,6 +761,13 @@ static int ipgre_rcv(struct sk_buff *skb
 	int    offset = 4;
 	__be16 gre_proto;

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+	u32 orig_source;
+	struct hlist_head *head;
+	struct ipgre_tap_bridge_entry *entry;
+	struct ethhdr *tethhdr;
+#endif
+
 	if (!pskb_may_pull(skb, 16))
 		goto drop_nolock;

@@ -659,10 +864,38 @@ static int ipgre_rcv(struct sk_buff *skb
 				tunnel->dev->stats.rx_errors++;
 				goto drop;
 			}
-
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+			orig_source = iph->saddr;
+#endif
 			iph = ip_hdr(skb);
 			skb->protocol = eth_type_trans(skb, tunnel->dev);
 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+			if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
+				tethhdr = eth_hdr(skb);
+				if ((tethhdr->h_source[0]&1) == 0) {
+					head = &tunnel->hash[
+						ipgre_tap_bridge_hash(
+							tethhdr->h_source)];
+					entry = ipgre_tap_bridge_find(head,
+						tethhdr->h_source);
+					if (likely(entry)) {
+						entry->raddr = orig_source;
+						entry->ageing_timer = jiffies;
+					} else {
+						spin_lock(&tunnel->hash_lock);
+						if (!ipgre_tap_bridge_find(head,
+							tethhdr->h_source))
+							ipgre_tap_bridge_create(
+							 head,
+							 orig_source,
+							 tethhdr->h_source,
+							 tunnel->dev);
+						spin_unlock(&tunnel->hash_lock);
+					}
+				}
+			}
+#endif
 		}

 		tstats = this_cpu_ptr(tunnel->dev->tstats);
@@ -716,7 +949,17 @@ static netdev_tx_t ipgre_tunnel_xmit(str
 		tiph = &tunnel->parms.iph;
 	}

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+	if ((dev->type == ARPHRD_ETHER) && ipv4_is_multicast(
+		tunnel->parms.iph.daddr))
+		dst = ipgre_tap_bridge_get_raddr(tunnel,
+			((struct ethhdr *)skb->data)->h_dest);
+	if (dst == 0)
+		dst = tiph->daddr;
+	if (dst == 0) {
+#else
 	if ((dst = tiph->daddr) == 0) {
+#endif
 		/* NBMA tunnel */

 		if (skb_dst(skb) == NULL) {
@@ -1209,6 +1452,16 @@ static int ipgre_open(struct net_device
 			return -EADDRNOTAVAIL;
 		t->mlink = dev->ifindex;
 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+		if (t->dev->type == ARPHRD_ETHER) {
+			INIT_HLIST_HEAD(t->hash);
+			spin_lock_init(&t->hash_lock);
+			t->ageing_time = 300 * HZ;
+			setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
+				(unsigned long) t);
+			mod_timer(&t->gc_timer, jiffies + t->ageing_time);
+		}
+#endif
 	}
 	return 0;
 }
@@ -1219,6 +1472,12 @@ static int ipgre_close(struct net_device

 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
 		struct in_device *in_dev;
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+		if (t->dev->type == ARPHRD_ETHER) {
+			ipgre_tap_bridge_flush(t);
+			del_timer_sync(&t->gc_timer);
+		}
+#endif
 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
 		if (in_dev)
 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
@@ -1341,6 +1600,12 @@ static int __net_init ipgre_init_net(str
 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 	int err;

+#ifdef CONFIG_NET_IPGRE_BRIDGE
+	err = ipgre_tap_bridge_init();
+	if (err)
+		goto err_out;
+#endif
+
 	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
 					   ipgre_tunnel_setup);
 	if (!ign->fb_tunnel_dev) {
@@ -1362,6 +1627,10 @@ static int __net_init ipgre_init_net(str
 err_reg_dev:
 	ipgre_dev_free(ign->fb_tunnel_dev);
 err_alloc_dev:
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+	ipgre_tap_bridge_fini();
+err_out:
+#endif
 	return err;
 }

@@ -1375,6 +1644,9 @@ static void __net_exit ipgre_exit_net(st
 	ipgre_destroy_tunnels(ign, &list);
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
+#ifdef CONFIG_NET_IPGRE_BRIDGE
+	ipgre_tap_bridge_fini();
+#endif
 }

 static struct pernet_operations ipgre_net_ops = {

-- 
Stefan Gula
CCNP, CCIP

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 12:13 [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP Štefan Gula
@ 2012-01-16 13:19 ` Eric Dumazet
  2012-01-16 13:30   ` David Laight
  2012-01-16 14:05   ` Štefan Gula
  2012-01-16 13:35 ` David Lamparter
  2012-01-16 16:36 ` Stephen Hemminger
  2 siblings, 2 replies; 19+ messages in thread
From: Eric Dumazet @ 2012-01-16 13:19 UTC (permalink / raw)
  To: Štefan Gula
  Cc: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

Le lundi 16 janvier 2012 à 13:13 +0100, Štefan Gula a écrit :
> From: Stefan Gula <steweg@gmail.com
> 
> This patch is an extension for current Ethernet over GRE
> implementation, which allows user to create virtual bridge (multipoint
> VPN) and forward traffic based on Ethernet MAC address informations in
> it. It simulates the Bridge bahaviour learing mechanism, but instead
> of learning port ID from which given MAC address comes, it learns IP
> address of peer which encapsulated given packet. Multicast, Broadcast
> and unknown-multicast traffic is send over network as multicast
> enacapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> represented as one single virtual switch on logical level and be also
> represented as one multicast IPv4 address on network level.
> 
> Signed-off-by: Stefan Gula <steweg@gmail.com>
> 
> ---
> 
> Patch was tested for more than one year in real network infrastructure
> consisting of 98 Linux based access-points
> 

OK, but please always send new patches on top on net-next tree.

> diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
> linux-3.2.1-orig/Documentation/dontdiff
> linux-3.2.1-my/Documentation/dontdiff
> --- linux-3.2.1-orig/Documentation/dontdiff	2012-01-16 12:32:18.000000000 +0100
> +++ linux-3.2.1-my/Documentation/dontdiff	2012-01-12 20:42:45.000000000 +0100
> @@ -260,5 +260,3 @@ wakeup.lds
>  zImage*
>  zconf.hash.c
>  zoffset.h
> -mkpiggy
> -mdp

Hmm, what is this ?

> diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
> linux-3.2.1-orig/include/net/ipip.h linux-3.2.1-my/include/net/ipip.h
> --- linux-3.2.1-orig/include/net/ipip.h	2012-01-12 20:42:45.000000000 +0100
> +++ linux-3.2.1-my/include/net/ipip.h	2012-01-16 11:17:01.000000000 +0100
> @@ -27,6 +27,14 @@ struct ip_tunnel {
>  	__u32			o_seqno;	/* The last output seqno */
>  	int			hlen;		/* Precalculated GRE header length */
>  	int			mlink;
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +#define GRETAP_BR_HASH_BITS 8
> +#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS)
> +	struct hlist_head	hash[GRETAP_BR_HASH_SIZE];
> +	spinlock_t		hash_lock;
> +	unsigned long		ageing_time;
> +	struct timer_list	gc_timer;
> +#endif
> 
>  	struct ip_tunnel_parm	parms;
> 
> diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
> linux-3.2.1-orig/net/ipv4/Kconfig linux-3.2.1-my/net/ipv4/Kconfig
> --- linux-3.2.1-orig/net/ipv4/Kconfig	2012-01-12 20:42:45.000000000 +0100
> +++ linux-3.2.1-my/net/ipv4/Kconfig	2012-01-16 12:37:00.000000000 +0100
> @@ -211,6 +211,15 @@ config NET_IPGRE_BROADCAST
>  	  Network), but can be distributed all over the Internet. If you want
>  	  to do that, say Y here and to "IP multicast routing" below.
> 
> +config NET_IPGRE_BRIDGE
> +	bool "IP: Ethernet over multipoint GRE over IP"
> +	depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST
> +	help
> +	  Allows you to use multipoint GRE VPN as virtual switch and interconnect
> +	  several L2 endpoints over L3 routed infrastructure. It is useful for
> +	  creating multipoint L2 VPNs which can be later used inside bridge
> +	  interfaces If you want to use. GRE multipoint L2 VPN feature say Y.
> +
>  config IP_MROUTE
>  	bool "IP: multicast routing"
>  	depends on IP_MULTICAST
> diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
> linux-3.2.1-orig/net/ipv4/ip_gre.c linux-3.2.1-my/net/ipv4/ip_gre.c
> --- linux-3.2.1-orig/net/ipv4/ip_gre.c	2012-01-12 20:42:45.000000000 +0100
> +++ linux-3.2.1-my/net/ipv4/ip_gre.c	2012-01-16 12:29:03.000000000 +0100
> @@ -52,6 +52,11 @@
>  #include <net/ip6_route.h>
>  #endif
> 
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +#include <linux/jhash.h>
> +#include <asm/unaligned.h>
> +#endif
> +
>  /*
>     Problems & solutions
>     --------------------
> @@ -134,6 +139,199 @@ struct ipgre_net {
>  	struct net_device *fb_tunnel_dev;
>  };
> 
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +	/*
> +	 * This part of code includes codes to enable L2 ethernet
> +	 * switch virtualization over IP routed infrastructure with
> +	 * utilization of multicast capable endpoint using Ethernet
> +	 * over GRE
> +	 *
> +	 * Author: Stefan Gula
> +	 * Signed-off-by: Stefan Gula <steweg@gmail.com>
> +	 */
> +struct mac_addr {
> +	unsigned char   addr[6];

Not sure if you need a 'struct mac_addr' for this...

	unsigned char mac_addr[ETH_ALEN] ?
> +};
> +
> +struct ipgre_tap_bridge_entry {
> +	struct hlist_node	hlist;
> +	u32			raddr;
> +	struct mac_addr		addr;
> +	struct net_device	*dev;
> +	struct rcu_head		rcu;
> +	atomic_t		use_count;
> +	unsigned long		ageing_timer;


> +};
> +
> +static struct kmem_cache *ipgre_tap_bridge_cache __read_mostly;
> +static u32 ipgre_salt __read_mostly;
> +
> +static inline int ipgre_tap_bridge_hash(const unsigned char *mac)
> +{
> +	u32 key = get_unaligned((u32 *)(mac + 2));
> +	return jhash_1word(key, ipgre_salt) & (GRETAP_BR_HASH_SIZE - 1);
> +}
> +
> +static inline int ipgre_tap_bridge_has_expired(const struct ip_tunnel *tunnel,
> +		const struct ipgre_tap_bridge_entry *entry)
> +{
> +	return time_before_eq(entry->ageing_timer + tunnel->ageing_time,
> +		jiffies);
> +}
> +
> +static void ipgre_tap_bridge_rcu_free(struct rcu_head *head)
> +{
> +	struct ipgre_tap_bridge_entry *entry
> +		= container_of(head, struct ipgre_tap_bridge_entry, rcu);
> +	kmem_cache_free(ipgre_tap_bridge_cache, entry);
> +}
> +
> +void ipgre_tap_bridge_put(struct ipgre_tap_bridge_entry *entry)
> +{
> +	if (atomic_dec_and_test(&entry->use_count))
> +		call_rcu(&entry->rcu, ipgre_tap_bridge_rcu_free);

	You can now use kfree_rcu() and get rid of ipgre_tap_bridge_rcu_free()

> +}
> +
> +static inline void ipgre_tap_bridge_delete(struct
> ipgre_tap_bridge_entry *entry)
> +{
> +	hlist_del_rcu(&entry->hlist);
> +	ipgre_tap_bridge_put(entry);
> +}
> +
> +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_create(
> +		struct hlist_head *head,
> +		u32 source,
> +		const unsigned char *addr, struct net_device *dev)
> +{
> +	struct ipgre_tap_bridge_entry *entry;
> +	entry = kmem_cache_alloc(ipgre_tap_bridge_cache, GFP_ATOMIC);
> +	if (entry) {
> +		memcpy(entry->addr.addr, addr, ETH_ALEN);
> +		atomic_set(&entry->use_count, 1);
> +		hlist_add_head_rcu(&entry->hlist, head);
> +		entry->raddr = source;
> +		entry->ageing_timer = jiffies;
> +		entry->dev = dev;
> +	}
> +	return entry;
> +}
> +
> +struct ipgre_tap_bridge_entry *__ipgre_tap_bridge_get(struct ip_tunnel *tunnel,
> +	const unsigned char *addr)
> +{
> +	struct hlist_node *h;
> +	struct ipgre_tap_bridge_entry *entry;
> +	hlist_for_each_entry_rcu(entry, h,
> +		&tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist)
> +	{
> +		if (!compare_ether_addr(entry->addr.addr, addr)) {
> +			if (unlikely(ipgre_tap_bridge_has_expired(tunnel,
> +				entry)))
> +				break;
> +			return entry;
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +struct ipgre_tap_bridge_entry *ipgre_tap_bridge_get(struct ip_tunnel *tunnel,
> +	const unsigned char *addr)
> +{
> +	struct ipgre_tap_bridge_entry *entry;
> +	rcu_read_lock();
> +	entry = __ipgre_tap_bridge_get(tunnel, addr);
> +	if (entry && !atomic_inc_not_zero(&entry->use_count))
> +		entry = NULL;
> +	rcu_read_unlock();
> +	return entry;
> +}
> +

Unfortunately you call ipgre_tap_bridge_get() from
ipgre_tap_bridge_get_raddr() but you dont release the refcount on
use_count. Leak in ipgre_tunnel_xmit()


> +__be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
> +	const unsigned char *addr)
> +{
> +	struct ipgre_tap_bridge_entry *entry;
> +	entry = ipgre_tap_bridge_get(tunnel, addr);

	So maybe you want __ipgre_tap_bridge_get(tunnel, addr); here ?

> +	if (entry == NULL)
> +		return 0;
> +	else
> +		return entry->raddr;
> +}
> +
> +
> +static inline struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find(
> +	struct hlist_head *head,
> +	const unsigned char *addr)
> +{
> +	struct hlist_node *h;
> +	struct ipgre_tap_bridge_entry *entry;
> +	hlist_for_each_entry_rcu(entry, h, head, hlist) {
> +		if (!compare_ether_addr(entry->addr.addr, addr))



> +			return entry;
> +	}
> +	return NULL;
> +}
> +
> +int ipgre_tap_bridge_init(void)
> +{
> +	ipgre_tap_bridge_cache = kmem_cache_create("ipgre_tap_bridge_cache",
> +		sizeof(struct ipgre_tap_bridge_entry),
> +		0,
> +		SLAB_HWCACHE_ALIGN, NULL);
> +	if (!ipgre_tap_bridge_cache)
> +		return -ENOMEM;
> +	get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
> +	return 0;
> +}
> +
> +void ipgre_tap_bridge_fini(void)
> +{
> +	kmem_cache_destroy(ipgre_tap_bridge_cache);
> +}
> +
> +void ipgre_tap_bridge_cleanup(unsigned long _data)
> +{
> +	struct ip_tunnel *tunnel = (struct ip_tunnel *)_data;
> +	unsigned long delay = tunnel->ageing_time;
> +	unsigned long next_timer = jiffies + tunnel->ageing_time;
> +	int i;
> +	spin_lock_bh(&tunnel->hash_lock);

Since you are called from timer, no need to use _bh() variant.

	
	spin_lock(&tunnel->hash_lock);

> +	for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
> +		struct ipgre_tap_bridge_entry *entry;
> +		struct hlist_node *h, *n;
> +		hlist_for_each_entry_safe(entry, h, n,
> +			&tunnel->hash[i], hlist)
> +		{
> +			unsigned long this_timer;
> +			this_timer = entry->ageing_timer + delay;
> +			if (time_before_eq(this_timer, jiffies))
> +				ipgre_tap_bridge_delete(entry);
> +			else if (time_before(this_timer, next_timer))
> +				next_timer = this_timer;
> +		}
> +	}
> +	spin_unlock_bh(&tunnel->hash_lock);
> +	mod_timer(&tunnel->gc_timer, round_jiffies(next_timer + HZ/4));

wow... why setup a 250 ms timer, if entries are valid 300 seconds ?



> +}
> +
> +void ipgre_tap_bridge_flush(struct ip_tunnel *tunnel)
> +{
> +	int i;
> +	spin_lock_bh(&tunnel->hash_lock);
> +	for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
> +		struct ipgre_tap_bridge_entry *entry;
> +		struct hlist_node *h, *n;
> +		hlist_for_each_entry_safe(entry, h, n,
> +			&tunnel->hash[i], hlist)
> +		{
> +			ipgre_tap_bridge_delete(entry);
> +		}
> +	}
> +	spin_unlock_bh(&tunnel->hash_lock);
> +}
> +
> +#endif
> +
>  /* Tunnel hash table */
> 
>  /*
> @@ -563,6 +761,13 @@ static int ipgre_rcv(struct sk_buff *skb
>  	int    offset = 4;
>  	__be16 gre_proto;
> 
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +	u32 orig_source;
> +	struct hlist_head *head;
> +	struct ipgre_tap_bridge_entry *entry;
> +	struct ethhdr *tethhdr;
> +#endif
> +
>  	if (!pskb_may_pull(skb, 16))
>  		goto drop_nolock;
> 
> @@ -659,10 +864,38 @@ static int ipgre_rcv(struct sk_buff *skb
>  				tunnel->dev->stats.rx_errors++;
>  				goto drop;
>  			}
> -
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +			orig_source = iph->saddr;
> +#endif
>  			iph = ip_hdr(skb);
>  			skb->protocol = eth_type_trans(skb, tunnel->dev);
>  			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +			if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
> +				tethhdr = eth_hdr(skb);

> +				if ((tethhdr->h_source[0]&1) == 0) {

	if (!is_multicast_ether_addr(tethhdr->h_source)) ?

> +					head = &tunnel->hash[
> +						ipgre_tap_bridge_hash(
> +							tethhdr->h_source)];
> +					entry = ipgre_tap_bridge_find(head,
> +						tethhdr->h_source);
> +					if (likely(entry)) {
> +						entry->raddr = orig_source;
> +						entry->ageing_timer = jiffies;
> +					} else {
> +						spin_lock(&tunnel->hash_lock);
> +						if (!ipgre_tap_bridge_find(head,
> +							tethhdr->h_source))
> +							ipgre_tap_bridge_create(
> +							 head,
> +							 orig_source,
> +							 tethhdr->h_source,
> +							 tunnel->dev);
> +						spin_unlock(&tunnel->hash_lock);
> +					}
> +				}
> +			}
> +#endif
>  		}
> 
>  		tstats = this_cpu_ptr(tunnel->dev->tstats);
> @@ -716,7 +949,17 @@ static netdev_tx_t ipgre_tunnel_xmit(str
>  		tiph = &tunnel->parms.iph;
>  	}
> 
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +	if ((dev->type == ARPHRD_ETHER) && ipv4_is_multicast(

please format like this :

	if ((dev->type == ARPHRD_ETHER) &&
	    ipv4_is_multicast(tunnel->parms.iph.daddr))

> +		tunnel->parms.iph.daddr))
> +		dst = ipgre_tap_bridge_get_raddr(tunnel,
> +			((struct ethhdr *)skb->data)->h_dest);
> +	if (dst == 0)
> +		dst = tiph->daddr;
> +	if (dst == 0) {
> +#else
>  	if ((dst = tiph->daddr) == 0) {
> +#endif
>  		/* NBMA tunnel */
> 
>  		if (skb_dst(skb) == NULL) {
> @@ -1209,6 +1452,16 @@ static int ipgre_open(struct net_device
>  			return -EADDRNOTAVAIL;
>  		t->mlink = dev->ifindex;
>  		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +		if (t->dev->type == ARPHRD_ETHER) {
> +			INIT_HLIST_HEAD(t->hash);
> +			spin_lock_init(&t->hash_lock);
> +			t->ageing_time = 300 * HZ;
> +			setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
> +				(unsigned long) t);
> +			mod_timer(&t->gc_timer, jiffies + t->ageing_time);


> +		}
> +#endif
>  	}
>  	return 0;
>  }
> @@ -1219,6 +1472,12 @@ static int ipgre_close(struct net_device
> 
>  	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
>  		struct in_device *in_dev;
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +		if (t->dev->type == ARPHRD_ETHER) {
> +			ipgre_tap_bridge_flush(t);
> +			del_timer_sync(&t->gc_timer);
> +		}
> +#endif
>  		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
>  		if (in_dev)
>  			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
> @@ -1341,6 +1600,12 @@ static int __net_init ipgre_init_net(str
>  	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
>  	int err;
> 
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +	err = ipgre_tap_bridge_init();
> +	if (err)
> +		goto err_out;
> +#endif
> +
>  	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
>  					   ipgre_tunnel_setup);
>  	if (!ign->fb_tunnel_dev) {
> @@ -1362,6 +1627,10 @@ static int __net_init ipgre_init_net(str
>  err_reg_dev:
>  	ipgre_dev_free(ign->fb_tunnel_dev);
>  err_alloc_dev:
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +	ipgre_tap_bridge_fini();
> +err_out:
> +#endif
>  	return err;
>  }
> 
> @@ -1375,6 +1644,9 @@ static void __net_exit ipgre_exit_net(st
>  	ipgre_destroy_tunnels(ign, &list);
>  	unregister_netdevice_many(&list);
>  	rtnl_unlock();
> +#ifdef CONFIG_NET_IPGRE_BRIDGE
> +	ipgre_tap_bridge_fini();
> +#endif
>  }
> 
>  static struct pernet_operations ipgre_net_ops = {
> 



^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 13:19 ` Eric Dumazet
@ 2012-01-16 13:30   ` David Laight
  2012-01-16 13:42     ` Štefan Gula
  2012-01-16 14:05   ` Štefan Gula
  1 sibling, 1 reply; 19+ messages in thread
From: David Laight @ 2012-01-16 13:30 UTC (permalink / raw)
  To: Eric Dumazet, Štefan Gula
  Cc: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

> > +	for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
> > +		struct ipgre_tap_bridge_entry *entry;
> > +		struct hlist_node *h, *n;
> > +		hlist_for_each_entry_safe(entry, h, n,
> > +			&tunnel->hash[i], hlist)
> > +		{
> > +			unsigned long this_timer;
> > +			this_timer = entry->ageing_timer + delay;
> > +			if (time_before_eq(this_timer, jiffies))
> > +				ipgre_tap_bridge_delete(entry);
> > +			else if (time_before(this_timer, next_timer))
> > +				next_timer = this_timer;
> > +		}
> > +	}
> > +	spin_unlock_bh(&tunnel->hash_lock);
> > +	mod_timer(&tunnel->gc_timer, round_jiffies(next_timer + HZ/4));
> 
> wow... why setup a 250 ms timer, if entries are valid 300 seconds ?

Isn't that code trying to wakeup 250ms after the first expiry of any
of its items?

Do you even need a timer at all?

It may be enough to just put a timestamp into each entry
and tidy up when scanning one of the hash chains in (say)
the 'add item' path - when you need write access anyway.

A hash table might not be the best structure either!
The hash lookup is still o(n) for n >> table_size.
It also may be likely that you'll do repeated lookups for a small
number of items - in which case using a hash to cache recent lookups
might be useful (maybe with some type of balanced tree structure)

	David



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 12:13 [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP Štefan Gula
  2012-01-16 13:19 ` Eric Dumazet
@ 2012-01-16 13:35 ` David Lamparter
  2012-01-16 16:36 ` Stephen Hemminger
  2 siblings, 0 replies; 19+ messages in thread
From: David Lamparter @ 2012-01-16 13:35 UTC (permalink / raw)
  To: Štefan Gula
  Cc: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

On Mon, Jan 16, 2012 at 01:13:19PM +0100, Štefan Gula wrote:
> it learns IP address of peer which encapsulated given packet.

This feature is already present in the Linux kernel, albeit only for IP
to IP lookup. Please look at "userspace ARPd" and its relation to
multipoint GRE. OpenNHRP is the associated userspace part.

That code reuses the existing neighbor management infrastructure. Adding
support for gretap and automatic learning from incoming packets
shouldn't be too hard...


-David

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 13:30   ` David Laight
@ 2012-01-16 13:42     ` Štefan Gula
  0 siblings, 0 replies; 19+ messages in thread
From: Štefan Gula @ 2012-01-16 13:42 UTC (permalink / raw)
  To: David Laight
  Cc: Eric Dumazet, Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

2012/1/16 David Laight <David.Laight@aculab.com>:
>> > +   for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
>> > +           struct ipgre_tap_bridge_entry *entry;
>> > +           struct hlist_node *h, *n;
>> > +           hlist_for_each_entry_safe(entry, h, n,
>> > +                   &tunnel->hash[i], hlist)
>> > +           {
>> > +                   unsigned long this_timer;
>> > +                   this_timer = entry->ageing_timer + delay;
>> > +                   if (time_before_eq(this_timer, jiffies))
>> > +                           ipgre_tap_bridge_delete(entry);
>> > +                   else if (time_before(this_timer, next_timer))
>> > +                           next_timer = this_timer;
>> > +           }
>> > +   }
>> > +   spin_unlock_bh(&tunnel->hash_lock);
>> > +   mod_timer(&tunnel->gc_timer, round_jiffies(next_timer + HZ/4));
>>
>> wow... why setup a 250 ms timer, if entries are valid 300 seconds ?
>
> Isn't that code trying to wakeup 250ms after the first expiry of any
> of its items?
>
> Do you even need a timer at all?
>
> It may be enough to just put a timestamp into each entry
> and tidy up when scanning one of the hash chains in (say)
> the 'add item' path - when you need write access anyway.
>
> A hash table might not be the best structure either!
> The hash lookup is still o(n) for n >> table_size.
> It also may be likely that you'll do repeated lookups for a small
> number of items - in which case using a hash to cache recent lookups
> might be useful (maybe with some type of balanced tree structure)
>
>        David
>
>

That part of the code is modified from the original Linux bridge code, as I
wanted to avoid developing something that was already developed.
The timer is actually needed to network


-- 
Stefan Gula

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 13:19 ` Eric Dumazet
  2012-01-16 13:30   ` David Laight
@ 2012-01-16 14:05   ` Štefan Gula
  2012-01-16 14:25     ` Eric Dumazet
  1 sibling, 1 reply; 19+ messages in thread
From: Štefan Gula @ 2012-01-16 14:05 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

Dňa 16. januára 2012 14:19, Eric Dumazet <eric.dumazet@gmail.com> napísal/a:
> Le lundi 16 janvier 2012 à 13:13 +0100, Štefan Gula a écrit :
>> From: Stefan Gula <steweg@gmail.com
>>
>> This patch is an extension for current Ethernet over GRE
>> implementation, which allows user to create virtual bridge (multipoint
>> VPN) and forward traffic based on Ethernet MAC address informations in
>> it. It simulates the Bridge bahaviour learing mechanism, but instead
>> of learning port ID from which given MAC address comes, it learns IP
>> address of peer which encapsulated given packet. Multicast, Broadcast
>> and unknown-multicast traffic is send over network as multicast
>> enacapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
>> represented as one single virtual switch on logical level and be also
>> represented as one multicast IPv4 address on network level.
>>
>> Signed-off-by: Stefan Gula <steweg@gmail.com>
>>
>> ---
>>
>> Patch was tested for more than one year in real network infrastructure
>> consisting of 98 Linux based access-points
>>
>
> OK, but please always send new patches on top on net-next tree.
>
>> diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
>> linux-3.2.1-orig/Documentation/dontdiff
>> linux-3.2.1-my/Documentation/dontdiff
>> --- linux-3.2.1-orig/Documentation/dontdiff   2012-01-16 12:32:18.000000000 +0100
>> +++ linux-3.2.1-my/Documentation/dontdiff     2012-01-12 20:42:45.000000000 +0100
>> @@ -260,5 +260,3 @@ wakeup.lds
>>  zImage*
>>  zconf.hash.c
>>  zoffset.h
>> -mkpiggy
>> -mdp
>
> Hmm, what is this ?
Sorry, this one is not related to my patch. I followed the
guideline, but those 2 files always end up in my diff file even though I
didn't modify them, so I added them to the dontdiff file assuming
that they would not pop up in my diff file. Apparently a wrong
assumption, so please ignore those lines.

>
>> diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
>> linux-3.2.1-orig/include/net/ipip.h linux-3.2.1-my/include/net/ipip.h
>> --- linux-3.2.1-orig/include/net/ipip.h       2012-01-12 20:42:45.000000000 +0100
>> +++ linux-3.2.1-my/include/net/ipip.h 2012-01-16 11:17:01.000000000 +0100
>> @@ -27,6 +27,14 @@ struct ip_tunnel {
>>       __u32                   o_seqno;        /* The last output seqno */
>>       int                     hlen;           /* Precalculated GRE header length */
>>       int                     mlink;
>> +#ifdef CONFIG_NET_IPGRE_BRIDGE
>> +#define GRETAP_BR_HASH_BITS 8
>> +#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS)
>> +     struct hlist_head       hash[GRETAP_BR_HASH_SIZE];
>> +     spinlock_t              hash_lock;
>> +     unsigned long           ageing_time;
>> +     struct timer_list       gc_timer;
>> +#endif
>>
>>       struct ip_tunnel_parm   parms;
>>
>> diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
>> linux-3.2.1-orig/net/ipv4/Kconfig linux-3.2.1-my/net/ipv4/Kconfig
>> --- linux-3.2.1-orig/net/ipv4/Kconfig 2012-01-12 20:42:45.000000000 +0100
>> +++ linux-3.2.1-my/net/ipv4/Kconfig   2012-01-16 12:37:00.000000000 +0100
>> @@ -211,6 +211,15 @@ config NET_IPGRE_BROADCAST
>>         Network), but can be distributed all over the Internet. If you want
>>         to do that, say Y here and to "IP multicast routing" below.
>>
>> +config NET_IPGRE_BRIDGE
>> +     bool "IP: Ethernet over multipoint GRE over IP"
>> +     depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST
>> +     help
>> +       Allows you to use multipoint GRE VPN as virtual switch and interconnect
>> +       several L2 endpoints over L3 routed infrastructure. It is useful for
>> +       creating multipoint L2 VPNs which can be later used inside bridge
>> +       interfaces If you want to use. GRE multipoint L2 VPN feature say Y.
>> +
>>  config IP_MROUTE
>>       bool "IP: multicast routing"
>>       depends on IP_MULTICAST
>> diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff
>> linux-3.2.1-orig/net/ipv4/ip_gre.c linux-3.2.1-my/net/ipv4/ip_gre.c
>> --- linux-3.2.1-orig/net/ipv4/ip_gre.c        2012-01-12 20:42:45.000000000 +0100
>> +++ linux-3.2.1-my/net/ipv4/ip_gre.c  2012-01-16 12:29:03.000000000 +0100
>> @@ -52,6 +52,11 @@
>>  #include <net/ip6_route.h>
>>  #endif
>>
>> +#ifdef CONFIG_NET_IPGRE_BRIDGE
>> +#include <linux/jhash.h>
>> +#include <asm/unaligned.h>
>> +#endif
>> +
>>  /*
>>     Problems & solutions
>>     --------------------
>> @@ -134,6 +139,199 @@ struct ipgre_net {
>>       struct net_device *fb_tunnel_dev;
>>  };
>>
>> +#ifdef CONFIG_NET_IPGRE_BRIDGE
>> +     /*
>> +      * This part of code includes codes to enable L2 ethernet
>> +      * switch virtualization over IP routed infrastructure with
>> +      * utilization of multicast capable endpoint using Ethernet
>> +      * over GRE
>> +      *
>> +      * Author: Stefan Gula
>> +      * Signed-off-by: Stefan Gula <steweg@gmail.com>
>> +      */
>> +struct mac_addr {
>> +     unsigned char   addr[6];
>
> Not sure if you need a 'struct mac_addr' for this...
>
>        unsigned char mac_addr[ETH_ALEN] ?
not sure either, but that is what I used at the time I developed the code. Do
I have to change that to get this patch into the kernel?
>> +};
>> +
>> +struct ipgre_tap_bridge_entry {
>> +     struct hlist_node       hlist;
>> +     u32                     raddr;
>> +     struct mac_addr         addr;
>> +     struct net_device       *dev;
>> +     struct rcu_head         rcu;
>> +     atomic_t                use_count;
>> +     unsigned long           ageing_timer;
>
>
>> +};
>> +
>> +static struct kmem_cache *ipgre_tap_bridge_cache __read_mostly;
>> +static u32 ipgre_salt __read_mostly;
>> +
>> +static inline int ipgre_tap_bridge_hash(const unsigned char *mac)
>> +{
>> +     u32 key = get_unaligned((u32 *)(mac + 2));
>> +     return jhash_1word(key, ipgre_salt) & (GRETAP_BR_HASH_SIZE - 1);
>> +}
>> +
>> +static inline int ipgre_tap_bridge_has_expired(const struct ip_tunnel *tunnel,
>> +             const struct ipgre_tap_bridge_entry *entry)
>> +{
>> +     return time_before_eq(entry->ageing_timer + tunnel->ageing_time,
>> +             jiffies);
>> +}
>> +
>> +static void ipgre_tap_bridge_rcu_free(struct rcu_head *head)
>> +{
>> +     struct ipgre_tap_bridge_entry *entry
>> +             = container_of(head, struct ipgre_tap_bridge_entry, rcu);
>> +     kmem_cache_free(ipgre_tap_bridge_cache, entry);
>> +}
>> +
>> +void ipgre_tap_bridge_put(struct ipgre_tap_bridge_entry *entry)
>> +{
>> +     if (atomic_dec_and_test(&entry->use_count))
>> +             call_rcu(&entry->rcu, ipgre_tap_bridge_rcu_free);
>
>        You can now use kfree_rcu() and get rid of ipgre_tap_bridge_rcu_free()
hmm... that part of the code was copied & pasted from the Linux bridge code
in version 2.6.26. Do I have to change that or is it optional?
>
>> +}
>> +
>> +static inline void ipgre_tap_bridge_delete(struct
>> ipgre_tap_bridge_entry *entry)
>> +{
>> +     hlist_del_rcu(&entry->hlist);
>> +     ipgre_tap_bridge_put(entry);
>> +}
>> +
>> +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_create(
>> +             struct hlist_head *head,
>> +             u32 source,
>> +             const unsigned char *addr, struct net_device *dev)
>> +{
>> +     struct ipgre_tap_bridge_entry *entry;
>> +     entry = kmem_cache_alloc(ipgre_tap_bridge_cache, GFP_ATOMIC);
>> +     if (entry) {
>> +             memcpy(entry->addr.addr, addr, ETH_ALEN);
>> +             atomic_set(&entry->use_count, 1);
>> +             hlist_add_head_rcu(&entry->hlist, head);
>> +             entry->raddr = source;
>> +             entry->ageing_timer = jiffies;
>> +             entry->dev = dev;
>> +     }
>> +     return entry;
>> +}
>> +
>> +struct ipgre_tap_bridge_entry *__ipgre_tap_bridge_get(struct ip_tunnel *tunnel,
>> +     const unsigned char *addr)
>> +{
>> +     struct hlist_node *h;
>> +     struct ipgre_tap_bridge_entry *entry;
>> +     hlist_for_each_entry_rcu(entry, h,
>> +             &tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist)
>> +     {
>> +             if (!compare_ether_addr(entry->addr.addr, addr)) {
>> +                     if (unlikely(ipgre_tap_bridge_has_expired(tunnel,
>> +                             entry)))
>> +                             break;
>> +                     return entry;
>> +             }
>> +     }
>> +
>> +     return NULL;
>> +}
>> +
>> +struct ipgre_tap_bridge_entry *ipgre_tap_bridge_get(struct ip_tunnel *tunnel,
>> +     const unsigned char *addr)
>> +{
>> +     struct ipgre_tap_bridge_entry *entry;
>> +     rcu_read_lock();
>> +     entry = __ipgre_tap_bridge_get(tunnel, addr);
>> +     if (entry && !atomic_inc_not_zero(&entry->use_count))
>> +             entry = NULL;
>> +     rcu_read_unlock();
>> +     return entry;
>> +}
>> +
>
> Unfortunately you call ipgre_tap_bridge_get() from
> ipgre_tap_bridge_get_raddr() but you dont release the refcount on
> use_count. Leak in ipgre_tunnel_xmit()
>
>
>> +__be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
>> +     const unsigned char *addr)
>> +{
>> +     struct ipgre_tap_bridge_entry *entry;
>> +     entry = ipgre_tap_bridge_get(tunnel, addr);
>
>        So maybe you want __ipgre_tap_bridge_get(tunnel, addr); here ?
>
Hmmm.. I am sorry, I am maybe not that expert on C coding, as most of
the codes were copied from linux bridge code. Can you give me a hint
how should I change that?

>> +     if (entry == NULL)
>> +             return 0;
>> +     else
>> +             return entry->raddr;
>> +}
>> +
>> +
>> +static inline struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find(
>> +     struct hlist_head *head,
>> +     const unsigned char *addr)
>> +{
>> +     struct hlist_node *h;
>> +     struct ipgre_tap_bridge_entry *entry;
>> +     hlist_for_each_entry_rcu(entry, h, head, hlist) {
>> +             if (!compare_ether_addr(entry->addr.addr, addr))
>
>
>
>> +                     return entry;
>> +     }
>> +     return NULL;
>> +}
>> +
>> +int ipgre_tap_bridge_init(void)
>> +{
>> +     ipgre_tap_bridge_cache = kmem_cache_create("ipgre_tap_bridge_cache",
>> +             sizeof(struct ipgre_tap_bridge_entry),
>> +             0,
>> +             SLAB_HWCACHE_ALIGN, NULL);
>> +     if (!ipgre_tap_bridge_cache)
>> +             return -ENOMEM;
>> +     get_random_bytes(&ipgre_salt, sizeof(ipgre_salt));
>> +     return 0;
>> +}
>> +
>> +void ipgre_tap_bridge_fini(void)
>> +{
>> +     kmem_cache_destroy(ipgre_tap_bridge_cache);
>> +}
>> +
>> +void ipgre_tap_bridge_cleanup(unsigned long _data)
>> +{
>> +     struct ip_tunnel *tunnel = (struct ip_tunnel *)_data;
>> +     unsigned long delay = tunnel->ageing_time;
>> +     unsigned long next_timer = jiffies + tunnel->ageing_time;
>> +     int i;
>> +     spin_lock_bh(&tunnel->hash_lock);
>
> Since you are called from timer, no need to use _bh() variant.
again copied structure from linux bridge code
>
>
>        spin_lock(&tunnel->hash_lock);
>
>> +     for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
>> +             struct ipgre_tap_bridge_entry *entry;
>> +             struct hlist_node *h, *n;
>> +             hlist_for_each_entry_safe(entry, h, n,
>> +                     &tunnel->hash[i], hlist)
>> +             {
>> +                     unsigned long this_timer;
>> +                     this_timer = entry->ageing_timer + delay;
>> +                     if (time_before_eq(this_timer, jiffies))
>> +                             ipgre_tap_bridge_delete(entry);
>> +                     else if (time_before(this_timer, next_timer))
>> +                             next_timer = this_timer;
>> +             }
>> +     }
>> +     spin_unlock_bh(&tunnel->hash_lock);
>> +     mod_timer(&tunnel->gc_timer, round_jiffies(next_timer + HZ/4));
>
> wow... why setup a 250 ms timer, if entries are valid 300 seconds ?
no idea, it was in original linux bridge code
>
>
>
>> +}
>> +
>> +void ipgre_tap_bridge_flush(struct ip_tunnel *tunnel)
>> +{
>> +     int i;
>> +     spin_lock_bh(&tunnel->hash_lock);
>> +     for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) {
>> +             struct ipgre_tap_bridge_entry *entry;
>> +             struct hlist_node *h, *n;
>> +             hlist_for_each_entry_safe(entry, h, n,
>> +                     &tunnel->hash[i], hlist)
>> +             {
>> +                     ipgre_tap_bridge_delete(entry);
>> +             }
>> +     }
>> +     spin_unlock_bh(&tunnel->hash_lock);
>> +}
>> +
>> +#endif
>> +
>>  /* Tunnel hash table */
>>
>>  /*
>> @@ -563,6 +761,13 @@ static int ipgre_rcv(struct sk_buff *skb
>>       int    offset = 4;
>>       __be16 gre_proto;
>>
>> +#ifdef CONFIG_NET_IPGRE_BRIDGE
>> +     u32 orig_source;
>> +     struct hlist_head *head;
>> +     struct ipgre_tap_bridge_entry *entry;
>> +     struct ethhdr *tethhdr;
>> +#endif
>> +
>>       if (!pskb_may_pull(skb, 16))
>>               goto drop_nolock;
>>
>> @@ -659,10 +864,38 @@ static int ipgre_rcv(struct sk_buff *skb
>>                               tunnel->dev->stats.rx_errors++;
>>                               goto drop;
>>                       }
>> -
>> +#ifdef CONFIG_NET_IPGRE_BRIDGE
>> +                     orig_source = iph->saddr;
>> +#endif
>>                       iph = ip_hdr(skb);
>>                       skb->protocol = eth_type_trans(skb, tunnel->dev);
>>                       skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
>> +#ifdef CONFIG_NET_IPGRE_BRIDGE
>> +                     if (ipv4_is_multicast(tunnel->parms.iph.daddr)) {
>> +                             tethhdr = eth_hdr(skb);
>
>> +                             if ((tethhdr->h_source[0]&1) == 0) {
>
>        if (!is_multicast_ether_addr(tethhdr->h_source)) ?
>
fixed

>> +                                     head = &tunnel->hash[
>> +                                             ipgre_tap_bridge_hash(
>> +                                                     tethhdr->h_source)];
>> +                                     entry = ipgre_tap_bridge_find(head,
>> +                                             tethhdr->h_source);
>> +                                     if (likely(entry)) {
>> +                                             entry->raddr = orig_source;
>> +                                             entry->ageing_timer = jiffies;
>> +                                     } else {
>> +                                             spin_lock(&tunnel->hash_lock);
>> +                                             if (!ipgre_tap_bridge_find(head,
>> +                                                     tethhdr->h_source))
>> +                                                     ipgre_tap_bridge_create(
>> +                                                      head,
>> +                                                      orig_source,
>> +                                                      tethhdr->h_source,
>> +                                                      tunnel->dev);
>> +                                             spin_unlock(&tunnel->hash_lock);
>> +                                     }
>> +                             }
>> +                     }
>> +#endif
>>               }
>>
>>               tstats = this_cpu_ptr(tunnel->dev->tstats);
>> @@ -716,7 +949,17 @@ static netdev_tx_t ipgre_tunnel_xmit(str
>>               tiph = &tunnel->parms.iph;
>>       }
>>
>> +#ifdef CONFIG_NET_IPGRE_BRIDGE
>> +     if ((dev->type == ARPHRD_ETHER) && ipv4_is_multicast(
>
> please format like this :
>
>        if ((dev->type == ARPHRD_ETHER) &&
>            ipv4_is_multicast(tunnel->parms.iph.daddr))
>
fixed

>> +             tunnel->parms.iph.daddr))
>> +             dst = ipgre_tap_bridge_get_raddr(tunnel,
>> +                     ((struct ethhdr *)skb->data)->h_dest);
>> +     if (dst == 0)
>> +             dst = tiph->daddr;
>> +     if (dst == 0) {
>> +#else
>>       if ((dst = tiph->daddr) == 0) {
>> +#endif
>>               /* NBMA tunnel */
>>
>>               if (skb_dst(skb) == NULL) {
>> @@ -1209,6 +1452,16 @@ static int ipgre_open(struct net_device
>>                       return -EADDRNOTAVAIL;
>>               t->mlink = dev->ifindex;
>>               ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
>> +#ifdef CONFIG_NET_IPGRE_BRIDGE
>> +             if (t->dev->type == ARPHRD_ETHER) {
>> +                     INIT_HLIST_HEAD(t->hash);
>> +                     spin_lock_init(&t->hash_lock);
>> +                     t->ageing_time = 300 * HZ;
>> +                     setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup,
>> +                             (unsigned long) t);
>> +                     mod_timer(&t->gc_timer, jiffies + t->ageing_time);
>
>
>> +             }
>> +#endif
>>       }
>>       return 0;
>>  }
>> @@ -1219,6 +1472,12 @@ static int ipgre_close(struct net_device
>>
>>       if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
>>               struct in_device *in_dev;
>> +#ifdef CONFIG_NET_IPGRE_BRIDGE
>> +             if (t->dev->type == ARPHRD_ETHER) {
>> +                     ipgre_tap_bridge_flush(t);
>> +                     del_timer_sync(&t->gc_timer);
>> +             }
>> +#endif
>>               in_dev = inetdev_by_index(dev_net(dev), t->mlink);
>>               if (in_dev)
>>                       ip_mc_dec_group(in_dev, t->parms.iph.daddr);
>> @@ -1341,6 +1600,12 @@ static int __net_init ipgre_init_net(str
>>       struct ipgre_net *ign = net_generic(net, ipgre_net_id);
>>       int err;
>>
>> +#ifdef CONFIG_NET_IPGRE_BRIDGE
>> +     err = ipgre_tap_bridge_init();
>> +     if (err)
>> +             goto err_out;
>> +#endif
>> +
>>       ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
>>                                          ipgre_tunnel_setup);
>>       if (!ign->fb_tunnel_dev) {
>> @@ -1362,6 +1627,10 @@ static int __net_init ipgre_init_net(str
>>  err_reg_dev:
>>       ipgre_dev_free(ign->fb_tunnel_dev);
>>  err_alloc_dev:
>> +#ifdef CONFIG_NET_IPGRE_BRIDGE
>> +     ipgre_tap_bridge_fini();
>> +err_out:
>> +#endif
>>       return err;
>>  }
>>
>> @@ -1375,6 +1644,9 @@ static void __net_exit ipgre_exit_net(st
>>       ipgre_destroy_tunnels(ign, &list);
>>       unregister_netdevice_many(&list);
>>       rtnl_unlock();
>> +#ifdef CONFIG_NET_IPGRE_BRIDGE
>> +     ipgre_tap_bridge_fini();
>> +#endif
>>  }
>>
>>  static struct pernet_operations ipgre_net_ops = {
>>
>
>



-- 
Stefan Gula
CCNP, CCIP

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 14:05   ` Štefan Gula
@ 2012-01-16 14:25     ` Eric Dumazet
  2012-01-16 14:35       ` [PATCH] bridge: BH already disabled in br_fdb_cleanup() Eric Dumazet
  2012-01-16 14:47       ` [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP Štefan Gula
  0 siblings, 2 replies; 19+ messages in thread
From: Eric Dumazet @ 2012-01-16 14:25 UTC (permalink / raw)
  To: Štefan Gula
  Cc: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

Le lundi 16 janvier 2012 à 15:05 +0100, Štefan Gula a écrit :
> Dňa 16. januára 2012 14:19, Eric Dumazet <eric.dumazet@gmail.com> napísal/a:

> > Unfortunately you call ipgre_tap_bridge_get() from
> > ipgre_tap_bridge_get_raddr() but you dont release the refcount on
> > use_count. Leak in ipgre_tunnel_xmit()
> >
> >
> >> +__be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
> >> +     const unsigned char *addr)
> >> +{
> >> +     struct ipgre_tap_bridge_entry *entry;
> >> +     entry = ipgre_tap_bridge_get(tunnel, addr);
> >
> >        So maybe you want __ipgre_tap_bridge_get(tunnel, addr); here ?
> >
> Hmmm.. I am sorry, I am maybe not that expert on C coding, as most of
> the codes were copied from linux bridge code. Can you give me a hint
> how should I change that?

I dont see in net/bridge the code you copied.

Could you give more information ?



^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] bridge: BH already disabled in br_fdb_cleanup()
  2012-01-16 14:25     ` Eric Dumazet
@ 2012-01-16 14:35       ` Eric Dumazet
  2012-01-16 15:53         ` Stephen Hemminger
  2012-01-16 14:47       ` [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP Štefan Gula
  1 sibling, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2012-01-16 14:35 UTC (permalink / raw)
  To: Štefan Gula; +Cc: David S. Miller, netdev, Stephen Hemminger

br_fdb_cleanup() is run from timer interrupt, BH already masked.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Štefan Gula <steweg@gmail.com>
---
 net/bridge/br_fdb.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index f963f6b..5ba0c84 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -146,7 +146,7 @@ void br_fdb_cleanup(unsigned long _data)
 	unsigned long next_timer = jiffies + br->ageing_time;
 	int i;
 
-	spin_lock_bh(&br->hash_lock);
+	spin_lock(&br->hash_lock);
 	for (i = 0; i < BR_HASH_SIZE; i++) {
 		struct net_bridge_fdb_entry *f;
 		struct hlist_node *h, *n;
@@ -162,7 +162,7 @@ void br_fdb_cleanup(unsigned long _data)
 				next_timer = this_timer;
 		}
 	}
-	spin_unlock_bh(&br->hash_lock);
+	spin_unlock(&br->hash_lock);
 
 	mod_timer(&br->gc_timer, round_jiffies_up(next_timer));
 }

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 14:25     ` Eric Dumazet
  2012-01-16 14:35       ` [PATCH] bridge: BH already disabled in br_fdb_cleanup() Eric Dumazet
@ 2012-01-16 14:47       ` Štefan Gula
  2012-01-16 15:13         ` Eric Dumazet
  1 sibling, 1 reply; 19+ messages in thread
From: Štefan Gula @ 2012-01-16 14:47 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

Dňa 16. januára 2012 15:25, Eric Dumazet <eric.dumazet@gmail.com> napísal/a:
> Le lundi 16 janvier 2012 à 15:05 +0100, Štefan Gula a écrit :
>> Dňa 16. januára 2012 14:19, Eric Dumazet <eric.dumazet@gmail.com> napísal/a:
>
>> > Unfortunately you call ipgre_tap_bridge_get() from
>> > ipgre_tap_bridge_get_raddr() but you dont release the refcount on
>> > use_count. Leak in ipgre_tunnel_xmit()
>> >
>> >
>> >> +__be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel,
>> >> +     const unsigned char *addr)
>> >> +{
>> >> +     struct ipgre_tap_bridge_entry *entry;
>> >> +     entry = ipgre_tap_bridge_get(tunnel, addr);
>> >
>> >        So maybe you want __ipgre_tap_bridge_get(tunnel, addr); here ?
>> >
>> Hmmm.. I am sorry, I am maybe not that expert on C coding, as most of
>> the codes were copied from linux bridge code. Can you give me a hint
>> how should I change that?
>
> I dont see in net/bridge the code you copied.
>
> Could you give more information ?
>
>

File: net/bridge/br_fdb.c
Functions:
__br_fdb_get
br_fdb_get
My analogous functions:
__ipgre_tap_bridge_get
ipgre_tap_bridge_get

-- 
Stefan Gula

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 14:47       ` [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP Štefan Gula
@ 2012-01-16 15:13         ` Eric Dumazet
  2012-01-16 15:20           ` Štefan Gula
  0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2012-01-16 15:13 UTC (permalink / raw)
  To: Štefan Gula
  Cc: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

Le lundi 16 janvier 2012 à 15:47 +0100, Štefan Gula a écrit :

> File: net/bridge/br_fdb.c
> Functions:
> __br_fdb_get
> br_fdb_get
> My analogous functions:
> __ipgre_tap_bridge_get
> ipgre_tap_bridge_get
> 

There is no br_fdb_get(), you copied some buggy code I am afraid.

Read again your patch.

If you use atomic_inc_not_zero(&x->use_count), then you must do the
atomic_dec() or leak entries.

Current bridge code doesnt use atomic_inc_not_zero()




^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 15:13         ` Eric Dumazet
@ 2012-01-16 15:20           ` Štefan Gula
  0 siblings, 0 replies; 19+ messages in thread
From: Štefan Gula @ 2012-01-16 15:20 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

Dňa 16. januára 2012 16:13, Eric Dumazet <eric.dumazet@gmail.com> napísal/a:
> Le lundi 16 janvier 2012 à 15:47 +0100, Štefan Gula a écrit :
>
>> File: net/birdge/br_fdb.c
>> Functions:
>> __br_fdb_get
>> br_fdb_get
>> My analogy fuctions:
>> __ipgre_tap_bridge_get
>> ipgre_tap_bridge_get
>>
>
> There is no br_fdb_get(), you copied some buggy code I am afraid.
>
> Read again your patch.
>
> If you use atomic_inc_not_zero(&x->use_count), then you must do the
> atomic_dec() or leak entries.
>
> Current bridge code doesnt use atomic_inc_not_zero()
>
>
>
You are right, the new bridge code doesn't have such a function; I will adjust that

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] bridge: BH already disabled in br_fdb_cleanup()
  2012-01-16 14:35       ` [PATCH] bridge: BH already disabled in br_fdb_cleanup() Eric Dumazet
@ 2012-01-16 15:53         ` Stephen Hemminger
  2012-01-17 15:17           ` David Miller
  0 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2012-01-16 15:53 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Štefan Gula, David S. Miller, netdev

On Mon, 16 Jan 2012 15:35:50 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> br_fdb_cleanup() is run from timer interrupt, BH already masked.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> CC: Stephen Hemminger <shemminger@vyatta.com>
> CC: Štefan Gula <steweg@gmail.com>
> ---
>  net/bridge/br_fdb.c |    4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index f963f6b..5ba0c84 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
> @@ -146,7 +146,7 @@ void br_fdb_cleanup(unsigned long _data)
>  	unsigned long next_timer = jiffies + br->ageing_time;
>  	int i;
>  
> -	spin_lock_bh(&br->hash_lock);
> +	spin_lock(&br->hash_lock);
>  	for (i = 0; i < BR_HASH_SIZE; i++) {
>  		struct net_bridge_fdb_entry *f;
>  		struct hlist_node *h, *n;
> @@ -162,7 +162,7 @@ void br_fdb_cleanup(unsigned long _data)
>  				next_timer = this_timer;
>  		}
>  	}
> -	spin_unlock_bh(&br->hash_lock);
> +	spin_unlock(&br->hash_lock);
>  
>  	mod_timer(&br->gc_timer, round_jiffies_up(next_timer));
>  }
> 
> 

Acked-by: Stephen Hemminger <shemminger@vyatta.com>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 12:13 [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP Štefan Gula
  2012-01-16 13:19 ` Eric Dumazet
  2012-01-16 13:35 ` David Lamparter
@ 2012-01-16 16:36 ` Stephen Hemminger
  2012-01-16 17:26   ` Štefan Gula
  2 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2012-01-16 16:36 UTC (permalink / raw)
  To: Štefan Gula
  Cc: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

On Mon, 16 Jan 2012 13:13:19 +0100
Štefan Gula <steweg@gmail.com> wrote:

> From: Stefan Gula <steweg@gmail.com
> 
> This patch is an extension for current Ethernet over GRE
> implementation, which allows user to create virtual bridge (multipoint
> VPN) and forward traffic based on Ethernet MAC address informations in
> it. It simulates the Bridge bahaviour learing mechanism, but instead
> of learning port ID from which given MAC address comes, it learns IP
> address of peer which encapsulated given packet. Multicast, Broadcast
> and unknown-multicast traffic is send over network as multicast
> enacapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> represented as one single virtual switch on logical level and be also
> represented as one multicast IPv4 address on network level.
> 
> Signed-off-by: Stefan Gula <steweg@gmail.com>

Thanks for the effort, but it is duplicating existing functionality.
It possible to do this already with existing gretap device and the
current bridge.

The same thing is also supported by OpenVswitch.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 16:36 ` Stephen Hemminger
@ 2012-01-16 17:26   ` Štefan Gula
  2012-01-16 18:30     ` Stephen Hemminger
  2012-01-16 21:22     ` Jesse Gross
  0 siblings, 2 replies; 19+ messages in thread
From: Štefan Gula @ 2012-01-16 17:26 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

Dňa 16. januára 2012 17:36, Stephen Hemminger <shemminger@vyatta.com> napísal/a:
> On Mon, 16 Jan 2012 13:13:19 +0100
> Štefan Gula <steweg@gmail.com> wrote:
>
>> From: Stefan Gula <steweg@gmail.com
>>
>> This patch is an extension for current Ethernet over GRE
>> implementation, which allows user to create virtual bridge (multipoint
>> VPN) and forward traffic based on Ethernet MAC address informations in
>> it. It simulates the Bridge bahaviour learing mechanism, but instead
>> of learning port ID from which given MAC address comes, it learns IP
>> address of peer which encapsulated given packet. Multicast, Broadcast
>> and unknown-multicast traffic is send over network as multicast
>> enacapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
>> represented as one single virtual switch on logical level and be also
>> represented as one multicast IPv4 address on network level.
>>
>> Signed-off-by: Stefan Gula <steweg@gmail.com>
>
> Thanks for the effort, but it is duplicating existing functionality.
> It possible to do this already with existing gretap device and the
> current bridge.
>
> The same thing is also supported by OpenVswitch.
>

gretap with a bridge will not do the same, as gretap only allows you to
encapsulate L2 frames inside GRE - that part is actually
utilized in my code. The GRE multipoint implementation is utilized in
my code as well. But what is missing is the forwarding logic, without
which traffic takes non-optimal paths. Scenario one - e.g. if you
connect 3 sites using 1 multipoint gretap VPN, frames
between site 1 and site 2 are always delivered to all sites, even if they are unicast.
That represents a waste of bandwidth for site 3. Now assume that there
will be more than 40 sites, and I hope you see that a single current
multipoint gretap is also not a good solution here

The second scenario - e.g. using 3 sites with point-to-point gretap
interfaces between each pair of sites (2 gretap VPN interfaces per site) and
bridging those interfaces with the real ones results in a looped topology,
which needs to utilize STP inside to prevent loops. Once STP
converges, the topology will look like this: traffic from site 1 to
site 2 will always go directly by way of unicast (at the GRE level),
from site 2 to site 3 always directly by way of unicast (at the GRE
level), but from site 1 to site 3 it will go indirectly through site 2 due
to STP limitations, which results in more non-optimal traffic
flows. Now assume that the number of sites rises, so gretap plus the standard
bridge code is also not a good solution here.

My code extends the gretap multipoint interface with forwarding
logic, e.g. using 3 sites, each site uses only one gretap VPN
interface, and if the destination MAC address is known to the bridge
code inside the gretap interface forwarding logic, the frame is
forwarded only towards the VPN endpoint that actually needs it, by way
of unicasting on the GRE level. On the other hand, if the destination
MAC address is unknown, or the destination MAC address is L2 multicast
or L2 broadcast, then the frame is spread out through multicasting on
the GRE level, providing a delivery mechanism analogous to standard
switches on top of the multipoint GRE tunnels.

I also went briefly through the Open vSwitch documentation and found
that it is more related to virtualization inside the box, like VMware
switches, and not to technologies interconnecting two or more separate
segments over a routed L3 infrastructure - there is a mention of the
CAPWAP UDP transport, but this is more related to WiFi implementations
than generic ones. My patch also doesn't need any special userspace
API to be configured. It utilizes the existing one.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 17:26   ` Štefan Gula
@ 2012-01-16 18:30     ` Stephen Hemminger
  2012-01-16 18:38       ` Štefan Gula
  2012-01-16 21:22     ` Jesse Gross
  1 sibling, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2012-01-16 18:30 UTC (permalink / raw)
  To: Štefan Gula
  Cc: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

On Mon, 16 Jan 2012 18:26:57 +0100
Štefan Gula <steweg@gmail.com> wrote:

> Dňa 16. januára 2012 17:36, Stephen Hemminger <shemminger@vyatta.com> napísal/a:
> > On Mon, 16 Jan 2012 13:13:19 +0100
> > Štefan Gula <steweg@gmail.com> wrote:
> >
> >> From: Stefan Gula <steweg@gmail.com
> >>
> >> This patch is an extension for current Ethernet over GRE
> >> implementation, which allows user to create virtual bridge (multipoint
> >> VPN) and forward traffic based on Ethernet MAC address informations in
> >> it. It simulates the Bridge bahaviour learing mechanism, but instead
> >> of learning port ID from which given MAC address comes, it learns IP
> >> address of peer which encapsulated given packet. Multicast, Broadcast
> >> and unknown-multicast traffic is send over network as multicast
> >> enacapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
> >> represented as one single virtual switch on logical level and be also
> >> represented as one multicast IPv4 address on network level.
> >>
> >> Signed-off-by: Stefan Gula <steweg@gmail.com>
> >
> > Thanks for the effort, but it is duplicating existing functionality.
> > It possible to do this already with existing gretap device and the
> > current bridge.
> >
> > The same thing is also supported by OpenVswitch.
> >
> 
> gretap with bridge will not do the same as gretap allows you to only
> encapsulate L2 frames inside the GRE - this one part is actually
> utilized in my code. GRE multipoint implementation is also utilized in
> my code as well. But what is missing is forwarding logic here, which
> prevents the traffic going not optimal way. Scenario one - e.g. if you
> connect through 3 sites with using 1 gretap multipoint VPN, it always
> forwards frames between site 1 and site 2 even if they are unicast.
> That represents waste of bandwidth for site 3. Now assume that there
> will be more than 40 sites and I hope you see that single current
> multipoint gretap is not also good solution here
> 
> The second scenario - e.g. using 3 sites using point-to-point gretap
> interfaces between each 2 sites (2 gretap VPN interfaces per site) and
> bridging those interfaces with real ones results in looped topology
> which needs to utilized STP inside to prevent loops. Once STP
> converges the topology will looks like this, traffic from site 1 to
> site 2 will go always directly by the way of unicast (on GRE level),
> from site 2 to site 3 always directly by the way of unicast (on GRE
> level) and from site 1 to site 3 will go indirectly through site 2 due
> STP limitations, which results in another not optimalized traffic
> flows. Now assume that the number of sites rises, so gretap+standard
> bridge code is also not a good solution here.
> 
> My code utilizes it that way that I have extended the gretap
> multipoint interface with the forwarding logic e.g. using 3 sites,
> each site uses only one gretap VPN interface and if destination MAC
> address is known to bridge code inside the gretap interface forwarding
> logic, it forwards it towards only VPN endpoint that actually need
> that by the way of unicasting on GRE level. On the other hand if the
> destination MAC address is unknown or destination MAC address is L2
> multicast or L2 broadcast than the frame is spread out through
> multicasting on GRE level, providing delivery mechanism analogous to
> standard switches on top of the multipoint GRE tunnels.

Couldn't this be controlled from user space either by programming
the FDB with netlink or doing alternative version of STP?


> I also get through briefly over OpenVswitch documentation and found
> that it is more related to virtualization inside the box like VMware
> switches or so and not to such technologies interconnecting two or
> more separate segments over routed L3 infrastructure - there is a
> mention about the CAPWAP UDP transport but this is more related to
> WiFi implementations than generic ones. My patch also doesn't need any
> special userspace api to be configured. It utilizes the existing one.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 18:30     ` Stephen Hemminger
@ 2012-01-16 18:38       ` Štefan Gula
  0 siblings, 0 replies; 19+ messages in thread
From: Štefan Gula @ 2012-01-16 18:38 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Alexey Kuznetsov, David S. Miller, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev, linux-kernel

2012/1/16 Stephen Hemminger <shemminger@vyatta.com>:
> On Mon, 16 Jan 2012 18:26:57 +0100
> Štefan Gula <steweg@gmail.com> wrote:
>
>> Dňa 16. januára 2012 17:36, Stephen Hemminger <shemminger@vyatta.com> napísal/a:
>> > On Mon, 16 Jan 2012 13:13:19 +0100
>> > Štefan Gula <steweg@gmail.com> wrote:
>> >
>> >> From: Stefan Gula <steweg@gmail.com
>> >>
>> >> This patch is an extension for current Ethernet over GRE
>> >> implementation, which allows user to create virtual bridge (multipoint
>> >> VPN) and forward traffic based on Ethernet MAC address informations in
>> >> it. It simulates the Bridge bahaviour learing mechanism, but instead
>> >> of learning port ID from which given MAC address comes, it learns IP
>> >> address of peer which encapsulated given packet. Multicast, Broadcast
>> >> and unknown-multicast traffic is send over network as multicast
>> >> enacapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
>> >> represented as one single virtual switch on logical level and be also
>> >> represented as one multicast IPv4 address on network level.
>> >>
>> >> Signed-off-by: Stefan Gula <steweg@gmail.com>
>> >
>> > Thanks for the effort, but it is duplicating existing functionality.
>> > It possible to do this already with existing gretap device and the
>> > current bridge.
>> >
>> > The same thing is also supported by OpenVswitch.
>> >
>>
>> gretap with bridge will not do the same as gretap allows you to only
>> encapsulate L2 frames inside the GRE - this one part is actually
>> utilized in my code. GRE multipoint implementation is also utilized in
>> my code as well. But what is missing is forwarding logic here, which
>> prevents the traffic going not optimal way. Scenario one - e.g. if you
>> connect through 3 sites with using 1 gretap multipoint VPN, it always
>> forwards frames between site 1 and site 2 even if they are unicast.
>> That represents waste of bandwidth for site 3. Now assume that there
>> will be more than 40 sites and I hope you see that single current
>> multipoint gretap is not also good solution here
>>
>> The second scenario - e.g. using 3 sites using point-to-point gretap
>> interfaces between each 2 sites (2 gretap VPN interfaces per site) and
>> bridging those interfaces with real ones results in looped topology
>> which needs to utilized STP inside to prevent loops. Once STP
>> converges the topology will looks like this, traffic from site 1 to
>> site 2 will go always directly by the way of unicast (on GRE level),
>> from site 2 to site 3 always directly by the way of unicast (on GRE
>> level) and from site 1 to site 3 will go indirectly through site 2 due
>> STP limitations, which results in another not optimalized traffic
>> flows. Now assume that the number of sites rises, so gretap+standard
>> bridge code is also not a good solution here.
>>
>> My code utilizes it that way that I have extended the gretap
>> multipoint interface with the forwarding logic e.g. using 3 sites,
>> each site uses only one gretap VPN interface and if destination MAC
>> address is known to bridge code inside the gretap interface forwarding
>> logic, it forwards it towards only VPN endpoint that actually need
>> that by the way of unicasting on GRE level. On the other hand if the
>> destination MAC address is unknown or destination MAC address is L2
>> multicast or L2 broadcast than the frame is spread out through
>> multicasting on GRE level, providing delivery mechanism analogous to
>> standard switches on top of the multipoint GRE tunnels.
>
> Couldn't this be controlled from user space either by programming
> the FDB with netlink or doing alternative version of STP?
For a certain small number of clients, yes; for many clients it is an
administrative nightmare, e.g. I have in my network more than 98 VPN
endpoints and 4k users constantly migrating from one site to another.
This solution provides me a flexible way to do this, with no
administrative work.
>
>
>> I also get through briefly over OpenVswitch documentation and found
>> that it is more related to virtualization inside the box like VMware
>> switches or so and not to such technologies interconnecting two or
>> more separate segments over routed L3 infrastructure - there is a
>> mention about the CAPWAP UDP transport but this is more related to
>> WiFi implementations than generic ones. My patch also doesn't need any
>> special userspace api to be configured. It utilizes the existing one.
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 17:26   ` Štefan Gula
  2012-01-16 18:30     ` Stephen Hemminger
@ 2012-01-16 21:22     ` Jesse Gross
  2012-01-16 23:40       ` Štefan Gula
  1 sibling, 1 reply; 19+ messages in thread
From: Jesse Gross @ 2012-01-16 21:22 UTC (permalink / raw)
  To: Štefan Gula
  Cc: Stephen Hemminger, Alexey Kuznetsov, David S. Miller,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, netdev,
	linux-kernel

2012/1/16 Štefan Gula <steweg@gmail.com>:
> Dňa 16. januára 2012 17:36, Stephen Hemminger <shemminger@vyatta.com> napísal/a:
>> On Mon, 16 Jan 2012 13:13:19 +0100
>> Štefan Gula <steweg@gmail.com> wrote:
>>
>>> From: Stefan Gula <steweg@gmail.com
>>>
>>> This patch is an extension for current Ethernet over GRE
>>> implementation, which allows user to create virtual bridge (multipoint
>>> VPN) and forward traffic based on Ethernet MAC address informations in
>>> it. It simulates the Bridge bahaviour learing mechanism, but instead
>>> of learning port ID from which given MAC address comes, it learns IP
>>> address of peer which encapsulated given packet. Multicast, Broadcast
>>> and unknown-multicast traffic is send over network as multicast
>>> enacapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
>>> represented as one single virtual switch on logical level and be also
>>> represented as one multicast IPv4 address on network level.
>>>
>>> Signed-off-by: Stefan Gula <steweg@gmail.com>
>>
>> Thanks for the effort, but it is duplicating existing functionality.
>> It possible to do this already with existing gretap device and the
>> current bridge.
>>
>> The same thing is also supported by OpenVswitch.
>>
>
> gretap with bridge will not do the same as gretap allows you to only
> encapsulate L2 frames inside the GRE - this one part is actually
> utilized in my code. GRE multipoint implementation is also utilized in
> my code as well. But what is missing is forwarding logic here, which
> prevents the traffic going not optimal way. Scenario one - e.g. if you
> connect through 3 sites with using 1 gretap multipoint VPN, it always
> forwards frames between site 1 and site 2 even if they are unicast.
> That represents waste of bandwidth for site 3. Now assume that there
> will be more than 40 sites and I hope you see that single current
> multipoint gretap is not also good solution here
>
> The second scenario - e.g. using 3 sites using point-to-point gretap
> interfaces between each 2 sites (2 gretap VPN interfaces per site) and
> bridging those interfaces with real ones results in looped topology
> which needs to utilized STP inside to prevent loops. Once STP
> converges the topology will looks like this, traffic from site 1 to
> site 2 will go always directly by the way of unicast (on GRE level),
> from site 2 to site 3 always directly by the way of unicast (on GRE
> level) and from site 1 to site 3 will go indirectly through site 2 due
> STP limitations, which results in another not optimalized traffic
> flows. Now assume that the number of sites rises, so gretap+standard
> bridge code is also not a good solution here.
>
> My code utilizes it that way that I have extended the gretap
> multipoint interface with the forwarding logic e.g. using 3 sites,
> each site uses only one gretap VPN interface and if destination MAC
> address is known to bridge code inside the gretap interface forwarding
> logic, it forwards it towards only VPN endpoint that actually need
> that by the way of unicasting on GRE level. On the other hand if the
> destination MAC address is unknown or destination MAC address is L2
> multicast or L2 broadcast than the frame is spread out through
> multicasting on GRE level, providing delivery mechanism analogous to
> standard switches on top of the multipoint GRE tunnels.
>
> I also get through briefly over OpenVswitch documentation and found
> that it is more related to virtualization inside the box like VMware
> switches or so and not to such technologies interconnecting two or
> more separate segments over routed L3 infrastructure - there is a
> mention about the CAPWAP UDP transport but this is more related to
> WiFi implementations than generic ones. My patch also doesn't need any
> special userspace api to be configured. It utilizes the existing one.

I understand what you're trying to do and I think that the goal makes
sense but I agree with Stephen that this is not the right way to go
about it.  I see two issues:

 * It copies a lot of bridge code, making it unmaintainable and
inflexible to other use cases.
 * The implementation exists in the GRE protocol stack but it applies
equally to other tunneling protocols as well (VXLAN comes to mind).

Open vSwitch doesn't quite do this level of learning yet but it's the
direction that we want to move in (and there's nothing particularly
virtualization specific about it).  What I think makes the most sense
is to create some internal interfaces to the GRE stack that exposes
the information needed to do learning.  That way there is only one
instance of the protocol code for each tunneling mechanism and then
each way of managing those addresses (i.e. the current device-based
mechanism, Open vSwitch, potentially a direct bridge-based mechanism,
etc.) can be reused as well.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP
  2012-01-16 21:22     ` Jesse Gross
@ 2012-01-16 23:40       ` Štefan Gula
  0 siblings, 0 replies; 19+ messages in thread
From: Štefan Gula @ 2012-01-16 23:40 UTC (permalink / raw)
  To: Jesse Gross
  Cc: Stephen Hemminger, Alexey Kuznetsov, David S. Miller,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, netdev,
	linux-kernel

Dňa 16. januára 2012 22:22, Jesse Gross <jesse@nicira.com> napísal/a:
> 2012/1/16 Štefan Gula <steweg@gmail.com>:
>> Dňa 16. januára 2012 17:36, Stephen Hemminger <shemminger@vyatta.com> napísal/a:
>>> On Mon, 16 Jan 2012 13:13:19 +0100
>>> Štefan Gula <steweg@gmail.com> wrote:
>>>
>>>> From: Stefan Gula <steweg@gmail.com
>>>>
>>>> This patch is an extension for current Ethernet over GRE
>>>> implementation, which allows user to create virtual bridge (multipoint
>>>> VPN) and forward traffic based on Ethernet MAC address informations in
>>>> it. It simulates the Bridge bahaviour learing mechanism, but instead
>>>> of learning port ID from which given MAC address comes, it learns IP
>>>> address of peer which encapsulated given packet. Multicast, Broadcast
>>>> and unknown-multicast traffic is send over network as multicast
>>>> enacapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be
>>>> represented as one single virtual switch on logical level and be also
>>>> represented as one multicast IPv4 address on network level.
>>>>
>>>> Signed-off-by: Stefan Gula <steweg@gmail.com>
>>>
>>> Thanks for the effort, but it is duplicating existing functionality.
>>> It possible to do this already with existing gretap device and the
>>> current bridge.
>>>
>>> The same thing is also supported by OpenVswitch.
>>>
>>
>> gretap with bridge will not do the same as gretap allows you to only
>> encapsulate L2 frames inside the GRE - this one part is actually
>> utilized in my code. GRE multipoint implementation is also utilized in
>> my code as well. But what is missing is forwarding logic here, which
>> prevents the traffic going not optimal way. Scenario one - e.g. if you
>> connect through 3 sites with using 1 gretap multipoint VPN, it always
>> forwards frames between site 1 and site 2 even if they are unicast.
>> That represents waste of bandwidth for site 3. Now assume that there
>> will be more than 40 sites and I hope you see that single current
>> multipoint gretap is not also good solution here
>>
>> The second scenario - e.g. using 3 sites using point-to-point gretap
>> interfaces between each 2 sites (2 gretap VPN interfaces per site) and
>> bridging those interfaces with real ones results in looped topology
>> which needs to utilized STP inside to prevent loops. Once STP
>> converges the topology will looks like this, traffic from site 1 to
>> site 2 will go always directly by the way of unicast (on GRE level),
>> from site 2 to site 3 always directly by the way of unicast (on GRE
>> level) and from site 1 to site 3 will go indirectly through site 2 due
>> STP limitations, which results in another not optimalized traffic
>> flows. Now assume that the number of sites rises, so gretap+standard
>> bridge code is also not a good solution here.
>>
>> My code utilizes it that way that I have extended the gretap
>> multipoint interface with the forwarding logic e.g. using 3 sites,
>> each site uses only one gretap VPN interface and if destination MAC
>> address is known to bridge code inside the gretap interface forwarding
>> logic, it forwards it towards only VPN endpoint that actually need
>> that by the way of unicasting on GRE level. On the other hand if the
>> destination MAC address is unknown or destination MAC address is L2
>> multicast or L2 broadcast than the frame is spread out through
>> multicasting on GRE level, providing delivery mechanism analogous to
>> standard switches on top of the multipoint GRE tunnels.
>>
>> I also get through briefly over OpenVswitch documentation and found
>> that it is more related to virtualization inside the box like VMware
>> switches or so and not to such technologies interconnecting two or
>> more separate segments over routed L3 infrastructure - there is a
>> mention about the CAPWAP UDP transport but this is more related to
>> WiFi implementations than generic ones. My patch also doesn't need any
>> special userspace api to be configured. It utilizes the existing one.
>
> I understand what you're trying to do and I think that the goal makes
> sense but I agree with Stephen that this is not the right way to go
> about it.  I see two issues:
>
>  * It copies a lot of bridge code, making it unmaintainable and
> inflexible to other use cases.
>  * The implementation exists in the GRE protocol stack but it applies
> equally to other tunneling protocols as well (VXLAN comes to mind).
>
> Open vSwitch doesn't quite do this level of learning yet but it's the
> direction that we want to move in (and there's nothing particularly
> virtualization specific about it).  What I think makes the most sense
> is to create some internal interfaces to the GRE stack that exposes
> the information needed to do learning.  That way there is only one
> instance of the protocol code for each tunneling mechanism and then
> each way of managing those addresses (i.e. the current device-based
> mechanism, Open vSwitch, potentially a direct bridge-based mechanism,
> etc.) can be reused as well.

I agree with you that this approach of copying bridge code with
some modifications is maybe not flexible enough, but on the other hand
it does the job needed. It also doesn't break any previous
compatibility of usage of gretap interfaces, as it modifies the
encapsulation and decapsulation parts of the code (how the traffic goes to
and from remote destinations), which doesn't require any change to the
internal linkage code inside the box (e.g. linking that gretap
interface to a standard logical bridge interface is still fully
possible). I would rather see a standard set of generic
bridge code, which can be reused anywhere in the network stack, but for
now this is the only way I know how to do it. Exposing the GRE
interfaces will not do the job, as what I needed was to rebuild the
bridge logic to allow learning endpoint IPs instead of network port
IDs (it's almost the same as using multiple gretap interfaces inside
one bridge interface). To regain maintainability, I would
assume redesigning the bridge code is the best way here, but I am not
a good enough coder to do it myself. So if anybody is interested in
this, feel free to do it.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] bridge: BH already disabled in br_fdb_cleanup()
  2012-01-16 15:53         ` Stephen Hemminger
@ 2012-01-17 15:17           ` David Miller
  0 siblings, 0 replies; 19+ messages in thread
From: David Miller @ 2012-01-17 15:17 UTC (permalink / raw)
  To: shemminger; +Cc: eric.dumazet, steweg, netdev

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 16 Jan 2012 07:53:49 -0800

> On Mon, 16 Jan 2012 15:35:50 +0100
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
>> br_fdb_cleanup() is run from timer interrupt, BH already masked.
>> 
>> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
 ...
> Acked-by: Stephen Hemminger <shemminger@vyatta.com>

Applied, thanks.

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2012-01-17 15:17 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-01-16 12:13 [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP Štefan Gula
2012-01-16 13:19 ` Eric Dumazet
2012-01-16 13:30   ` David Laight
2012-01-16 13:42     ` Štefan Gula
2012-01-16 14:05   ` Štefan Gula
2012-01-16 14:25     ` Eric Dumazet
2012-01-16 14:35       ` [PATCH] bridge: BH already disabled in br_fdb_cleanup() Eric Dumazet
2012-01-16 15:53         ` Stephen Hemminger
2012-01-17 15:17           ` David Miller
2012-01-16 14:47       ` [patch v1, kernel version 3.2.1] net/ipv4/ip_gre: Ethernet multipoint GRE over IP Štefan Gula
2012-01-16 15:13         ` Eric Dumazet
2012-01-16 15:20           ` Štefan Gula
2012-01-16 13:35 ` David Lamparter
2012-01-16 16:36 ` Stephen Hemminger
2012-01-16 17:26   ` Štefan Gula
2012-01-16 18:30     ` Stephen Hemminger
2012-01-16 18:38       ` Štefan Gula
2012-01-16 21:22     ` Jesse Gross
2012-01-16 23:40       ` Štefan Gula

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.