All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2.6] iptables CLUSTERIP target
@ 2004-10-20 22:38 Harald Welte
  2004-10-20 23:18 ` YOSHIFUJI Hideaki / 吉藤英明
                   ` (3 more replies)
  0 siblings, 4 replies; 29+ messages in thread
From: Harald Welte @ 2004-10-20 22:38 UTC (permalink / raw)
  To: David Miller; +Cc: Linux Netdev List, Netfilter Development Mailinglist, lmb


[-- Attachment #1.1: Type: text/plain, Size: 1115 bytes --]

Hi Dave!

This is the second patch, adding the 'CLUSTERIP' target to iptables. It
depends on the first 'CONNMARK' patch.

This enables you to build a static load sharing cluster between multiple
nodes - without the requirement to have a load balancer.  It uses a
series of [evil] tricks like replying with linklayer multicast addresses
to ARP requests, and using CONNMARK for stateful blocking all traffic
not intended for the local node.

Apart from the usual netfilter-specific file additions and
Kconfig/Makefile patches, this needs to export proc_file_operations in
order to get the reference counting of certain data objects right.  I
hope this change is acceptable.

Signed-off-by: Harald Welte <laforge@netfilter.org>

-- 
- Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

[-- Attachment #1.2: 2.6.9-clusterip.patch --]
[-- Type: text/plain, Size: 24501 bytes --]

diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff linux-2.6.9-connmark/fs/proc/generic.c linux-2.6.9-connmark-clusterip/fs/proc/generic.c
--- linux-2.6.9-connmark/fs/proc/generic.c	2004-10-18 23:55:29.000000000 +0200
+++ linux-2.6.9-connmark-clusterip/fs/proc/generic.c	2004-10-21 00:22:29.123398903 +0200
@@ -34,7 +34,7 @@
 	return !memcmp(name, de->name, len);
 }
 
-static struct file_operations proc_file_operations = {
+struct file_operations proc_file_operations = {
 	.llseek		= proc_file_lseek,
 	.read		= proc_file_read,
 	.write		= proc_file_write,
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff linux-2.6.9-connmark/fs/proc/root.c linux-2.6.9-connmark-clusterip/fs/proc/root.c
--- linux-2.6.9-connmark/fs/proc/root.c	2004-10-18 23:54:55.000000000 +0200
+++ linux-2.6.9-connmark-clusterip/fs/proc/root.c	2004-10-21 00:22:29.124398868 +0200
@@ -162,3 +162,4 @@
 EXPORT_SYMBOL(proc_net_stat);
 EXPORT_SYMBOL(proc_bus);
 EXPORT_SYMBOL(proc_root_driver);
+EXPORT_SYMBOL(proc_file_operations);
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff linux-2.6.9-connmark/include/linux/proc_fs.h linux-2.6.9-connmark-clusterip/include/linux/proc_fs.h
--- linux-2.6.9-connmark/include/linux/proc_fs.h	2004-10-18 23:55:36.000000000 +0200
+++ linux-2.6.9-connmark-clusterip/include/linux/proc_fs.h	2004-10-21 00:22:29.125398833 +0200
@@ -117,6 +117,7 @@
 extern int proc_readdir(struct file *, void *, filldir_t);
 extern struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
 
+extern struct file_operations proc_file_operations;
 extern struct file_operations proc_kcore_operations;
 extern struct file_operations proc_kmsg_operations;
 extern struct file_operations ppc_htab_operations;
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff linux-2.6.9-connmark/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h linux-2.6.9-connmark-clusterip/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
--- linux-2.6.9-connmark/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.9-connmark-clusterip/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h	2004-10-21 00:22:29.094399911 +0200
@@ -0,0 +1,32 @@
+#ifndef _IPT_CLUSTERIP_H_target
+#define _IPT_CLUSTERIP_H_target
+
+enum clusterip_hashmode {
+    CLUSTERIP_HASHMODE_SIP = 0,
+    CLUSTERIP_HASHMODE_SIP_SPT,
+    CLUSTERIP_HASHMODE_SIP_SPT_DPT,
+};
+
+#define CLUSTERIP_HASHMODE_MAX CLUSTERIP_HASHMODE_SIP_SPT_DPT
+
+#define CLUSTERIP_MAX_NODES 8
+
+#define CLUSTERIP_FLAG_NEW 0x00000001
+
+struct clusterip_config;
+
+struct ipt_clusterip_tgt_info {
+
+	u_int32_t flags;
+	struct clusterip_config *config;
+	
+	/* only relevant for new ones */
+	u_int8_t clustermac[6];
+	u_int16_t num_total_nodes;
+	u_int16_t num_local_nodes;
+	u_int16_t local_nodes[CLUSTERIP_MAX_NODES];
+	enum clusterip_hashmode hash_mode;
+	u_int32_t hash_initval;
+};
+
+#endif /*_IPT_CLUSTERIP_H_target*/
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff linux-2.6.9-connmark/net/ipv4/netfilter/Kconfig linux-2.6.9-connmark-clusterip/net/ipv4/netfilter/Kconfig
--- linux-2.6.9-connmark/net/ipv4/netfilter/Kconfig	2004-10-21 00:16:30.830850002 +0200
+++ linux-2.6.9-connmark-clusterip/net/ipv4/netfilter/Kconfig	2004-10-21 00:30:17.802111752 +0200
@@ -628,6 +628,16 @@
 	  Documentation/modules.txt.  The module will be called
 	  ipt_CONNMARK.o.  If unsure, say `N'.
 
+config IP_NF_TARGET_CLUSTERIP
+	tristate "CLUSTERIP target support (EXPERIMENTAL)"
+	depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL
+	help
+	  The CLUSTERIP target allows you to build load-balancing clusters of
+	  network servers without having a dedicated load-balancing
+	  router/server/switch.
+	
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 # raw + specific targets
 config IP_NF_RAW
 	tristate  'raw table support (required for NOTRACK/TRACE)'
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff linux-2.6.9-connmark/net/ipv4/netfilter/Makefile linux-2.6.9-connmark-clusterip/net/ipv4/netfilter/Makefile
--- linux-2.6.9-connmark/net/ipv4/netfilter/Makefile	2004-10-20 23:59:36.368103807 +0200
+++ linux-2.6.9-connmark-clusterip/net/ipv4/netfilter/Makefile	2004-10-21 00:23:57.927312860 +0200
@@ -86,6 +86,7 @@
 obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
 obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
 obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
+obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
 
 # generic ARP tables
 obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff linux-2.6.9-connmark/net/ipv4/netfilter/ipt_CLUSTERIP.c linux-2.6.9-connmark-clusterip/net/ipv4/netfilter/ipt_CLUSTERIP.c
--- linux-2.6.9-connmark/net/ipv4/netfilter/ipt_CLUSTERIP.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.9-connmark-clusterip/net/ipv4/netfilter/ipt_CLUSTERIP.c	2004-10-21 00:35:03.690176796 +0200
@@ -0,0 +1,712 @@
+/* Cluster IP hashmark target 
+ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ * based on ideas of Fabio Olive Leite <olive@unixforge.org>
+ *
+ * Development of this code funded by SuSE Linux AG, http://www.suse.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/proc_fs.h>
+#include <linux/jhash.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+
+#include <net/checksum.h>
+
+#include <linux/netfilter_arp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
+#define CLUSTERIP_VERSION "0.5"
+
+#define DEBUG_CLUSTERIP
+
+#ifdef DEBUG_CLUSTERIP
+#define DEBUGP	printk
+#else
+#define DEBUGP
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables target for CLUSTERIP");
+
+struct clusterip_config {
+	struct list_head list;			/* list of all configs */
+	atomic_t refcount;			/* reference count */
+
+	u_int32_t clusterip;			/* the IP address */
+	u_int8_t clustermac[ETH_ALEN];		/* the MAC address */
+	struct net_device *dev;			/* device */
+	u_int16_t num_total_nodes;		/* total number of nodes */
+	u_int16_t num_local_nodes;		/* number of local nodes */
+	u_int16_t local_nodes[CLUSTERIP_MAX_NODES];	/* node number array */
+
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *pde;		/* proc dir entry */
+#endif
+	enum clusterip_hashmode hash_mode;	/* which hashing mode */
+	u_int32_t hash_initval;			/* hash initialization */
+};
+
+static LIST_HEAD(clusterip_configs);
+
+/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
+ * data within all structures (num_local_nodes, local_nodes[]) */
+DECLARE_RWLOCK(clusterip_lock);
+
+#ifdef CONFIG_PROC_FS
+static struct file_operations clusterip_proc_fops;
+static struct proc_dir_entry *clusterip_procdir;
+static int clusterip_proc_open(struct inode *inode, struct file *file);
+static int clusterip_proc_release(struct inode *inode, struct file *file);
+static int clusterip_proc_read(char *buffer, char **start, off_t offset, 
+				int length, int *eof, void *data);
+static int clusterip_proc_write(struct file *file, const char *input, 
+				unsigned long size, void *data);
+
+#endif
+
+static inline void
+clusterip_config_get(struct clusterip_config *c) {
+	atomic_inc(&c->refcount);
+}
+
+static inline void
+clusterip_config_put(struct clusterip_config *c) {
+	if (atomic_dec_and_test(&c->refcount)) {
+		WRITE_LOCK(&clusterip_lock);
+		list_del(&c->list);
+		WRITE_UNLOCK(&clusterip_lock);
+		dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
+		dev_put(c->dev);
+		kfree(c);
+	}
+}
+
+
+static struct clusterip_config *
+__clusterip_config_find(u_int32_t clusterip)
+{
+	struct list_head *pos;
+
+	MUST_BE_READ_LOCKED(&clusterip_lock);
+	list_for_each(pos, &clusterip_configs) {
+		struct clusterip_config *c = list_entry(pos, 
+					struct clusterip_config, list);
+		if (c->clusterip == clusterip) {
+			return c;
+		}
+	}
+
+	return NULL;
+}
+
+static inline struct clusterip_config *
+clusterip_config_find_get(u_int32_t clusterip)
+{
+	struct clusterip_config *c;
+
+	READ_LOCK(&clusterip_lock);
+	c = __clusterip_config_find(clusterip);
+	if (!c) {
+		READ_UNLOCK(&clusterip_lock);
+		return NULL;
+	}
+	atomic_inc(&c->refcount);
+	READ_UNLOCK(&clusterip_lock);
+
+	return c;
+}
+
+static struct clusterip_config *
+clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
+			struct net_device *dev)
+{
+	struct clusterip_config *c;
+	char buffer[16];
+	/* create + list a config for 'ip' from rule info 'i'; NULL on failure */
+	c = kmalloc(sizeof(*c), GFP_ATOMIC);
+	if (!c)
+		return NULL;
+	/* copy the rule parameters; local_nodes[] is copied by array size */
+	memset(c, 0, sizeof(*c));
+	c->dev = dev;
+	c->clusterip = ip;
+	memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
+	c->num_total_nodes = i->num_total_nodes;
+	c->num_local_nodes = i->num_local_nodes;
+	memcpy(&c->local_nodes, &i->local_nodes, sizeof(c->local_nodes));
+	c->hash_mode = i->hash_mode;
+	c->hash_initval = i->hash_initval;
+	atomic_set(&c->refcount, 1);	/* initial reference goes to the caller */
+
+#ifdef CONFIG_PROC_FS
+	/* create proc dir entry, named after the dotted-quad cluster IP */
+	sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip));
+	c->pde = create_proc_entry(buffer, 0,  clusterip_procdir);
+	if (!c->pde) {
+		kfree(c);
+		return NULL;
+	}
+	c->pde->owner = THIS_MODULE;
+	c->pde->data = c;
+	c->pde->read_proc = clusterip_proc_read;
+	c->pde->write_proc = clusterip_proc_write;
+	c->pde->proc_fops = &clusterip_proc_fops;
+#endif
+
+	WRITE_LOCK(&clusterip_lock);
+	list_add(&c->list, &clusterip_configs);
+	WRITE_UNLOCK(&clusterip_lock);
+
+	return c;
+}
+
+static int
+clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+	int i;
+	/* add nodenum to c->local_nodes[]; returns 0 on success, 1 otherwise */
+	WRITE_LOCK(&clusterip_lock);
+
+	if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
+	    || nodenum > CLUSTERIP_MAX_NODES) {
+		WRITE_UNLOCK(&clusterip_lock);
+		return 1;
+	}
+
+	/* check if we already have this number in our array */
+	for (i = 0; i < c->num_local_nodes; i++) {
+		if (c->local_nodes[i] == nodenum) {
+			WRITE_UNLOCK(&clusterip_lock);
+			return 1;
+		}
+	}
+
+	c->local_nodes[c->num_local_nodes++] = nodenum;
+
+	WRITE_UNLOCK(&clusterip_lock);
+	return 0;
+}
+
+static int
+clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+	int i;
+
+	WRITE_LOCK(&clusterip_lock);
+
+	if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
+		WRITE_UNLOCK(&clusterip_lock);
+		return 1;
+	}
+		
+	for (i = 0; i < c->num_local_nodes; i++) {
+		if (c->local_nodes[i] == nodenum) {
+			int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
+			memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
+			c->num_local_nodes--;
+			WRITE_UNLOCK(&clusterip_lock);
+			return 0;
+		}
+	}
+
+	WRITE_UNLOCK(&clusterip_lock);
+	return 1;
+}
+
+static inline u_int32_t
+clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
+{
+	struct iphdr *iph = skb->nh.iph;
+	unsigned long hashval;
+	u_int16_t sport, dport;
+	struct tcphdr *th;
+	struct udphdr *uh;
+	struct icmphdr *ih;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		th = (void *)iph+iph->ihl*4;
+		sport = ntohs(th->source);
+		dport = ntohs(th->dest);
+		break;
+	case IPPROTO_UDP:
+		uh = (void *)iph+iph->ihl*4;
+		sport = ntohs(uh->source);
+		dport = ntohs(uh->dest);
+		break;
+	case IPPROTO_ICMP:
+		ih = (void *)iph+iph->ihl*4;
+		sport = ntohs(ih->un.echo.id);
+		dport = (ih->type<<8)|ih->code;
+		break;
+	default:
+		if (net_ratelimit()) {
+			printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n",
+				iph->protocol);
+		}
+		sport = dport = 0;
+	}
+
+	switch (config->hash_mode) {
+	case CLUSTERIP_HASHMODE_SIP:
+		hashval = jhash_1word(ntohl(iph->saddr),
+				      config->hash_initval);
+		break;
+	case CLUSTERIP_HASHMODE_SIP_SPT:
+		hashval = jhash_2words(ntohl(iph->saddr), sport, 
+				       config->hash_initval);
+		break;
+	case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
+		hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
+				       config->hash_initval);
+		break;
+	default:
+		/* to make gcc happy */
+		hashval = 0;
+		/* This cannot happen, unless the check function wasn't called
+		 * at rule load time */
+		printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode);
+		BUG();
+		break;
+	}
+
+	/* node numbers are 1..n, not 0..n */
+	return ((hashval % config->num_total_nodes)+1);
+}
+
+static inline int
+clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
+{
+	int i;
+
+	READ_LOCK(&clusterip_lock);
+
+	if (config->num_local_nodes == 0) {
+		READ_UNLOCK(&clusterip_lock);
+		return 0;
+	}
+
+	for (i = 0; i < config->num_local_nodes; i++) {
+		if (config->local_nodes[i] == hash) {
+			READ_UNLOCK(&clusterip_lock);
+			return 1;
+		}
+	}
+
+	READ_UNLOCK(&clusterip_lock);
+
+	return 0;
+}
+
+/*********************************************************************** 
+ * IPTABLES TARGET 
+ ***********************************************************************/
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
+	u_int32_t hash;
+	/* hash the packet; NF_DROP it unless a local node owns that hash */
+	/* don't need to clusterip_config_get() here, since refcount
+	 * is only decremented by destroy() - and ip_tables guarantees
+	 * that the ->target() function isn't called after ->destroy() */
+
+	if (!ct) {
+		printk(KERN_ERR "CLUSTERIP: no conntrack!\n");
+			/* FIXME: need to drop invalid ones, since replies
+			 * to outgoing connections of other nodes will be
+			 * marked as INVALID */
+		return NF_DROP;
+	}
+
+	/* special case: ICMP errors. conntrack marks error messages RELATED
+	 * (in either direction), unlike information requests (see below) */
+	if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
+	    && (ctinfo == IP_CT_RELATED
+		|| ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
+		return IPT_CONTINUE;
+
+	/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
+	 * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
+	 * on, which all have an ID field [relevant for hashing]. */
+
+	hash = clusterip_hashfn(*pskb, cipinfo->config);
+
+	switch (ctinfo) {
+		case IP_CT_NEW:
+			ct->mark = hash;
+			break;
+		case IP_CT_RELATED:
+		case IP_CT_RELATED+IP_CT_IS_REPLY:
+			/* FIXME: we don't handle expectations at the
+			 * moment.  they can arrive on a different node than
+			 * the master connection (e.g. FTP passive mode) */
+		case IP_CT_ESTABLISHED:
+		case IP_CT_ESTABLISHED+IP_CT_IS_REPLY:
+			break;
+		default:
+			break;
+	}
+
+#ifdef DEBUG_CLUSTERP
+	DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+#endif
+	DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark);
+	if (!clusterip_responsible(cipinfo->config, hash)) {
+		DEBUGP("not responsible\n");
+		return NF_DROP;
+	}
+	DEBUGP("responsible\n");
+
+	/* despite being received via linklayer multicast, this is
+	 * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
+	(*pskb)->pkt_type = PACKET_HOST;
+
+	return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	struct ipt_clusterip_tgt_info *cipinfo = targinfo;
+
+	struct clusterip_config *config;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info))) {
+		printk(KERN_WARNING "CLUSTERIP: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info)));
+		return 0;
+	}
+
+	if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
+	    cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
+	    cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
+		printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n",
+			cipinfo->hash_mode);
+		return 0;
+
+	}
+	if (e->ip.dmsk.s_addr != 0xffffffff
+	    || e->ip.dst.s_addr == 0) {
+		printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n");
+		return 0;
+	}
+
+	/* FIXME: further sanity checks */
+
+	config = clusterip_config_find_get(e->ip.dst.s_addr);
+	if (!config) {
+		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
+			printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr));
+			return 0;
+		} else {
+			struct net_device *dev;
+
+			if (e->ip.iniface[0] == '\0') {
+				printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n");
+				return 0;
+			}
+
+			dev = dev_get_by_name(e->ip.iniface);
+			if (!dev) {
+				printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
+				return 0;
+			}
+
+			config = clusterip_config_init(cipinfo, 
+							e->ip.dst.s_addr, dev);
+			if (!config) {
+				printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n");
+				dev_put(dev);
+				return 0;
+			}
+			dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0);
+		}
+	}
+
+	cipinfo->config = config;
+
+	return 1;
+}
+
+/* drop reference count of cluster config when rule is deleted */
+static void destroy(void *matchinfo, unsigned int matchinfosize)
+{
+	struct ipt_clusterip_tgt_info *cipinfo = matchinfo;
+
+	/* we first remove the proc entry and then drop the reference
+	 * count.  In case anyone still accesses the file, the open/close
+	 * functions are also incrementing the refcount on their own */
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry(cipinfo->config->pde->name,
+			  cipinfo->config->pde->parent);
+#endif
+	clusterip_config_put(cipinfo->config);
+}
+
+static struct ipt_target clusterip_tgt = { 
+	.name = "CLUSTERIP",
+	.target = &target, 
+	.checkentry = &checkentry, 
+	.destroy = &destroy,
+	.me = THIS_MODULE
+};
+
+
+/*********************************************************************** 
+ * ARP MANGLING CODE 
+ ***********************************************************************/
+
+/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
+struct arp_payload {
+	u_int8_t src_hw[ETH_ALEN];
+	u_int32_t src_ip;
+	u_int8_t dst_hw[ETH_ALEN];
+	u_int32_t dst_ip;
+} __attribute__ ((packed));
+
+#ifdef CLUSTERIP_DEBUG
+static void arp_print(struct arp_payload *payload) 
+{
+#define HBUFFERLEN 30
+	char hbuffer[HBUFFERLEN];
+	int j,k;
+	const char hexbuf[]= "0123456789abcdef";
+
+	for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
+		hbuffer[k++]=hexbuf[(payload->src_hw[j]>>4)&15];
+		hbuffer[k++]=hexbuf[payload->src_hw[j]&15];
+		hbuffer[k++]=':';
+	}
+	hbuffer[--k]='\0';
+
+	printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n", 
+		NIPQUAD(payload->src_ip), hbuffer,
+		NIPQUAD(payload->dst_ip));
+}
+#endif
+
+static unsigned int
+arp_mangle(unsigned int hook,
+	   struct sk_buff **pskb,
+	   const struct net_device *in,
+	   const struct net_device *out,
+	   int (*okfn)(struct sk_buff *))
+{
+	struct arphdr *arp = (*pskb)->nh.arph;
+	struct arp_payload *payload;
+	struct clusterip_config *c;
+	/* ARP replies for a cluster IP get the cluster MAC as sender hw addr */
+	/* we don't care about non-ethernet and non-ipv4 ARP */
+	if (arp->ar_hrd != htons(ARPHRD_ETHER)
+	    || arp->ar_pro != htons(ETH_P_IP)
+	    || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
+		return NF_ACCEPT;
+
+	/* we only want to mangle arp replies */
+	if (arp->ar_op != htons(ARPOP_REPLY))
+		return NF_ACCEPT;
+
+	payload = (void *)(arp+1);
+
+	/* if there is no clusterip configuration for the arp reply's
+	 * source ip, we don't want to mangle it */
+	c = clusterip_config_find_get(payload->src_ip);
+	if (!c)
+		return NF_ACCEPT;
+
+	/* normally the linux kernel always replies to arp queries of
+	 * addresses on different interfaces.  However, in the CLUSTERIP case
+	 * this wouldn't work, since we didn't subscribe the mcast group on
+	 * other interfaces */
+	if (c->dev != out) {
+		DEBUGP("CLUSTERIP: not mangling arp reply on different "
+		       "interface: cip'%s'-skb'%s'\n", c->dev->name, out->name);
+		clusterip_config_put(c);
+		return NF_ACCEPT;
+	}
+
+	/* mangle reply hardware address */
+	memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
+
+#ifdef CLUSTERIP_DEBUG /* NOTE(review): file defines DEBUG_CLUSTERIP; confirm which is meant */
+	DEBUGP(KERN_DEBUG "CLUSTERIP mangled arp reply: ");
+	arp_print(payload);
+#endif
+
+	clusterip_config_put(c);	/* drop the reference taken by find_get */
+
+	return NF_ACCEPT;
+}
+
+static struct nf_hook_ops cip_arp_ops = {
+	.hook = arp_mangle,
+	.pf = NF_ARP,
+	.hooknum = NF_ARP_OUT,
+	.priority = -1
+};
+
+/*********************************************************************** 
+ * PROC DIR HANDLING 
+ ***********************************************************************/
+
+#ifdef CONFIG_PROC_FS
+
+static int clusterip_proc_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *pde = PDE(inode);
+	struct clusterip_config *c = pde->data;
+
+	clusterip_config_get(c);
+
+	return 0;
+}
+
+static int clusterip_proc_release(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *pde = PDE(inode);
+	struct clusterip_config *c = pde->data;
+
+	clusterip_config_put(c);
+
+	return 0;
+}
+
+static int 
+clusterip_proc_read(char *buffer, char **start, off_t offset, int length, 
+		    int *eof, void *data)
+{
+	struct clusterip_config *c = data;
+	int i, len = 0;
+
+	READ_LOCK(&clusterip_lock);
+	for (i = 0; i < c->num_local_nodes; i++) {
+		len += sprintf(buffer+len, "%u,", c->local_nodes[i]);
+	}
+	READ_UNLOCK(&clusterip_lock);
+
+	if (len >= 1)
+		*(buffer+len-1) = '\n';
+	
+	if (length >= len)
+		*eof = 1;
+
+	return len;
+}
+
+static int
+clusterip_proc_write(struct file *file, const char *input,
+		     unsigned long size, void *data)
+{
+	#define PROC_WRITELEN	10
+	char buffer[PROC_WRITELEN+1] = "";	/* zeroed: always terminated */
+	struct clusterip_config *c = data;
+	unsigned long  nodenum;
+	/* "+N" adds local node N, "-N" removes it; returns size on success */
+	if (copy_from_user(buffer, input,
+			   size < PROC_WRITELEN ? size : PROC_WRITELEN))
+		return -EFAULT;
+	if (*buffer == '+') {
+		nodenum = simple_strtoul(buffer+1, NULL, 10);
+		if (clusterip_add_node(c, nodenum))
+			return -ENOMEM;
+	} else if (*buffer == '-') {
+		nodenum = simple_strtoul(buffer+1, NULL, 10);
+		if (clusterip_del_node(c, nodenum))
+			return -ENOENT;
+	} else
+		return -EIO;
+
+	return size;
+}
+#endif /* CONFIG_PROC_FS */
+
+static int init_or_cleanup(int fini)
+{
+	int ret = -EINVAL;
+	/* fini == 0: register everything; fini != 0: unregister it all */
+	if (fini)
+		goto cleanup;
+
+	if (ipt_register_target(&clusterip_tgt)) {
+		ret = -EINVAL;
+		goto cleanup_none;
+	}
+
+	if (nf_register_hook(&cip_arp_ops) < 0) {
+		ret = -EINVAL;
+		goto cleanup_target;
+	}
+
+#ifdef CONFIG_PROC_FS
+	memcpy(&clusterip_proc_fops, &proc_file_operations, sizeof(clusterip_proc_fops));
+	clusterip_proc_fops.owner = THIS_MODULE;
+	clusterip_proc_fops.open = clusterip_proc_open;
+	clusterip_proc_fops.release = clusterip_proc_release;
+	clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net);
+	if (!clusterip_procdir) {
+		printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n");
+		ret = -ENOMEM;
+		goto cleanup_hook;
+	}
+#endif /* CONFIG_PROC_FS */
+
+	printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n",
+		CLUSTERIP_VERSION);
+
+	return 0;
+
+cleanup:
+	printk(KERN_NOTICE "ClusterIP Version %s unloading\n",
+		CLUSTERIP_VERSION);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
+#endif
+cleanup_hook:
+	nf_unregister_hook(&cip_arp_ops);
+cleanup_target:
+	ipt_unregister_target(&clusterip_tgt);
+cleanup_none:
+	return ret;	/* error code of the failed step; ignored by fini() */
+}
+
+static int __init init(void)
+{
+	return init_or_cleanup(0);
+}
+
+static void __exit fini(void)
+{
+	init_or_cleanup(1);
+}
+
+module_init(init);
+module_exit(fini);

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target
  2004-10-20 22:38 [PATCH 2.6] iptables CLUSTERIP target Harald Welte
@ 2004-10-20 23:18 ` YOSHIFUJI Hideaki / 吉藤英明
  2004-10-21  4:40   ` David S. Miller
  2004-10-21  7:44 ` Christoph Hellwig
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 29+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2004-10-20 23:18 UTC (permalink / raw)
  To: laforge; +Cc: yoshfuji, netdev, netfilter-devel, lmb

In article <20041020223828.GP19899@sunbeam.de.gnumonks.org> (at Thu, 21 Oct 2004 00:38:28 +0200), Harald Welte <laforge@netfilter.org> says:

> Apart from the usual netfilter-specific file additions and
> Kconfig/Makefile patches, this needs to export proc_file_operations in
> order to get the reference counting of certain data objects right.  I
> hope this change is acceptable.

Please use seq_file instead.  Thanks.

--yoshfuji

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target
  2004-10-20 23:18 ` YOSHIFUJI Hideaki / 吉藤英明
@ 2004-10-21  4:40   ` David S. Miller
  0 siblings, 0 replies; 29+ messages in thread
From: David S. Miller @ 2004-10-21  4:40 UTC (permalink / raw)
  To: yoshfuji; +Cc: laforge, netdev, yoshfuji, netfilter-devel, lmb

On Thu, 21 Oct 2004 08:18:37 +0900 (JST)
YOSHIFUJI Hideaki / 吉藤英明 <yoshfuji@linux-ipv6.org> wrote:

> In article <20041020223828.GP19899@sunbeam.de.gnumonks.org> (at Thu, 21 Oct 2004 00:38:28 +0200), Harald Welte <laforge@netfilter.org> says:
> 
> > Apart from the usual netfilter-specific file additions and
> > Kconfig/Makefile patches, this needs to export proc_file_operations in
> > order to get the reference counting of certain data objects right.  I
> > hope this change is acceptable.
> 
> Please use seq_file instead.  Thanks.

Also, if not, we need to get upstream approval for exporting
proc_file_operations.

Harald, if it is not possible to use seq_file here for some
reason, please post the proc_file_operations exporting patch
(with a suitable description of course) to linux-kernel

Meanwhile this patch is on hold until this is resolved.

Thanks.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target
  2004-10-20 22:38 [PATCH 2.6] iptables CLUSTERIP target Harald Welte
  2004-10-20 23:18 ` YOSHIFUJI Hideaki / 吉藤英明
@ 2004-10-21  7:44 ` Christoph Hellwig
  2004-10-21  7:55 ` Christoph Hellwig
  2004-10-21 16:36 ` [PATCH 2.6] iptables CLUSTERIP target, seq_file version Harald Welte
  3 siblings, 0 replies; 29+ messages in thread
From: Christoph Hellwig @ 2004-10-21  7:44 UTC (permalink / raw)
  To: Harald Welte, David Miller, Linux Netdev List,
	Netfilter Development Mailinglist, lmb

On Thu, Oct 21, 2004 at 12:38:28AM +0200, Harald Welte wrote:
> Hi Dave!
> 
> This is the second patch, adding the 'CLUSTERIP' target to iptables. It
> depends on the first 'CONNMARK' patch.
> 
> This enables you to build a static load sharing cluster between multiple
> nodes - without the requirement to have a load balancer.  It uses a
> series of [evil] tricks like replying with linklayer multicast addresses
> to ARP requests, and using CONNMARK for stateful blocking all traffic
> not intended for the local node.
> 
> Apart from the usual netfilter-specific file additions and
> Kconfig/Makefile patches, this needs to export proc_file_operations in
> order to get the reference counting of certain data objects right.  I
> hope this change is acceptable.

The export is totally bogus.  If you need to do fancy things procfs is
the wrong interface.  Care to explain why exactly you think you need it?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target
  2004-10-20 22:38 [PATCH 2.6] iptables CLUSTERIP target Harald Welte
  2004-10-20 23:18 ` YOSHIFUJI Hideaki / 吉藤英明
  2004-10-21  7:44 ` Christoph Hellwig
@ 2004-10-21  7:55 ` Christoph Hellwig
  2004-10-21  9:12   ` Harald Welte
  2004-10-21 16:36 ` [PATCH 2.6] iptables CLUSTERIP target, seq_file version Harald Welte
  3 siblings, 1 reply; 29+ messages in thread
From: Christoph Hellwig @ 2004-10-21  7:55 UTC (permalink / raw)
  To: Harald Welte, Linux Netdev List, Netfilter Development Mailinglist

btw, please stop crossposting to public and private lists, or even better
allow posting to netfilter-devel without subscription like every sane list
allows to.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target
  2004-10-21  7:55 ` Christoph Hellwig
@ 2004-10-21  9:12   ` Harald Welte
  2004-10-21  9:40     ` Herbert Xu
  2004-10-21 11:05     ` bert hubert
  0 siblings, 2 replies; 29+ messages in thread
From: Harald Welte @ 2004-10-21  9:12 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Linux Netdev List, Netfilter Development Mailinglist

[-- Attachment #1: Type: text/plain, Size: 1019 bytes --]

On Thu, Oct 21, 2004 at 08:55:30AM +0100, Christoph Hellwig wrote:
> btw, please stop crossposting to public and private lists, or even better
> allow posting to netfilter-devel without subscription like every sane list
> allows to.

We allow posting, your posting will just await moderator approval.  Our
listmaster usually checks more than once per day.

Developers who happen to post more often (like davem) will be added to
an explicit sender filter.

Unfortunately we didn't find any other reliable way to catch all the
spam we receive, sorry.  And yes, we already run spamassassin, as well
as mimetype-based and HTML filters.

-- 
- Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target
  2004-10-21  9:12   ` Harald Welte
@ 2004-10-21  9:40     ` Herbert Xu
  2004-10-21 11:05     ` bert hubert
  1 sibling, 0 replies; 29+ messages in thread
From: Herbert Xu @ 2004-10-21  9:40 UTC (permalink / raw)
  To: Harald Welte; +Cc: hch, netdev

Harald Welte <laforge@netfilter.org> wrote:
> 
> We allow posting, your posting will just await moderator approval.  Our
> listmaster usuall checks more than once per day.  

Could you at least turn off the notification sent to the poster?

It seems ironic that in an attempt to prevent spam from entering
your list that you're spamming the posters instead.  As it is
I'll be taking your list off any CC list that I'm on.
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target
  2004-10-21  9:12   ` Harald Welte
  2004-10-21  9:40     ` Herbert Xu
@ 2004-10-21 11:05     ` bert hubert
  2004-10-21 13:03       ` Harald Welte
  1 sibling, 1 reply; 29+ messages in thread
From: bert hubert @ 2004-10-21 11:05 UTC (permalink / raw)
  To: Harald Welte, Christoph Hellwig, Linux Netdev List

Is the CLUSTERIP target going anywhere after the (needlessly harsh) rebukes
posted here?

I note with glee that CLUSTERIP implements (and improves) the evil ideas of
http://lartc.org/autoloadbalance.php3

And yes, I've received hate mail over this from switch engineers :-)

Good luck!

-- 
http://www.PowerDNS.com      Open source, database driven DNS Software 
http://lartc.org           Linux Advanced Routing & Traffic Control HOWTO

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target
  2004-10-21 11:05     ` bert hubert
@ 2004-10-21 13:03       ` Harald Welte
  2004-10-21 13:33         ` Lars Marowsky-Bree
  0 siblings, 1 reply; 29+ messages in thread
From: Harald Welte @ 2004-10-21 13:03 UTC (permalink / raw)
  To: bert hubert, Christoph Hellwig, Linux Netdev List, lmb

[-- Attachment #1: Type: text/plain, Size: 1721 bytes --]

On Thu, Oct 21, 2004 at 01:05:13PM +0200, bert hubert wrote:
> Is the CLUSTERIP target going anywhere after the (needlessly harsh) rebukes
> posted here?

Yes, I think so.  I'm in the process of investigating why I didn't use
seq_file at the time I implemented it (some 1.5 years ago, IIRC).
Unfortunately I don't really remember all the issues involved :(  If I'm
not mistaken, I even posted some questions in this regard to lkml.

> I note with glee that CLUSTERIP implements (and improves) the evil ideas of
> http://lartc.org/autoloadbalance.php3

I didn't know about that page, but yes, indeed.  I first was informed
about this approach by Fabio Olive Leite, who was one of my colleagues
at Conectiva (where I was 2001).  He presented this approach at the
Linux Kongress 2002:
http://www.linux-kongress.org/2002/papers/lk2002-leite.html

Later SuSE approached me if I was interested in implementing that idea,
and that's how CLUSTERIP happened.  If I'm not mistaken SuSE is already
shipping this (at least in some beta version?) - I don't really know but
maybe lmb can shed some light on this issue.

> And yes, I've received hate mail over this from switch engineers :-)

Why is that?  They have to deal with multicast traffic, too...  and I
don't really see how this is any different.

> Good luck!

Thanks ;)

-- 
- Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target
  2004-10-21 13:03       ` Harald Welte
@ 2004-10-21 13:33         ` Lars Marowsky-Bree
  2004-10-21 14:25           ` Harald Welte
  0 siblings, 1 reply; 29+ messages in thread
From: Lars Marowsky-Bree @ 2004-10-21 13:33 UTC (permalink / raw)
  To: Harald Welte, bert hubert, Christoph Hellwig, Linux Netdev List

On 2004-10-21T15:03:27, Harald Welte <laforge@netfilter.org> wrote:

> > I note with glee that CLUSTERIP implements (and improves) the evil ideas of
> > http://lartc.org/autoloadbalance.php3
> 
> I didn't know about that page, but yes, indeed.  I first was informed
> about this approach by Fabio Olive Leite, who was one of my colleagues
> at Conectiva (where I was 2001).  He presented this approach at the
> Linux Kongress 2002:
> http://www.linux-kongress.org/2002/papers/lk2002-leite.html
> 
> Later SuSE approached me if I was interested in implementing that idea,
> and that's how CLUSTERIP happened.  If I'm not mistaken SuSE is already
> shipping this (at least in some beta version?) - I don't really know but
> maybe lmb can shed some light on this issue.

Yeah, I learned about it from Fabio, thought it would be a cool idea,
and eventually we sponsored Harald to make it happen. It's shipping in
SLES9, and we're adding full support for it in the 2.0 heartbeat cluster
resource manager. 

It's meant to complement the LVS / mod_backhand based approaches.

If the proc file IO for communicating with the module goes away in
favour of a commandline based add/list/remove/etc interface, that would
just help to tidy up our code, so it's OK with me.

> > And yes, I've received hate mail over this from switch engineers :-)
> Why is that?  They have to deal with multicast traffic, too...  and I
> don't really see how this is any different.

Yeah, but with this they get a whole lot more ;-)


Sincerely,
    Lars Marowsky-Brée <lmb@suse.de>

-- 
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX AG - A Novell company

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target
  2004-10-21 13:33         ` Lars Marowsky-Bree
@ 2004-10-21 14:25           ` Harald Welte
  2004-10-21 15:08             ` bert hubert
  2004-10-21 21:31             ` Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target) Herbert Xu
  0 siblings, 2 replies; 29+ messages in thread
From: Harald Welte @ 2004-10-21 14:25 UTC (permalink / raw)
  To: Lars Marowsky-Bree; +Cc: bert hubert, Christoph Hellwig, Linux Netdev List

[-- Attachment #1: Type: text/plain, Size: 2043 bytes --]

On Thu, Oct 21, 2004 at 03:33:46PM +0200, Lars Marowsky-Bree wrote:

> If the proc file IO for communicating with the module goes away in
> favour of a commandline based add/list/remove/etc interface, that would
> just help to tidy up our code, so it's OK with me.

I'm not a big fan of inventing new kernel/userspace interfaces.  We
don't have any associated device, so we don't have any ioctl()s or stuff
like that.

For supporting two primitive operations, adding a new netlink address
family also isn't worthwhile, especially since we're short of netlink
families.

Using the iptables getsockopt/setsockopt interface also is not possible,
since it cannot be extended.

So instead of introducing a new syscall, I think /proc is just the right
way to deal with this :)

I've now converted it to use seq_file, still need to do some testing.
The initial reason not to use seq_file was that 

1) I thought seq_file files cannot be writeable.  
2) We don't actually need to dump a table with dozens of entries, just
   a small number of integer numbers, thus seq_file seemed a bit like
   overkill.

This assumption '1' seems to be wrong, I now provide my own write
function to struct file_operations.

Let me give it some testing, I'll re-submit it later today or tomorrow.

> > > And yes, I've received hate mail over this from switch engineers :-)
> > Why is that?  They have to deal with multicast traffic, too...  and I
> > don't really see how this is any different.
> 
> Yeah, but with this they get a whole lot more ;-)

I didn't read any ethernet-related specification that said it is safe to
assume nobody uses multicast.

-- 
- Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target
  2004-10-21 14:25           ` Harald Welte
@ 2004-10-21 15:08             ` bert hubert
  2004-10-21 21:31             ` Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target) Herbert Xu
  1 sibling, 0 replies; 29+ messages in thread
From: bert hubert @ 2004-10-21 15:08 UTC (permalink / raw)
  To: Harald Welte; +Cc: Lars Marowsky-Bree, Christoph Hellwig, Linux Netdev List

On Thu, Oct 21, 2004 at 04:25:27PM +0200, Harald Welte wrote:

> I'm not a big fan of inventing new kernel/userspace interfaces.  We
> don't have any associated device, so we don't have any ioctl()s or stuff
> like that.

Or use netlink.

> So instead of introducing a new syscall, I think /proc is just the right
> way to deal with this :)

Or even more modern, clusteripfs.

> Let me give it some testing, I'll re-submit it later today or tomorrow.

I'll test it at home to see if it does the right thing too.

> > > > And yes, I've received hate mail over this from switch engineers :-)

The hate mail originated from a large telco, I think the innovation just
offended their sensitive minds.

	Bert

-- 
http://www.PowerDNS.com      Open source, database driven DNS Software 
http://lartc.org           Linux Advanced Routing & Traffic Control HOWTO

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 2.6] iptables CLUSTERIP target, seq_file version
  2004-10-20 22:38 [PATCH 2.6] iptables CLUSTERIP target Harald Welte
                   ` (2 preceding siblings ...)
  2004-10-21  7:55 ` Christoph Hellwig
@ 2004-10-21 16:36 ` Harald Welte
  2004-10-21 17:44   ` jamal
  2004-10-22  5:52   ` David S. Miller
  3 siblings, 2 replies; 29+ messages in thread
From: Harald Welte @ 2004-10-21 16:36 UTC (permalink / raw)
  To: David Miller, Linux Netdev List, Netfilter Development Mailinglist, lmb


[-- Attachment #1.1: Type: text/plain, Size: 1169 bytes --]

On Thu, Oct 21, 2004 at 12:38:28AM +0200, Harald Welte wrote:
Hi Dave!

This is the 'CLUSTERIP' target for iptables, this time no core kernel
changes required, please apply.

This enables you to build a static load sharing cluster between multiple
nodes - without the requirement to have a load balancer.  It uses a
series of [evil] tricks like replying with linklayer multicast addresses
to ARP requests, and using CONNMARK for stateful blocking all traffic
not intended for the local node.

It consists only of the usual netfilter-specific file additions and
Kconfig/Makefile patches.  Now that the proc interface is built on
seq_file, exporting proc_file_operations is no longer needed to get the
reference counting of certain data objects right.

Signed-off-by: Harald Welte <laforge@netfilter.org>

-- 
- Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

[-- Attachment #1.2: 2.6.9-clusterip.patch --]
[-- Type: text/plain, Size: 23519 bytes --]

diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff linux-2.6.9-connmark/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h linux-2.6.9-connmark-clusterip/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
--- linux-2.6.9-connmark/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.9-connmark-clusterip/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h	2004-10-21 00:22:29.000000000 +0200
@@ -0,0 +1,32 @@
+#ifndef _IPT_CLUSTERIP_H_target
+#define _IPT_CLUSTERIP_H_target
+
+enum clusterip_hashmode {
+    CLUSTERIP_HASHMODE_SIP = 0,
+    CLUSTERIP_HASHMODE_SIP_SPT,
+    CLUSTERIP_HASHMODE_SIP_SPT_DPT,
+};
+
+#define CLUSTERIP_HASHMODE_MAX CLUSTERIP_HASHMODE_SIP_SPT_DPT
+
+#define CLUSTERIP_MAX_NODES 16
+
+#define CLUSTERIP_FLAG_NEW 0x00000001
+
+struct clusterip_config;
+
+struct ipt_clusterip_tgt_info {
+
+	u_int32_t flags;
+	struct clusterip_config *config;
+	
+	/* only relevant for new ones */
+	u_int8_t clustermac[6];
+	u_int16_t num_total_nodes;
+	u_int16_t num_local_nodes;
+	u_int16_t local_nodes[CLUSTERIP_MAX_NODES];
+	enum clusterip_hashmode hash_mode;
+	u_int32_t hash_initval;
+};
+
+#endif /*_IPT_CLUSTERIP_H_target*/
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff linux-2.6.9-connmark/net/ipv4/netfilter/Kconfig linux-2.6.9-connmark-clusterip/net/ipv4/netfilter/Kconfig
--- linux-2.6.9-connmark/net/ipv4/netfilter/Kconfig	2004-10-21 00:16:30.000000000 +0200
+++ linux-2.6.9-connmark-clusterip/net/ipv4/netfilter/Kconfig	2004-10-21 00:30:17.000000000 +0200
@@ -628,6 +628,16 @@
 	  Documentation/modules.txt.  The module will be called
 	  ipt_CONNMARK.o.  If unsure, say `N'.
 
+config IP_NF_TARGET_CLUSTERIP
+	tristate "CLUSTERIP target support (EXPERIMENTAL)"
+	depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL
+	help
+	  The CLUSTERIP target allows you to build load-balancing clusters of
+	  network servers without having a dedicated load-balancing
+	  router/server/switch.
+	
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 # raw + specific targets
 config IP_NF_RAW
 	tristate  'raw table support (required for NOTRACK/TRACE)'
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff linux-2.6.9-connmark/net/ipv4/netfilter/Makefile linux-2.6.9-connmark-clusterip/net/ipv4/netfilter/Makefile
--- linux-2.6.9-connmark/net/ipv4/netfilter/Makefile	2004-10-20 23:59:36.000000000 +0200
+++ linux-2.6.9-connmark-clusterip/net/ipv4/netfilter/Makefile	2004-10-21 00:23:57.000000000 +0200
@@ -86,6 +86,7 @@
 obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
 obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
 obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
+obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
 
 # generic ARP tables
 obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff linux-2.6.9-connmark/net/ipv4/netfilter/ipt_CLUSTERIP.c linux-2.6.9-connmark-clusterip/net/ipv4/netfilter/ipt_CLUSTERIP.c
--- linux-2.6.9-connmark/net/ipv4/netfilter/ipt_CLUSTERIP.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.9-connmark-clusterip/net/ipv4/netfilter/ipt_CLUSTERIP.c	2004-10-21 18:28:30.195525212 +0200
@@ -0,0 +1,760 @@
+/* Cluster IP hashmark target 
+ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ * based on ideas of Fabio Olive Leite <olive@unixforge.org>
+ *
+ * Development of this code funded by SuSE Linux AG, http://www.suse.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/proc_fs.h>
+#include <linux/jhash.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <net/checksum.h>
+
+#include <linux/netfilter_arp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
+#define CLUSTERIP_VERSION "0.6"
+
+#define DEBUG_CLUSTERIP
+
+#ifdef DEBUG_CLUSTERIP
+#define DEBUGP	printk
+#else
+#define DEBUGP
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables target for CLUSTERIP");
+
+struct clusterip_config {
+	struct list_head list;			/* list of all configs */
+	atomic_t refcount;			/* reference count */
+
+	u_int32_t clusterip;			/* the IP address */
+	u_int8_t clustermac[ETH_ALEN];		/* the MAC address */
+	struct net_device *dev;			/* device */
+	u_int16_t num_total_nodes;		/* total number of nodes */
+	u_int16_t num_local_nodes;		/* number of local nodes */
+	u_int16_t local_nodes[CLUSTERIP_MAX_NODES];	/* node number array */
+
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *pde;		/* proc dir entry */
+#endif
+	enum clusterip_hashmode hash_mode;	/* which hashing mode */
+	u_int32_t hash_initval;			/* hash initialization */
+};
+
+static LIST_HEAD(clusterip_configs);
+
+/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
+ * data within all structures (num_local_nodes, local_nodes[]) */
+DECLARE_RWLOCK(clusterip_lock);
+
+#ifdef CONFIG_PROC_FS
+static struct file_operations clusterip_proc_fops;
+static struct proc_dir_entry *clusterip_procdir;
+#endif
+
+static inline void
+clusterip_config_get(struct clusterip_config *c) {
+	atomic_inc(&c->refcount);
+}
+
+static inline void
+clusterip_config_put(struct clusterip_config *c) {
+	if (atomic_dec_and_test(&c->refcount)) {
+		WRITE_LOCK(&clusterip_lock);
+		list_del(&c->list);
+		WRITE_UNLOCK(&clusterip_lock);
+		dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
+		dev_put(c->dev);
+		kfree(c);
+	}
+}
+
+
+static struct clusterip_config *
+__clusterip_config_find(u_int32_t clusterip)
+{
+	struct list_head *pos;
+
+	MUST_BE_READ_LOCKED(&clusterip_lock);
+	list_for_each(pos, &clusterip_configs) {
+		struct clusterip_config *c = list_entry(pos, 
+					struct clusterip_config, list);
+		if (c->clusterip == clusterip) {
+			return c;
+		}
+	}
+
+	return NULL;
+}
+
+static inline struct clusterip_config *
+clusterip_config_find_get(u_int32_t clusterip)
+{
+	struct clusterip_config *c;
+
+	READ_LOCK(&clusterip_lock);
+	c = __clusterip_config_find(clusterip);
+	if (!c) {
+		READ_UNLOCK(&clusterip_lock);
+		return NULL;
+	}
+	atomic_inc(&c->refcount);
+	READ_UNLOCK(&clusterip_lock);
+
+	return c;
+}
+
+/* Allocate a config for cluster address `ip' on `dev', fill it from the rule's
+ * target info and link it into clusterip_configs.  Returns NULL on failure. */
+static struct clusterip_config *
+clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
+			struct net_device *dev)
+{
+	struct clusterip_config *c;
+	char buffer[16];
+
+	c = kmalloc(sizeof(*c), GFP_ATOMIC);
+	if (!c)
+		return NULL;
+	memset(c, 0, sizeof(*c));
+	c->dev = dev;
+	c->clusterip = ip;
+	memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
+	c->num_total_nodes = i->num_total_nodes;
+	c->num_local_nodes = i->num_local_nodes;
+	/* sizeof the array itself: sizeof(&array) is only the pointer size */
+	memcpy(&c->local_nodes, &i->local_nodes, sizeof(c->local_nodes));
+	c->hash_mode = i->hash_mode;
+	c->hash_initval = i->hash_initval;
+	atomic_set(&c->refcount, 1);
+#ifdef CONFIG_PROC_FS
+	/* create proc dir entry */
+	sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip));
+	c->pde = create_proc_entry(buffer, S_IWUSR|S_IRUSR, clusterip_procdir);
+	if (!c->pde) {
+		kfree(c);
+		return NULL;
+	}
+	c->pde->proc_fops = &clusterip_proc_fops;
+	c->pde->data = c;
+#endif
+
+	WRITE_LOCK(&clusterip_lock);
+	list_add(&c->list, &clusterip_configs);
+	WRITE_UNLOCK(&clusterip_lock);
+	return c;
+}
+
+static int
+clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+	int i;
+
+	WRITE_LOCK(&clusterip_lock);
+
+	if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
+	    || nodenum > CLUSTERIP_MAX_NODES) {
+		WRITE_UNLOCK(&clusterip_lock);
+		return 1;
+	}
+
+	/* check if we alrady have this number in our array */
+	for (i = 0; i < c->num_local_nodes; i++) {
+		if (c->local_nodes[i] == nodenum) {
+			WRITE_UNLOCK(&clusterip_lock);
+			return 1;
+		}
+	}
+
+	c->local_nodes[c->num_local_nodes++] = nodenum;
+
+	WRITE_UNLOCK(&clusterip_lock);
+	return 0;
+}
+
+static int
+clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+	int i;
+
+	WRITE_LOCK(&clusterip_lock);
+
+	if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
+		WRITE_UNLOCK(&clusterip_lock);
+		return 1;
+	}
+		
+	for (i = 0; i < c->num_local_nodes; i++) {
+		if (c->local_nodes[i] == nodenum) {
+			int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
+			memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
+			c->num_local_nodes--;
+			WRITE_UNLOCK(&clusterip_lock);
+			return 0;
+		}
+	}
+
+	WRITE_UNLOCK(&clusterip_lock);
+	return 1;
+}
+
+static inline u_int32_t
+clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
+{
+	struct iphdr *iph = skb->nh.iph;
+	unsigned long hashval;
+	u_int16_t sport, dport;
+	struct tcphdr *th;
+	struct udphdr *uh;
+	struct icmphdr *ih;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		th = (void *)iph+iph->ihl*4;
+		sport = ntohs(th->source);
+		dport = ntohs(th->dest);
+		break;
+	case IPPROTO_UDP:
+		uh = (void *)iph+iph->ihl*4;
+		sport = ntohs(uh->source);
+		dport = ntohs(uh->dest);
+		break;
+	case IPPROTO_ICMP:
+		ih = (void *)iph+iph->ihl*4;
+		sport = ntohs(ih->un.echo.id);
+		dport = (ih->type<<8)|ih->code;
+		break;
+	default:
+		if (net_ratelimit()) {
+			printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n",
+				iph->protocol);
+		}
+		sport = dport = 0;
+	}
+
+	switch (config->hash_mode) {
+	case CLUSTERIP_HASHMODE_SIP:
+		hashval = jhash_1word(ntohl(iph->saddr),
+				      config->hash_initval);
+		break;
+	case CLUSTERIP_HASHMODE_SIP_SPT:
+		hashval = jhash_2words(ntohl(iph->saddr), sport, 
+				       config->hash_initval);
+		break;
+	case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
+		hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
+				       config->hash_initval);
+		break;
+	default:
+		/* to make gcc happy */
+		hashval = 0;
+		/* This cannot happen, unless the check function wasn't called
+		 * at rule load time */
+		printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode);
+		BUG();
+		break;
+	}
+
+	/* node numbers are 1..n, not 0..n */
+	return ((hashval % config->num_total_nodes)+1);
+}
+
+static inline int
+clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
+{
+	int i;
+
+	READ_LOCK(&clusterip_lock);
+
+	if (config->num_local_nodes == 0) {
+		READ_UNLOCK(&clusterip_lock);
+		return 0;
+	}
+
+	for (i = 0; i < config->num_local_nodes; i++) {
+		if (config->local_nodes[i] == hash) {
+			READ_UNLOCK(&clusterip_lock);
+			return 1;
+		}
+	}
+
+	READ_UNLOCK(&clusterip_lock);
+
+	return 0;
+}
+
+/*********************************************************************** 
+ * IPTABLES TARGET 
+ ***********************************************************************/
+
+/* CLUSTERIP target: hash the packet to a node number and drop it unless this
+ * machine is currently responsible for that node */
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
+	u_int32_t hash;
+
+	/* don't need to clusterip_config_get() here, since refcount
+	 * is only decremented by destroy() - and ip_tables guarantees
+	 * that the ->target() function isn't called after ->destroy() */
+	if (!ct) {
+		printk(KERN_ERR "CLUSTERIP: no conntrack!\n");
+			/* FIXME: need to drop invalid ones, since replies
+			 * to outgoing connections of other nodes will be
+			 * marked as INVALID */
+		return NF_DROP;
+	}
+
+	/* special case: ICMP errors. conntrack marks them RELATED (in either
+	 * direction), as opposed to information requests (see below) */
+	if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
+	    && (ctinfo == IP_CT_RELATED
+		|| ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
+		return IPT_CONTINUE;
+
+	/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
+	 * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
+	 * on, which all have an ID field [relevant for hashing]. */
+	hash = clusterip_hashfn(*pskb, cipinfo->config);
+
+	switch (ctinfo) {
+		case IP_CT_NEW:
+			ct->mark = hash;
+			break;
+		case IP_CT_RELATED:
+		case IP_CT_RELATED+IP_CT_IS_REPLY:
+			/* FIXME: we don't handle expectations at the
+			 * moment.  they can arrive on a different node than
+			 * the master connection (e.g. FTP passive mode) */
+		case IP_CT_ESTABLISHED:
+		case IP_CT_ESTABLISHED+IP_CT_IS_REPLY:
+			break;
+		default:
+			break;
+	}
+
+#ifdef DEBUG_CLUSTERP
+	DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+#endif
+	DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark);
+	if (!clusterip_responsible(cipinfo->config, hash)) {
+		DEBUGP("not responsible\n");
+		return NF_DROP;
+	}
+	DEBUGP("responsible\n");
+
+	/* despite being received via linklayer multicast, this is
+	 * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
+	(*pskb)->pkt_type = PACKET_HOST;
+
+	return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	struct ipt_clusterip_tgt_info *cipinfo = targinfo;
+
+	struct clusterip_config *config;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info))) {
+		printk(KERN_WARNING "CLUSTERIP: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info)));
+		return 0;
+	}
+
+	if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
+	    cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
+	    cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
+		printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n",
+			cipinfo->hash_mode);
+		return 0;
+
+	}
+	if (e->ip.dmsk.s_addr != 0xffffffff
+	    || e->ip.dst.s_addr == 0) {
+		printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n");
+		return 0;
+	}
+
+	/* FIXME: further sanity checks */
+
+	config = clusterip_config_find_get(e->ip.dst.s_addr);
+	if (!config) {
+		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
+			printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr));
+			return 0;
+		} else {
+			struct net_device *dev;
+
+			if (e->ip.iniface[0] == '\0') {
+				printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n");
+				return 0;
+			}
+
+			dev = dev_get_by_name(e->ip.iniface);
+			if (!dev) {
+				printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
+				return 0;
+			}
+
+			config = clusterip_config_init(cipinfo, 
+							e->ip.dst.s_addr, dev);
+			if (!config) {
+				printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n");
+				dev_put(dev);
+				return 0;
+			}
+			dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0);
+		}
+	}
+
+	cipinfo->config = config;
+
+	return 1;
+}
+
+/* drop reference count of cluster config when rule is deleted */
+static void destroy(void *matchinfo, unsigned int matchinfosize)
+{
+	struct ipt_clusterip_tgt_info *cipinfo = matchinfo;
+
+	/* we first remove the proc entry and then drop the reference
+	 * count.  In case anyone still accesses the file, the open/close
+	 * functions are also incrementing the refcount on their own */
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry(cipinfo->config->pde->name,
+			  cipinfo->config->pde->parent);
+#endif
+	clusterip_config_put(cipinfo->config);
+}
+
+static struct ipt_target clusterip_tgt = { 
+	.name = "CLUSTERIP",
+	.target = &target, 
+	.checkentry = &checkentry, 
+	.destroy = &destroy,
+	.me = THIS_MODULE
+};
+
+
+/*********************************************************************** 
+ * ARP MANGLING CODE 
+ ***********************************************************************/
+
+/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
+struct arp_payload {
+	u_int8_t src_hw[ETH_ALEN];
+	u_int32_t src_ip;
+	u_int8_t dst_hw[ETH_ALEN];
+	u_int32_t dst_ip;
+} __attribute__ ((packed));
+
+#ifdef CLUSTERIP_DEBUG
+static void arp_print(struct arp_payload *payload) 
+{
+#define HBUFFERLEN 30
+	char hbuffer[HBUFFERLEN];
+	int j,k;
+	const char hexbuf[]= "0123456789abcdef";
+
+	for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
+		hbuffer[k++]=hexbuf[(payload->src_hw[j]>>4)&15];
+		hbuffer[k++]=hexbuf[payload->src_hw[j]&15];
+		hbuffer[k++]=':';
+	}
+	hbuffer[--k]='\0';
+
+	printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n", 
+		NIPQUAD(payload->src_ip), hbuffer,
+		NIPQUAD(payload->dst_ip));
+}
+#endif
+
+static unsigned int
+arp_mangle(unsigned int hook,
+	   struct sk_buff **pskb,
+	   const struct net_device *in,
+	   const struct net_device *out,
+	   int (*okfn)(struct sk_buff *))
+{
+	struct arphdr *arp = (*pskb)->nh.arph;
+	struct arp_payload *payload;
+	struct clusterip_config *c;
+
+	/* we don't care about non-ethernet and non-ipv4 ARP */
+	if (arp->ar_hrd != htons(ARPHRD_ETHER)
+	    || arp->ar_pro != htons(ETH_P_IP)
+	    || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
+		return NF_ACCEPT;
+
+	/* we only want to mangle arp replies */
+	if (arp->ar_op != htons(ARPOP_REPLY))
+		return NF_ACCEPT;
+
+	payload = (void *)(arp+1);
+
+	/* if there is no clusterip configuration for the arp reply's 
+	 * source ip, we don't want to mangle it */
+	c = clusterip_config_find_get(payload->src_ip);
+	if (!c)
+		return NF_ACCEPT;
+
+	/* normally the linux kernel always replies to arp queries of 
+	 * addresses on different interfacs.  However, in the CLUSTERIP case
+	 * this wouldn't work, since we didn't subscribe the mcast group on
+	 * other interfaces */
+	if (c->dev != out) {
+		DEBUGP("CLUSTERIP: not mangling arp reply on different "
+		       "interface: cip'%s'-skb'%s'\n", c->dev->name, out->name);
+		clusterip_config_put(c);
+		return NF_ACCEPT;
+	}
+
+	/* mangle reply hardware address */
+	memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
+
+#ifdef CLUSTERIP_DEBUG
+	DEBUGP(KERN_DEBUG "CLUSTERIP mangled arp reply: ");
+	arp_print(payload);
+#endif
+
+	clusterip_config_put(c);
+
+	return NF_ACCEPT;
+}
+
+static struct nf_hook_ops cip_arp_ops = {
+	.hook = arp_mangle,
+	.pf = NF_ARP,
+	.hooknum = NF_ARP_OUT,
+	.priority = -1
+};
+
+/*********************************************************************** 
+ * PROC DIR HANDLING 
+ ***********************************************************************/
+
+#ifdef CONFIG_PROC_FS
+
+/* seq_file ->start: take clusterip_lock and set up the iterator index */
+static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct proc_dir_entry *pde = s->private;
+	struct clusterip_config *c = pde->data;
+	unsigned int *nodeidx;
+
+	READ_LOCK(&clusterip_lock);
+	if (*pos >= c->num_local_nodes)
+		return NULL;
+	/* clusterip_lock is held until ->stop, so we must not sleep here */
+	nodeidx = kmalloc(sizeof(unsigned int), GFP_ATOMIC);
+	if (!nodeidx)
+		return ERR_PTR(-ENOMEM);
+	*nodeidx = *pos;
+	return nodeidx;
+}
+
+static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct proc_dir_entry *pde = s->private;
+	struct clusterip_config *c = pde->data;
+	unsigned int *nodeidx = (unsigned int *)v;
+
+	*pos = ++(*nodeidx);
+	if (*pos >= c->num_local_nodes) {
+		kfree(v);
+		return NULL;
+	}
+	return nodeidx;
+}
+
+static void clusterip_seq_stop(struct seq_file *s, void *v)
+{
+	/* ->start may have returned ERR_PTR(-ENOMEM); never kfree() that */
+	kfree(IS_ERR(v) ? NULL : v);
+	READ_UNLOCK(&clusterip_lock);
+}
+
+static int clusterip_seq_show(struct seq_file *s, void *v)
+{
+	struct proc_dir_entry *pde = s->private;
+	struct clusterip_config *c = pde->data;
+	unsigned int *nodeidx = (unsigned int *)v;
+
+	if (*nodeidx != 0) 
+		seq_putc(s, ',');
+	seq_printf(s, "%u", c->local_nodes[*nodeidx]);
+
+	if (*nodeidx == c->num_local_nodes-1)
+		seq_putc(s, '\n');
+
+	return 0;
+}
+
+static struct seq_operations clusterip_seq_ops = {
+	.start	= clusterip_seq_start,
+	.next	= clusterip_seq_next,
+	.stop	= clusterip_seq_stop,
+	.show	= clusterip_seq_show,
+};
+
+static int clusterip_proc_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &clusterip_seq_ops);
+
+	if (!ret) {
+		struct seq_file *sf = file->private_data;
+		struct proc_dir_entry *pde = PDE(inode);
+		struct clusterip_config *c = pde->data;
+
+		sf->private = pde;
+
+		clusterip_config_get(c);
+	}
+
+	return ret;
+}
+
+static int clusterip_proc_release(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *pde = PDE(inode);
+	struct clusterip_config *c = pde->data;
+	int ret;
+
+	ret = seq_release(inode, file);
+
+	if (!ret)
+		clusterip_config_put(c);
+
+	return ret;
+}
+
+/* proc write handler: "+N" adds node N to the local set, "-N" removes it */
+static ssize_t clusterip_proc_write(struct file *file, const char *input,
+				size_t size, loff_t *ofs)
+{
+#define PROC_WRITELEN	10
+	char buffer[PROC_WRITELEN+1] = "";	/* zeroed: stays NUL-terminated */
+	struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode);
+	struct clusterip_config *c = pde->data;
+	unsigned long nodenum;
+
+	if (copy_from_user(buffer, input, min_t(size_t, size, PROC_WRITELEN)))
+		return -EFAULT;
+	if (*buffer == '+') {
+		nodenum = simple_strtoul(buffer+1, NULL, 10);
+		if (clusterip_add_node(c, nodenum))
+			return -ENOMEM;
+	} else if (*buffer == '-') {
+		nodenum = simple_strtoul(buffer+1, NULL,10);
+		if (clusterip_del_node(c, nodenum))
+			return -ENOENT;
+	} else
+		return -EIO;
+
+	return size;
+}
+
+static struct file_operations clusterip_proc_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = clusterip_proc_open,
+	.read	 = seq_read,
+	.write	 = clusterip_proc_write,
+	.llseek	 = seq_lseek,
+	.release = clusterip_proc_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+/* Register (fini == 0) or tear down (fini != 0) the target, the ARP hook and
+ * the proc directory.  A failed registration unwinds and returns its error. */
+static int init_or_cleanup(int fini)
+{
+	int ret = -EINVAL;	/* what the fini path has always returned */
+
+	if (fini)
+		goto cleanup;
+
+	if (ipt_register_target(&clusterip_tgt)) {
+		ret = -EINVAL;
+		goto cleanup_none;
+	}
+	if (nf_register_hook(&cip_arp_ops) < 0) {
+		ret = -EINVAL;
+		goto cleanup_target;
+	}
+
+#ifdef CONFIG_PROC_FS
+	clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net);
+	if (!clusterip_procdir) {
+		printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n");
+		ret = -ENOMEM;
+		goto cleanup_hook;
+	}
+#endif /* CONFIG_PROC_FS */
+
+	printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n",
+		CLUSTERIP_VERSION);
+	return 0;
+
+cleanup:
+	printk(KERN_NOTICE "ClusterIP Version %s unloading\n",
+		CLUSTERIP_VERSION);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
+#endif
+cleanup_hook:
+	nf_unregister_hook(&cip_arp_ops);
+cleanup_target:
+	ipt_unregister_target(&clusterip_tgt);
+cleanup_none:
+	return ret;
+}
+
+static int __init init(void)
+{
+	return init_or_cleanup(0);
+}
+
+static void __exit fini(void)
+{
+	init_or_cleanup(1);
+}
+
+module_init(init);
+module_exit(fini);

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target, seq_file version
  2004-10-21 16:36 ` [PATCH 2.6] iptables CLUSTERIP target, seq_file version Harald Welte
@ 2004-10-21 17:44   ` jamal
  2004-10-21 18:03     ` Harald Welte
  2004-10-22  5:52   ` David S. Miller
  1 sibling, 1 reply; 29+ messages in thread
From: jamal @ 2004-10-21 17:44 UTC (permalink / raw)
  To: Harald Welte; +Cc: Linux Netdev List, Netfilter Development Mailinglist, lmb


Sorry, couldnt resist - so out of hiding for just a few seconds; should
be able to achieve this much simpler with gact.

Example:
Consider two machines, both allowed to receive only packets for 10.0.0.3.
A simple balance scheme is to have even src IPs being processed by one
and odd by another.

#add ingress qdisc to eth2
# Accept arps destined for 10.0.0.3 but sourced from even numbered
# src addresses
tc filter add dev eth2 parent ffff: protocol arp prio 6 u32 match u32
0xa000003 0xffffffff at 24 match u8 0x0 0x1 at 17 flowid 1:2 action ok
# drop any other arps for 10.0.0.3
tc filter add dev eth2 parent ffff: protocol arp prio 7 u32 match u32
0xa000003 0xffffffff at 24 flowid 1:2 action drop

On a second machine which is doing odd just change the u8 to 0x1 0x1.
No need to send fake ARPs using multicast with this. But if you wanted
to be funky you could use pedit to create a virtual MAC address.

We use this well with a lot more complex static rules and failover.

cheers,
jamal (Back to work)

On Thu, 2004-10-21 at 12:36, Harald Welte wrote:
> On Thu, Oct 21, 2004 at 12:38:28AM +0200, Harald Welte wrote:
> Hi Dave!
> 
> This is the 'CLUSTERIP' target for iptables, this time no core kernel
> changes required, please apply.
> 
> This enables you to build a static load sharing cluster between multiple
> nodes - without the requirement to have a load balancer.  It uses a
> series of [evil] tricks like replying with linklayer multicast addresses
> to ARP requests, and using CONNMARK for stateful blocking all traffic
> not intended for the local node.
> 
> Apart from the usual netfilter-specific file additions and
> Kconfig/Makefile patches, this needs to export proc_file_operations in
> order to get the reference counting of certain data objects right.  I
> hope this change is acceptable.
> 
> Signed-off-by: Harald Welte <laforge@netfilter.org>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target, seq_file version
  2004-10-21 17:44   ` jamal
@ 2004-10-21 18:03     ` Harald Welte
  2004-10-21 18:41       ` Henrik Nordstrom
  0 siblings, 1 reply; 29+ messages in thread
From: Harald Welte @ 2004-10-21 18:03 UTC (permalink / raw)
  To: jamal; +Cc: Linux Netdev List, Netfilter Development Mailinglist, lmb

[-- Attachment #1: Type: text/plain, Size: 1046 bytes --]

On Thu, Oct 21, 2004 at 01:44:11PM -0400, jamal wrote:
> 
> Sorry, couldnt resist - so out of hiding for just a few seconds; should
> be able to achieve this much simpler with gact.

One of the issues that CLUSTERIP needed to do is to work with
locally-originated connections, i.e. every node within the cluster still
has to be able to open tcp connections to anywhere.

We currently catch this with connection tracking, which will assign all
reply packets to such outbound connections INVALID on all but the
originating node in the cluster.

Yes, I know, this sounds like a very strange setup.  Still, it was one of
the requirements for its implementation.

-- 
- Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target, seq_file version
  2004-10-21 18:03     ` Harald Welte
@ 2004-10-21 18:41       ` Henrik Nordstrom
  0 siblings, 0 replies; 29+ messages in thread
From: Henrik Nordstrom @ 2004-10-21 18:41 UTC (permalink / raw)
  To: Harald Welte
  Cc: Linux Netdev List, Netfilter Development Mailinglist, jamal, lmb

On Thu, 21 Oct 2004, Harald Welte wrote:

> On Thu, Oct 21, 2004 at 01:44:11PM -0400, jamal wrote:
>>
>> Sorry, couldnt resist - so out of hiding for just a few seconds; should
>> be able to achieve this much simpler with gact.
>
> One of the issues that CLUSTERIP needed to do is to work with
> locally-originated connections, i.e. every node within the cluster still
> has to be able to open tcp connections to anywhere.

Another criterion fulfilled by CLUSTERIP is the ability to do a soft
failover where the new node accepts all new connections but the old node
continues processing the connections it already has.

Regards
Henrik

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target)
  2004-10-21 14:25           ` Harald Welte
  2004-10-21 15:08             ` bert hubert
@ 2004-10-21 21:31             ` Herbert Xu
  2004-10-21 22:53               ` Thomas Graf
                                 ` (2 more replies)
  1 sibling, 3 replies; 29+ messages in thread
From: Herbert Xu @ 2004-10-21 21:31 UTC (permalink / raw)
  To: Harald Welte; +Cc: lmb, ahu, hch, netdev, davem

Harald Welte <laforge@netfilter.org> wrote:
>
> For supporting two primitive operation, adding a new netlink address
> family also isn't worthwhile, especially since we're short of netlink
> families.

That's something I'm looking into as well.  The current strategy of
either creating a new family or tacking random things into RTNETLINK
is simply not going to scale.

Initially I considered an interface where kernel users can register
themselves using a string as the key.  But I soon realised that we
could simply allow the netlink_family field to be an arbitrary integer
that is used as a key to a hash table.

The CPU cost of the hash table isn't too bad since you'll only be
looking it up when the socket is created.

Comments anyone?
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target)
  2004-10-21 21:31             ` Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target) Herbert Xu
@ 2004-10-21 22:53               ` Thomas Graf
  2004-10-21 23:02                 ` Allowing netlink_family to be any integer Ben Greear
  2004-10-22 12:25                 ` Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target) Herbert Xu
  2004-10-22 11:29               ` jamal
  2004-10-22 23:05               ` David S. Miller
  2 siblings, 2 replies; 29+ messages in thread
From: Thomas Graf @ 2004-10-21 22:53 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Harald Welte, lmb, ahu, hch, netdev, davem

* Herbert Xu <E1CKkWZ-0005x5-00@gondolin.me.apana.org.au> 2004-10-22 07:31
> Initially I considered an interface where kernel users can register
> themselves using a string as the key.  But I soon realised that we
> could simply allow the netlink_family field to be an arbitrary integer
> that is used as a key to a hash table.

Sounds like a good idea, converting nl_table and nl_nonroot into
a hash table won't be much of a problem. The netlink device driver
must probably be changed to create devices on the fly if we want
modules to be able to register netlink families. The same goes
for some selinux stuff but this is minor.

The only real problem I see is sk_protocol being only 8bit as
limiting factor.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Allowing netlink_family to be any integer
  2004-10-21 22:53               ` Thomas Graf
@ 2004-10-21 23:02                 ` Ben Greear
  2004-10-22 12:25                 ` Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target) Herbert Xu
  1 sibling, 0 replies; 29+ messages in thread
From: Ben Greear @ 2004-10-21 23:02 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Herbert Xu, Harald Welte, lmb, ahu, hch, netdev, davem

Thomas Graf wrote:

> The only real problem I see is sk_protocol being only 8bit as
> limiting factor.

When you fix this, please make the route-table ID at least 16 bits as well :)

Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2.6] iptables CLUSTERIP target, seq_file version
  2004-10-21 16:36 ` [PATCH 2.6] iptables CLUSTERIP target, seq_file version Harald Welte
  2004-10-21 17:44   ` jamal
@ 2004-10-22  5:52   ` David S. Miller
  1 sibling, 0 replies; 29+ messages in thread
From: David S. Miller @ 2004-10-22  5:52 UTC (permalink / raw)
  To: Harald Welte; +Cc: netdev, netfilter-devel, lmb

On Thu, 21 Oct 2004 18:36:55 +0200
Harald Welte <laforge@netfilter.org> wrote:

> This is the 'CLUSTERIP' target for iptables, this time no core kernel
> changes required, please apply.
> 
> This enables you to build a static load sharing cluster between multiple
> nodes - without the requirement to have a load balancer.  It uses a
> series of [evil] tricks like replying with linklayer multicast addresses
> to ARP requests, and using CONNMARK for stateful blocking all traffic
> not intended for the local node.

It's clever and nasty, I like it :-)

Thanks for cleaning it up to use seq_file.  Patch applied.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target)
  2004-10-21 21:31             ` Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target) Herbert Xu
  2004-10-21 22:53               ` Thomas Graf
@ 2004-10-22 11:29               ` jamal
  2004-10-22 11:39                 ` Herbert Xu
  2004-10-22 23:05               ` David S. Miller
  2 siblings, 1 reply; 29+ messages in thread
From: jamal @ 2004-10-22 11:29 UTC (permalink / raw)
  To: Herbert Xu, Evgeniy Polyakov
  Cc: Harald Welte, lmb, ahu, hch, netdev, David S. Miller


Evgeniy Polyakov(aka mr. Sean Paul) posted code a while back for
something that uses netlink that he calls  "kernel Konnector". I think
thats a good idea which will ease the use of those limited numbers. It
also has potential for generic kernel-kernel as well as kernel-userspace
messaging subsystem.
I printed the code, got a large cup of Brazilian-derived cappuccino but
alas got preempted before finishing the review. Maybe you could work
with him Herbert?
I still plan to continue looking at it.

This does not exclude the use of the netlink numbers, but should ease
them.

cheers,
jamal

On Thu, 2004-10-21 at 17:31, Herbert Xu wrote:
> Harald Welte <laforge@netfilter.org> wrote:
> >
> > For supporting two primitive operation, adding a new netlink address
> > family also isn't worthwhile, especially since we're short of netlink
> > families.
> 
> That's something I'm looking into as well.  The current strategy of
> either creating a new family or tacking random things into RTNETLINK
> is simply not going to scale.
> 
> Initially I considered an interface where kernel users can register
> themselves using a string as the key.  But I soon realised that we
> could simply allow the netlink_family field to be an arbitrary integer
> that is used as a key to a hash table.
> 
> The CPU cost of the hash table isn't too bad since you'll only be
> looking it up when the socket is created.
> 
> Comments anyone?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target)
  2004-10-22 11:29               ` jamal
@ 2004-10-22 11:39                 ` Herbert Xu
  2004-10-22 12:19                   ` jamal
  0 siblings, 1 reply; 29+ messages in thread
From: Herbert Xu @ 2004-10-22 11:39 UTC (permalink / raw)
  To: jamal
  Cc: Evgeniy Polyakov, Harald Welte, lmb, ahu, hch, netdev, David S. Miller

On Fri, Oct 22, 2004 at 07:29:39AM -0400, jamal wrote:
> 
> Evgeniy Polyakov(aka mr. Sean Paul) posted code a while back for
> something that uses netlink that he calls  "kernel Konnector". I think

That patch puts the ID in each message, right?

That would mean paying the lookup cost for each message rather than
once when you create the socket.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target)
  2004-10-22 11:39                 ` Herbert Xu
@ 2004-10-22 12:19                   ` jamal
  2004-10-22 12:32                     ` Evgeniy Polyakov
  0 siblings, 1 reply; 29+ messages in thread
From: jamal @ 2004-10-22 12:19 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Evgeniy Polyakov, Harald Welte, lmb, ahu, hch, netdev, David S. Miller

On Fri, 2004-10-22 at 07:39, Herbert Xu wrote:
> On Fri, Oct 22, 2004 at 07:29:39AM -0400, jamal wrote:
> > 
> > Evgeniy Polyakov(aka mr. Sean Paul) posted code a while back for
> > something that uses netlink that he calls  "kernel Konnector". I think
> 
> That patch puts the ID in each message, right?

Yes, the ID is necessary for "routing" the message. Remember this is for
a messaging subsystem so you cant avoid having something that is in the
packet that is used to find where to go next.
I was suggesting also messaging by name.

> That would mean paying the lookup cost for each message rather than
> once when you create the socket.
> 

Take a look at the patch. See how it can be made better.
Evgeniy, Do you have something new to post?

cheers,
jamal

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target)
  2004-10-21 22:53               ` Thomas Graf
  2004-10-21 23:02                 ` Allowing netlink_family to be any integer Ben Greear
@ 2004-10-22 12:25                 ` Herbert Xu
  2004-10-22 12:53                   ` jamal
  1 sibling, 1 reply; 29+ messages in thread
From: Herbert Xu @ 2004-10-22 12:25 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Harald Welte, lmb, ahu, hch, netdev, davem

On Fri, Oct 22, 2004 at 12:53:15AM +0200, Thomas Graf wrote:
> 
> The only real problem I see is sk_protocol being only 8bit as
> limiting factor.

Well we wouldn't be using sk_protocol to look things up anymore.
We'd be holding a reference on the family directly once the socket
is created.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target)
  2004-10-22 12:19                   ` jamal
@ 2004-10-22 12:32                     ` Evgeniy Polyakov
  0 siblings, 0 replies; 29+ messages in thread
From: Evgeniy Polyakov @ 2004-10-22 12:32 UTC (permalink / raw)
  To: hadi; +Cc: Herbert Xu, Harald Welte, lmb, ahu, hch, netdev, David S. Miller

[-- Attachment #1: Type: text/plain, Size: 1102 bytes --]

On Fri, 2004-10-22 at 16:19, jamal wrote:
> On Fri, 2004-10-22 at 07:39, Herbert Xu wrote:
> > On Fri, Oct 22, 2004 at 07:29:39AM -0400, jamal wrote:
> > > 
> > > Evgeniy Polyakov(aka mr. Sean Paul) posted code a while back for
> > > something that uses netlink that he calls  "kernel Konnector". I think
> > 
> > That patch puts the ID in each message, right?
> 
> Yes, the ID is necessary for "routing" the message. Remember this is for
> a messaging subsystem so you cant avoid having something that is in the
> packet that is used to find where to go next.
> I was suggesting also messaging by name.
> 
> > That would mean paying the lookup cost for each message rather than
> > once when you create the socket.
> > 
> 
> Take a look at the patch. See how it can be made better.
> Evgeniy, Do you have something new to post?

Nothing major, only multicast group selection by id.
Although I think message sending mechanism can use unicast.
Both have their own advantages.

> cheers,
> jamal
-- 
	Evgeniy Polyakov

Crash is better than data corruption. -- Art Grabowski

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target)
  2004-10-22 12:25                 ` Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target) Herbert Xu
@ 2004-10-22 12:53                   ` jamal
  0 siblings, 0 replies; 29+ messages in thread
From: jamal @ 2004-10-22 12:53 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Thomas Graf, Harald Welte, lmb, ahu, hch, netdev, David S. Miller

On Fri, 2004-10-22 at 08:25, Herbert Xu wrote:
> On Fri, Oct 22, 2004 at 12:53:15AM +0200, Thomas Graf wrote:
> > 
> > The only real problem I see is sk_protocol being only 8bit as
> > limiting factor.
> 
> Well we wouldn't be using sk_protocol to look things up anymore.
> We'd be holding a reference on the family directly once the socket
> is created.

Thats why i said:
This does not exclude the use of the netlink numbers, but should ease
them. I think that rtnetlink is already overloaded and would benefit
from having a clean split. As an example things like links and IPaddress
should probably reside elsewhere since they are generic enough a
service.

Orthogonal to this:
If we could get more people to use the work from Evgeniy as opposed to
creating a new netlink protocols, then it would benefit people - Harald
could have used that scheme in what he was trying to do for example.

cheers,
jamal

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target)
  2004-10-21 21:31             ` Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target) Herbert Xu
  2004-10-21 22:53               ` Thomas Graf
  2004-10-22 11:29               ` jamal
@ 2004-10-22 23:05               ` David S. Miller
  2004-10-22 23:16                 ` Herbert Xu
  2 siblings, 1 reply; 29+ messages in thread
From: David S. Miller @ 2004-10-22 23:05 UTC (permalink / raw)
  To: Herbert Xu; +Cc: laforge, lmb, ahu, hch, netdev

On Fri, 22 Oct 2004 07:31:07 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Initially I considered an interface where kernel users can register
> themselves using a string as the key.  But I soon realised that we
> could simply allow the netlink_family field to be an arbitrary integer
> that is used as a key to a hash table.

(I assume you mean "nl_family" not "netlink_family" :-)

> The CPU cost of the hash table isn't too bad since you'll only be
> looking it up when the socket is created.

I'm fine with this idea, however please tell me how you intend
to make things like ->getname() behave?

I would also suggest to start with values > AF_MAX, so that people
sticking other AF_* values in there by accident are caught.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target)
  2004-10-22 23:05               ` David S. Miller
@ 2004-10-22 23:16                 ` Herbert Xu
  2004-10-26  3:27                   ` David S. Miller
  0 siblings, 1 reply; 29+ messages in thread
From: Herbert Xu @ 2004-10-22 23:16 UTC (permalink / raw)
  To: David S. Miller; +Cc: laforge, lmb, ahu, hch, netdev

On Fri, Oct 22, 2004 at 04:05:59PM -0700, David S. Miller wrote:
> On Fri, 22 Oct 2004 07:31:07 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > Initially I considered an interface where kernel users can register
> > themselves using a string as the key.  But I soon realised that we
> > could simply allow the netlink_family field to be an arbitrary integer
> > that is used as a key to a hash table.
> 
> (I assume you mean "nl_family" not "netlink_family" :-)

I'm no plans yet in taking over all address families :)
I'm only referring to the third argument in socket(2).

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target)
  2004-10-22 23:16                 ` Herbert Xu
@ 2004-10-26  3:27                   ` David S. Miller
  0 siblings, 0 replies; 29+ messages in thread
From: David S. Miller @ 2004-10-26  3:27 UTC (permalink / raw)
  To: Herbert Xu; +Cc: laforge, lmb, ahu, hch, netdev

On Sat, 23 Oct 2004 09:16:07 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Fri, Oct 22, 2004 at 04:05:59PM -0700, David S. Miller wrote:
> > On Fri, 22 Oct 2004 07:31:07 +1000
> > Herbert Xu <herbert@gondor.apana.org.au> wrote:
> > 
> > > Initially I considered an interface where kernel users can register
> > > themselves using a string as the key.  But I soon realised that we
> > > could simply allow the netlink_family field to be an arbitrary integer
> > > that is used as a key to a hash table.
> > 
> > (I assume you mean "nl_family" not "netlink_family" :-)
> 
> I'm no plans yet in taking over all address families :)
> I'm only referring to the third argument in socket(2).

Color me confused about how your scheme might work.  If
you cook up an example patch, I guarantee it will be worth
your while. 8)

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2004-10-26  3:27 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-10-20 22:38 [PATCH 2.6] iptables CLUSTERIP target Harald Welte
2004-10-20 23:18 ` YOSHIFUJI Hideaki / 吉藤英明
2004-10-21  4:40   ` David S. Miller
2004-10-21  7:44 ` Christoph Hellwig
2004-10-21  7:55 ` Christoph Hellwig
2004-10-21  9:12   ` Harald Welte
2004-10-21  9:40     ` Herbert Xu
2004-10-21 11:05     ` bert hubert
2004-10-21 13:03       ` Harald Welte
2004-10-21 13:33         ` Lars Marowsky-Bree
2004-10-21 14:25           ` Harald Welte
2004-10-21 15:08             ` bert hubert
2004-10-21 21:31             ` Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target) Herbert Xu
2004-10-21 22:53               ` Thomas Graf
2004-10-21 23:02                 ` Allowing netlink_family to be any integer Ben Greear
2004-10-22 12:25                 ` Allowing netlink_family to be any integer (was: [PATCH 2.6] iptables CLUSTERIP target) Herbert Xu
2004-10-22 12:53                   ` jamal
2004-10-22 11:29               ` jamal
2004-10-22 11:39                 ` Herbert Xu
2004-10-22 12:19                   ` jamal
2004-10-22 12:32                     ` Evgeniy Polyakov
2004-10-22 23:05               ` David S. Miller
2004-10-22 23:16                 ` Herbert Xu
2004-10-26  3:27                   ` David S. Miller
2004-10-21 16:36 ` [PATCH 2.6] iptables CLUSTERIP target, seq_file version Harald Welte
2004-10-21 17:44   ` jamal
2004-10-21 18:03     ` Harald Welte
2004-10-21 18:41       ` Henrik Nordstrom
2004-10-22  5:52   ` David S. Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.