All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ipvs: changes related to service usecnt
@ 2010-09-18 14:18 Julian Anastasov
  2010-09-19 11:48 ` Simon Horman
  2010-09-20 10:13 ` [RFC][PATCH] ipvs: IPv6 tunnel mode Hans Schillstrom
  0 siblings, 2 replies; 8+ messages in thread
From: Julian Anastasov @ 2010-09-18 14:18 UTC (permalink / raw)
  To: Simon Horman; +Cc: Patrick McHardy, lvs-devel, netfilter-devel


	Change the usage of svc usecnt during command execution:

- we check if svc is registered but we do not need to hold usecnt
reference while under __ip_vs_mutex, only the packet handling needs
it during scheduling

- change __ip_vs_service_get to __ip_vs_service_find and
__ip_vs_svc_fwm_get to __ip_vs_svc_fwm_find because now caller
will increase svc->usecnt

- put common code that calls update_service in __ip_vs_update_dest

- put common code in ip_vs_unlink_service() and use it to unregister
the service

- add comment that svc should not be accessed after ip_vs_del_service 
anymore

- all IP_VS_WAIT_WHILE calls are now unified: usecnt > 0

- Properly log the app ports

	As result, some problems are fixed:

- possible use-after-free of svc in ip_vs_genl_set_cmd after
ip_vs_del_service because our usecnt reference does not guarantee that
svc is not freed on refcnt==0, eg. when no dests are moved to trash

- possible usecnt leak in do_ip_vs_set_ctl after ip_vs_del_service
when the service is not freed now, for example, when some
destionations are moved into trash and svc->refcnt remains above 0.
It is harmless because svc is not in hash anymore.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---

	This patch applies on net-next-2.6-e548833 (2010-SEP-10)
and also after the last 3 patches for nfct.

diff -urp net-next-2.6-e548833/linux/net/netfilter/ipvs/ip_vs_app.c linux/net/netfilter/ipvs/ip_vs_app.c
--- net-next-2.6-e548833/linux/net/netfilter/ipvs/ip_vs_app.c	2010-09-10 08:27:33.000000000 +0300
+++ linux/net/netfilter/ipvs/ip_vs_app.c	2010-09-18 16:32:51.000000000 +0300
@@ -103,8 +103,8 @@ ip_vs_app_inc_new(struct ip_vs_app *app,
 		goto out;
 
 	list_add(&inc->a_list, &app->incs_list);
-	IP_VS_DBG(9, "%s application %s:%u registered\n",
-		  pp->name, inc->name, inc->port);
+	IP_VS_DBG(9, "%s App %s:%u registered\n",
+		  pp->name, inc->name, ntohs(inc->port));
 
 	return 0;
 
@@ -130,7 +130,7 @@ ip_vs_app_inc_release(struct ip_vs_app *
 		pp->unregister_app(inc);
 
 	IP_VS_DBG(9, "%s App %s:%u unregistered\n",
-		  pp->name, inc->name, inc->port);
+		  pp->name, inc->name, ntohs(inc->port));
 
 	list_del(&inc->a_list);
 
diff -urp net-next-2.6-e548833/linux/net/netfilter/ipvs/ip_vs_ctl.c linux/net/netfilter/ipvs/ip_vs_ctl.c
--- net-next-2.6-e548833/linux/net/netfilter/ipvs/ip_vs_ctl.c	2010-09-10 08:27:33.000000000 +0300
+++ linux/net/netfilter/ipvs/ip_vs_ctl.c	2010-09-18 16:08:01.000000000 +0300
@@ -401,7 +401,7 @@ static int ip_vs_svc_unhash(struct ip_vs
  *	Get service by {proto,addr,port} in the service table.
  */
 static inline struct ip_vs_service *
-__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
+__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
 		    __be16 vport)
 {
 	unsigned hash;
@@ -416,7 +416,6 @@ __ip_vs_service_get(int af, __u16 protoc
 		    && (svc->port == vport)
 		    && (svc->protocol == protocol)) {
 			/* HIT */
-			atomic_inc(&svc->usecnt);
 			return svc;
 		}
 	}
@@ -429,7 +428,7 @@ __ip_vs_service_get(int af, __u16 protoc
  *	Get service by {fwmark} in the service table.
  */
 static inline struct ip_vs_service *
-__ip_vs_svc_fwm_get(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(int af, __u32 fwmark)
 {
 	unsigned hash;
 	struct ip_vs_service *svc;
@@ -440,7 +439,6 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark
 	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
 		if (svc->fwmark == fwmark && svc->af == af) {
 			/* HIT */
-			atomic_inc(&svc->usecnt);
 			return svc;
 		}
 	}
@@ -459,14 +457,14 @@ ip_vs_service_get(int af, __u32 fwmark, 
 	/*
 	 *	Check the table hashed by fwmark first
 	 */
-	if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
+	if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
 		goto out;
 
 	/*
 	 *	Check the table hashed by <protocol,addr,port>
 	 *	for "full" addressed entries
 	 */
-	svc = __ip_vs_service_get(af, protocol, vaddr, vport);
+	svc = __ip_vs_service_find(af, protocol, vaddr, vport);
 
 	if (svc == NULL
 	    && protocol == IPPROTO_TCP
@@ -476,7 +474,7 @@ ip_vs_service_get(int af, __u32 fwmark, 
 		 * Check if ftp service entry exists, the packet
 		 * might belong to FTP data connections.
 		 */
-		svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
+		svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
 	}
 
 	if (svc == NULL
@@ -484,10 +482,12 @@ ip_vs_service_get(int af, __u32 fwmark, 
 		/*
 		 * Check if the catch-all port (port zero) exists
 		 */
-		svc = __ip_vs_service_get(af, protocol, vaddr, 0);
+		svc = __ip_vs_service_find(af, protocol, vaddr, 0);
 	}
 
   out:
+	if (svc)
+		atomic_inc(&svc->usecnt);
 	read_unlock(&__ip_vs_svc_lock);
 
 	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
@@ -506,14 +506,19 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest
 	dest->svc = svc;
 }
 
-static inline void
+static void
 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
 {
 	struct ip_vs_service *svc = dest->svc;
 
 	dest->svc = NULL;
-	if (atomic_dec_and_test(&svc->refcnt))
+	if (atomic_dec_and_test(&svc->refcnt)) {
+		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+			      svc->fwmark,
+			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+			      ntohs(svc->port), atomic_read(&svc->usecnt));
 		kfree(svc);
+	}
 }
 
 
@@ -758,8 +763,8 @@ ip_vs_zero_stats(struct ip_vs_stats *sta
  *	Update a destination in the given service
  */
 static void
-__ip_vs_update_dest(struct ip_vs_service *svc,
-		    struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+		    struct ip_vs_dest_user_kern *udest, int add)
 {
 	int conn_flags;
 
@@ -813,6 +818,25 @@ __ip_vs_update_dest(struct ip_vs_service
 		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
 	dest->u_threshold = udest->u_threshold;
 	dest->l_threshold = udest->l_threshold;
+
+	if (add)
+		ip_vs_new_estimator(&dest->stats);
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/* Wait until all other svc users go away */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+
+	if (add) {
+		list_add(&dest->n_list, &svc->destinations);
+		svc->num_dests++;
+	}
+
+	/* call the update_service, because server weight may be changed */
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
 }
 
 
@@ -860,13 +884,12 @@ ip_vs_new_dest(struct ip_vs_service *svc
 	atomic_set(&dest->activeconns, 0);
 	atomic_set(&dest->inactconns, 0);
 	atomic_set(&dest->persistconns, 0);
-	atomic_set(&dest->refcnt, 0);
+	atomic_set(&dest->refcnt, 1);
 
 	INIT_LIST_HEAD(&dest->d_list);
 	spin_lock_init(&dest->dst_lock);
 	spin_lock_init(&dest->stats.lock);
-	__ip_vs_update_dest(svc, dest, udest);
-	ip_vs_new_estimator(&dest->stats);
+	__ip_vs_update_dest(svc, dest, udest, 1);
 
 	*dest_p = dest;
 
@@ -926,65 +949,22 @@ ip_vs_add_dest(struct ip_vs_service *svc
 			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
 			      ntohs(dest->vport));
 
-		__ip_vs_update_dest(svc, dest, udest);
-
 		/*
 		 * Get the destination from the trash
 		 */
 		list_del(&dest->n_list);
 
-		ip_vs_new_estimator(&dest->stats);
-
-		write_lock_bh(&__ip_vs_svc_lock);
-
+		__ip_vs_update_dest(svc, dest, udest, 1);
+		ret = 0;
+	} else {
 		/*
-		 * Wait until all other svc users go away.
+		 * Allocate and initialize the dest structure
 		 */
-		IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-		list_add(&dest->n_list, &svc->destinations);
-		svc->num_dests++;
-
-		/* call the update_service function of its scheduler */
-		if (svc->scheduler->update_service)
-			svc->scheduler->update_service(svc);
-
-		write_unlock_bh(&__ip_vs_svc_lock);
-		return 0;
-	}
-
-	/*
-	 * Allocate and initialize the dest structure
-	 */
-	ret = ip_vs_new_dest(svc, udest, &dest);
-	if (ret) {
-		return ret;
+		ret = ip_vs_new_dest(svc, udest, &dest);
 	}
-
-	/*
-	 * Add the dest entry into the list
-	 */
-	atomic_inc(&dest->refcnt);
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/*
-	 * Wait until all other svc users go away.
-	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	list_add(&dest->n_list, &svc->destinations);
-	svc->num_dests++;
-
-	/* call the update_service function of its scheduler */
-	if (svc->scheduler->update_service)
-		svc->scheduler->update_service(svc);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
-
 	LeaveFunction(2);
 
-	return 0;
+	return ret;
 }
 
 
@@ -1023,19 +1003,7 @@ ip_vs_edit_dest(struct ip_vs_service *sv
 		return -ENOENT;
 	}
 
-	__ip_vs_update_dest(svc, dest, udest);
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/* Wait until all other svc users go away */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	/* call the update_service, because server weight may be changed */
-	if (svc->scheduler->update_service)
-		svc->scheduler->update_service(svc);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
-
+	__ip_vs_update_dest(svc, dest, udest, 0);
 	LeaveFunction(2);
 
 	return 0;
@@ -1062,6 +1030,10 @@ static void __ip_vs_del_dest(struct ip_v
 	 *  the destination into the trash.
 	 */
 	if (atomic_dec_and_test(&dest->refcnt)) {
+		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port));
 		ip_vs_dst_reset(dest);
 		/* simply decrease svc->refcnt here, let the caller check
 		   and release the service if nobody refers to it.
@@ -1128,7 +1100,7 @@ ip_vs_del_dest(struct ip_vs_service *svc
 	/*
 	 *	Wait until all other svc users go away.
 	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	/*
 	 *	Unlink dest from the service
@@ -1185,7 +1157,7 @@ ip_vs_add_service(struct ip_vs_service_u
 	}
 
 	/* I'm the first user of the service */
-	atomic_set(&svc->usecnt, 1);
+	atomic_set(&svc->usecnt, 0);
 	atomic_set(&svc->refcnt, 0);
 
 	svc->af = u->af;
@@ -1279,7 +1251,7 @@ ip_vs_edit_service(struct ip_vs_service 
 	/*
 	 * Wait until all other svc users go away.
 	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	/*
 	 * Set the flags and timeout value
@@ -1378,21 +1350,23 @@ static void __ip_vs_del_service(struct i
 	/*
 	 *    Free the service if nobody refers to it
 	 */
-	if (atomic_read(&svc->refcnt) == 0)
+	if (atomic_read(&svc->refcnt) == 0) {
+		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+			      svc->fwmark,
+			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+			      ntohs(svc->port), atomic_read(&svc->usecnt));
 		kfree(svc);
+	}
 
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
 }
 
 /*
- *	Delete a service from the service list
+ * Unlink a service from list and try to delete it if its refcnt reached 0
  */
-static int ip_vs_del_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc)
 {
-	if (svc == NULL)
-		return -EEXIST;
-
 	/*
 	 * Unhash it from the service table
 	 */
@@ -1403,11 +1377,21 @@ static int ip_vs_del_service(struct ip_v
 	/*
 	 * Wait until all the svc users go away.
 	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	__ip_vs_del_service(svc);
 
 	write_unlock_bh(&__ip_vs_svc_lock);
+}
+
+/*
+ *	Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+	if (svc == NULL)
+		return -EEXIST;
+	ip_vs_unlink_service(svc);
 
 	return 0;
 }
@@ -1426,14 +1410,7 @@ static int ip_vs_flush(void)
 	 */
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
-			write_lock_bh(&__ip_vs_svc_lock);
-			ip_vs_svc_unhash(svc);
-			/*
-			 * Wait until all the svc users go away.
-			 */
-			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-			__ip_vs_del_service(svc);
-			write_unlock_bh(&__ip_vs_svc_lock);
+			ip_vs_unlink_service(svc);
 		}
 	}
 
@@ -1443,14 +1420,7 @@ static int ip_vs_flush(void)
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry_safe(svc, nxt,
 					 &ip_vs_svc_fwm_table[idx], f_list) {
-			write_lock_bh(&__ip_vs_svc_lock);
-			ip_vs_svc_unhash(svc);
-			/*
-			 * Wait until all the svc users go away.
-			 */
-			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-			__ip_vs_del_service(svc);
-			write_unlock_bh(&__ip_vs_svc_lock);
+			ip_vs_unlink_service(svc);
 		}
 	}
 
@@ -2147,15 +2117,15 @@ do_ip_vs_set_ctl(struct sock *sk, int cm
 
 	/* Lookup the exact service by <protocol, addr, port> or fwmark */
 	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-					  &usvc.addr, usvc.port);
+		svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+					   &usvc.addr, usvc.port);
 	else
-		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+		svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
 
 	if (cmd != IP_VS_SO_SET_ADD
 	    && (svc == NULL || svc->protocol != usvc.protocol)) {
 		ret = -ESRCH;
-		goto out_drop_service;
+		goto out_unlock;
 	}
 
 	switch (cmd) {
@@ -2189,10 +2159,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cm
 		ret = -EINVAL;
 	}
 
-out_drop_service:
-	if (svc)
-		ip_vs_service_put(svc);
-
   out_unlock:
 	mutex_unlock(&__ip_vs_mutex);
   out_dec:
@@ -2285,10 +2251,10 @@ __ip_vs_get_dest_entries(const struct ip
 	int ret = 0;
 
 	if (get->fwmark)
-		svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
+		svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
 	else
-		svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
-					  get->port);
+		svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+					   get->port);
 
 	if (svc) {
 		int count = 0;
@@ -2316,7 +2282,6 @@ __ip_vs_get_dest_entries(const struct ip
 			}
 			count++;
 		}
-		ip_vs_service_put(svc);
 	} else
 		ret = -ESRCH;
 	return ret;
@@ -2437,15 +2402,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cm
 		entry = (struct ip_vs_service_entry *)arg;
 		addr.ip = entry->addr;
 		if (entry->fwmark)
-			svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
+			svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
 		else
-			svc = __ip_vs_service_get(AF_INET, entry->protocol,
-						  &addr, entry->port);
+			svc = __ip_vs_service_find(AF_INET, entry->protocol,
+						   &addr, entry->port);
 		if (svc) {
 			ip_vs_copy_service(entry, svc);
 			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
 				ret = -EFAULT;
-			ip_vs_service_put(svc);
 		} else
 			ret = -ESRCH;
 	}
@@ -2712,10 +2676,12 @@ nla_put_failure:
 }
 
 static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
-				    struct nlattr *nla, int full_entry)
+				    struct nlattr *nla, int full_entry,
+				    struct ip_vs_service **ret_svc)
 {
 	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
 	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+	struct ip_vs_service *svc;
 
 	/* Parse mandatory identifying service fields first */
 	if (nla == NULL ||
@@ -2751,12 +2717,18 @@ static int ip_vs_genl_parse_service(stru
 		usvc->fwmark = 0;
 	}
 
+	if (usvc->fwmark)
+		svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+	else
+		svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+					   &usvc->addr, usvc->port);
+	*ret_svc = svc;
+
 	/* If a full entry was requested, check for the additional fields */
 	if (full_entry) {
 		struct nlattr *nla_sched, *nla_flags, *nla_timeout,
 			      *nla_netmask;
 		struct ip_vs_flags flags;
-		struct ip_vs_service *svc;
 
 		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
 		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
@@ -2769,16 +2741,8 @@ static int ip_vs_genl_parse_service(stru
 		nla_memcpy(&flags, nla_flags, sizeof(flags));
 
 		/* prefill flags from service if it already exists */
-		if (usvc->fwmark)
-			svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
-		else
-			svc = __ip_vs_service_get(usvc->af, usvc->protocol,
-						  &usvc->addr, usvc->port);
-		if (svc) {
+		if (svc)
 			usvc->flags = svc->flags;
-			ip_vs_service_put(svc);
-		} else
-			usvc->flags = 0;
 
 		/* set new flags from userland */
 		usvc->flags = (usvc->flags & ~flags.mask) |
@@ -2794,17 +2758,11 @@ static int ip_vs_genl_parse_service(stru
 static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
 {
 	struct ip_vs_service_user_kern usvc;
+	struct ip_vs_service *svc;
 	int ret;
 
-	ret = ip_vs_genl_parse_service(&usvc, nla, 0);
-	if (ret)
-		return ERR_PTR(ret);
-
-	if (usvc.fwmark)
-		return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-	else
-		return __ip_vs_service_get(usvc.af, usvc.protocol,
-					   &usvc.addr, usvc.port);
+	ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+	return ret ? ERR_PTR(ret) : svc;
 }
 
 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
@@ -2895,7 +2853,6 @@ static int ip_vs_genl_dump_dests(struct 
 
 nla_put_failure:
 	cb->args[0] = idx;
-	ip_vs_service_put(svc);
 
 out_err:
 	mutex_unlock(&__ip_vs_mutex);
@@ -3108,17 +3065,10 @@ static int ip_vs_genl_set_cmd(struct sk_
 
 	ret = ip_vs_genl_parse_service(&usvc,
 				       info->attrs[IPVS_CMD_ATTR_SERVICE],
-				       need_full_svc);
+				       need_full_svc, &svc);
 	if (ret)
 		goto out;
 
-	/* Lookup the exact service by <protocol, addr, port> or fwmark */
-	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-					  &usvc.addr, usvc.port);
-	else
-		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-
 	/* Unless we're adding a new service, the service must already exist */
 	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
 		ret = -ESRCH;
@@ -3152,6 +3102,7 @@ static int ip_vs_genl_set_cmd(struct sk_
 		break;
 	case IPVS_CMD_DEL_SERVICE:
 		ret = ip_vs_del_service(svc);
+		/* do not use svc, it can be freed */
 		break;
 	case IPVS_CMD_NEW_DEST:
 		ret = ip_vs_add_dest(svc, &udest);
@@ -3170,8 +3121,6 @@ static int ip_vs_genl_set_cmd(struct sk_
 	}
 
 out:
-	if (svc)
-		ip_vs_service_put(svc);
 	mutex_unlock(&__ip_vs_mutex);
 
 	return ret;
@@ -3217,7 +3166,6 @@ static int ip_vs_genl_get_cmd(struct sk_
 			goto out_err;
 		} else if (svc) {
 			ret = ip_vs_genl_fill_service(msg, svc);
-			ip_vs_service_put(svc);
 			if (ret)
 				goto nla_put_failure;
 		} else {

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] ipvs: changes related to service usecnt
  2010-09-18 14:18 [PATCH] ipvs: changes related to service usecnt Julian Anastasov
@ 2010-09-19 11:48 ` Simon Horman
  2010-09-21 16:15   ` Patrick McHardy
  2010-09-20 10:13 ` [RFC][PATCH] ipvs: IPv6 tunnel mode Hans Schillstrom
  1 sibling, 1 reply; 8+ messages in thread
From: Simon Horman @ 2010-09-19 11:48 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: Patrick McHardy, lvs-devel, netfilter-devel

On Sat, Sep 18, 2010 at 05:18:46PM +0300, Julian Anastasov wrote:
> 
> 	Change the usage of svc usecnt during command execution:
> 
> - we check if svc is registered but we do not need to hold usecnt
> reference while under __ip_vs_mutex, only the packet handling needs
> it during scheduling
> 
> - change __ip_vs_service_get to __ip_vs_service_find and
> __ip_vs_svc_fwm_get to __ip_vs_svc_fwm_find because now caller
> will increase svc->usecnt
> 
> - put common code that calls update_service in __ip_vs_update_dest
> 
> - put common code in ip_vs_unlink_service() and use it to unregister
> the service
> 
> - add comment that svc should not be accessed after ip_vs_del_service 
> anymore
> 
> - all IP_VS_WAIT_WHILE calls are now unified: usecnt > 0
> 
> - Properly log the app ports
> 
> 	As result, some problems are fixed:
> 
> - possible use-after-free of svc in ip_vs_genl_set_cmd after
> ip_vs_del_service because our usecnt reference does not guarantee that
> svc is not freed on refcnt==0, eg. when no dests are moved to trash
> 
> - possible usecnt leak in do_ip_vs_set_ctl after ip_vs_del_service
> when the service is not freed now, for example, when some
> destionations are moved into trash and svc->refcnt remains above 0.
> It is harmless because svc is not in hash anymore.
> 
> Signed-off-by: Julian Anastasov <ja@ssi.bg>

Hi Julian,

thats a pretty big patch to review, but it looks good to me.

Acked-by: Simon Horman <horms@verge.net.au>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [RFC][PATCH] ipvs: IPv6 tunnel mode
  2010-09-18 14:18 [PATCH] ipvs: changes related to service usecnt Julian Anastasov
  2010-09-19 11:48 ` Simon Horman
@ 2010-09-20 10:13 ` Hans Schillstrom
  2010-09-22 13:50   ` Julian Anastasov
  1 sibling, 1 reply; 8+ messages in thread
From: Hans Schillstrom @ 2010-09-20 10:13 UTC (permalink / raw)
  To: lvs-devel; +Cc: Simon Horman, Julian Anastasov

Tunnel mode for IPv6 doesn't work.

IPv6 encapsulation uses a bad source address for the tunnel.
i.e. VIP will be used as local-addr and encap. dst addr.
Decapsulation will not accept this.

Example
LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100)
    (eth0 2003::1:0:1/96)
RS  (ethX 2003::1:0:5/96)     

tcpdump
 2003::2:0:100 > 2003::1:0:5: 
 IP6 (hlim 63, next-header TCP (6) payload length: 40) 
  2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312
(correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val
1904932 ecr 0,nop,wscale 3], length 0


In Linux IPv6 impl. you can't have a tunnel with an any cast address
receiving packets (I have not tried to interpret RFC 2473)
To have receive capabilities the tunnel must have:
  - Local address set as multicast addr or an unicast addr
  - Remote address set as an unicast addr.
  - Loop back addres or Link local address are not allowed.

This causes us to setup a tunnel in the Real Server with the
LVS as the remote address, here you can't use the VIP address since it's
used inside the tunnel.

Solution
Use outgoing interface IPv6 address (match against the destination).
i.e. use ipv6_dev_get_saddr(...) to set the source address of the
encapsulated package.

However this lookup cost some cpu time to do, so there is some speed
enhancements to this patch
 
a) Cache the result
b) Static configuration i.e. used a static configured address
   (since the tunnel in RS use a static address remote address)

Here follows the patch without any enhancements mention above.

Signed-off-by: hans.schillstrom@ericsson.com

---

--- linux-2.6.35.next/net/netfilter/ipvs/ip_vs_xmit.c       2010-08-13
12:18:12.000000000 +0200
+++ linux-2.6.35.x/net/netfilter/ipvs/ip_vs_xmit.c  2010-09-20
11:38:50.000000000 +0200

@@ -688,10 +702,25 @@
        rt = __ip_vs_get_out_rt_v6(cp);
        if (!rt)
                goto tx_error_icmp;

        tdev = rt->dst.dev;
+       /* Lookup source address for the tunnel */
+        if (likely(ipv6_addr_any(&rt->rt6i_src.addr))) {
+                struct net *net = dev_net(skb->dev);
+                int err = ipv6_dev_get_saddr(net, tdev,
+                                         &rt->rt6i_dst.addr,
+                                         0,
+                                         &rt->rt6i_src.addr);
+                /* RFC 2473 Ch 8. IPv6 Tunnel Error Processing and
Reporting
+                 *   Both tunnel header and tunnel packet problems are
reported
+                 *   to the tunnel entry-point node.
+                 */
+                if (err) {
+                        goto tx_error_icmp;
+                }
+        }

        mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
        /* TODO IPv6: do we need this check in IPv6? */
        if (mtu < 1280) {
                dst_release(&rt->dst);
@@ -747,11 +776,11 @@
        iph->payload_len        =       old_iph->payload_len;
        be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
        iph->priority           =       old_iph->priority;
        memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
        iph->daddr              =       rt->rt6i_dst.addr;
-       iph->saddr              =       cp->vaddr.in6; /*
rt->rt6i_src.addr; */
+       iph->saddr              =       rt->rt6i_src.addr;
        iph->hop_limit          =       old_iph->hop_limit;

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;
--



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] ipvs: changes related to service usecnt
  2010-09-19 11:48 ` Simon Horman
@ 2010-09-21 16:15   ` Patrick McHardy
  0 siblings, 0 replies; 8+ messages in thread
From: Patrick McHardy @ 2010-09-21 16:15 UTC (permalink / raw)
  To: Simon Horman; +Cc: Julian Anastasov, lvs-devel, netfilter-devel

Am 19.09.2010 13:48, schrieb Simon Horman:
> On Sat, Sep 18, 2010 at 05:18:46PM +0300, Julian Anastasov wrote:
>> > 
>> > 	Change the usage of svc usecnt during command execution:
>> > 
>> > - we check if svc is registered but we do not need to hold usecnt
>> > reference while under __ip_vs_mutex, only the packet handling needs
>> > it during scheduling
>> > 
>> > - change __ip_vs_service_get to __ip_vs_service_find and
>> > __ip_vs_svc_fwm_get to __ip_vs_svc_fwm_find because now caller
>> > will increase svc->usecnt
>> > 
>> > - put common code that calls update_service in __ip_vs_update_dest
>> > 
>> > - put common code in ip_vs_unlink_service() and use it to unregister
>> > the service
>> > 
>> > - add comment that svc should not be accessed after ip_vs_del_service 
>> > anymore
>> > 
>> > - all IP_VS_WAIT_WHILE calls are now unified: usecnt > 0
>> > 
>> > - Properly log the app ports
>> > 
>> > 	As result, some problems are fixed:
>> > 
>> > - possible use-after-free of svc in ip_vs_genl_set_cmd after
>> > ip_vs_del_service because our usecnt reference does not guarantee that
>> > svc is not freed on refcnt==0, eg. when no dests are moved to trash
>> > 
>> > - possible usecnt leak in do_ip_vs_set_ctl after ip_vs_del_service
>> > when the service is not freed now, for example, when some
>> > destionations are moved into trash and svc->refcnt remains above 0.
>> > It is harmless because svc is not in hash anymore.
>> > 
>> > Signed-off-by: Julian Anastasov <ja@ssi.bg>
> Hi Julian,
> 
> thats a pretty big patch to review, but it looks good to me.

Indeed, I'd prefer slightly smaller patches too.
 
> Acked-by: Simon Horman <horms@verge.net.au>

Applied, thanks.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC][PATCH] ipvs: IPv6 tunnel mode
  2010-09-20 10:13 ` [RFC][PATCH] ipvs: IPv6 tunnel mode Hans Schillstrom
@ 2010-09-22 13:50   ` Julian Anastasov
  2010-09-23 16:16     ` Hans Schillstrom
  0 siblings, 1 reply; 8+ messages in thread
From: Julian Anastasov @ 2010-09-22 13:50 UTC (permalink / raw)
  To: Hans Schillstrom; +Cc: lvs-devel, Simon Horman, Julius Volz


 	Hello,

On Mon, 20 Sep 2010, Hans Schillstrom wrote:

> Tunnel mode for IPv6 doesn't work.
>
> IPv6 encapsulation uses a bad source address for the tunnel.
> i.e. VIP will be used as local-addr and encap. dst addr.
> Decapsulation will not accept this.

 	Reading http://kb.linuxvirtualserver.org/wiki/IPv6_load_balancing
may be TUN for IPv6 is not fully tested. Using VIP as saddr
looks like spoofing because it can be configured on real server
too. CC-ed Julius Volz to comment about using cp->vaddr as
source for tunnel. May be it was used because rt6i_src is zero.

> Example
> LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100)
>    (eth0 2003::1:0:1/96)
> RS  (ethX 2003::1:0:5/96)
>
> tcpdump
> 2003::2:0:100 > 2003::1:0:5:
> IP6 (hlim 63, next-header TCP (6) payload length: 40)
>  2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312
> (correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val
> 1904932 ecr 0,nop,wscale 3], length 0
>
>
> In Linux IPv6 impl. you can't have a tunnel with an any cast address
> receiving packets (I have not tried to interpret RFC 2473)
> To have receive capabilities the tunnel must have:
>  - Local address set as multicast addr or an unicast addr
>  - Remote address set as an unicast addr.
>  - Loop back addres or Link local address are not allowed.

 	Yes, the tunnel device has many options for assigning
local/remote addresses depending on traffic direction but
for IPVS we use only unicast->unicast traffic. We even
do not configure any tunnel parameters in real server
for IPv4, just adding some address is enough, eg. 0.0.0.0.
New kernels will need {all,default}/rp_filter=0 together
with tunl*/rp_filter=0 because MAX(all,dev}/rp_filter is used.

> This causes us to setup a tunnel in the Real Server with the
> LVS as the remote address, here you can't use the VIP address since it's
> used inside the tunnel.
>
> Solution
> Use outgoing interface IPv6 address (match against the destination).
> i.e. use ipv6_dev_get_saddr(...) to set the source address of the
> encapsulated package.
>
> However this lookup cost some cpu time to do, so there is some speed
> enhancements to this patch

 	The dst caching should work as for IPv4.
To properly cache for IPv6 we need to add new fields in
struct ip_vs_dest: dst_cookie, dst_saddr. We can do it just like
in net/ipv6/ip6_tunnel.c:ip6_tnl_dst_check(). ip6_tnl_xmit2()
is a good example for IPv6 tunneling.

> a) Cache the result
> b) Static configuration i.e. used a static configured address
>   (since the tunnel in RS use a static address remote address)
>
> Here follows the patch without any enhancements mention above.
>
> Signed-off-by: hans.schillstrom@ericsson.com
>
> ---
>
> --- linux-2.6.35.next/net/netfilter/ipvs/ip_vs_xmit.c       2010-08-13
> 12:18:12.000000000 +0200
> +++ linux-2.6.35.x/net/netfilter/ipvs/ip_vs_xmit.c  2010-09-20
> 11:38:50.000000000 +0200
>
> @@ -688,10 +702,25 @@
>        rt = __ip_vs_get_out_rt_v6(cp);
>        if (!rt)
>                goto tx_error_icmp;
>
>        tdev = rt->dst.dev;
> +       /* Lookup source address for the tunnel */
> +        if (likely(ipv6_addr_any(&rt->rt6i_src.addr))) {
> +                struct net *net = dev_net(skb->dev);
> +                int err = ipv6_dev_get_saddr(net, tdev,
> +                                         &rt->rt6i_dst.addr,
> +                                         0,
> +                                         &rt->rt6i_src.addr);

 	After looking at ipv6/ files I'm still not sure
if ip6_route_output() returns valid rt6i_src in all cases.
May be rt6i_src is filled only when CONFIG_IPV6_SUBTREES
is defined.

 	So, may be your approach to use ipv6_dev_get_saddr
is correct but I prefer a version that caches the routing
results. So, I created a new patch for testing. But I don't
have IPv6 setup to test it. Can you confirm that with this
patch applied the message "new dst %pI6, src %pI6" shows
correct results and that it does not appear for every packet.
The idea is to see this message only once if the dst caching
works for us. Patch is appended.

> +                /* RFC 2473 Ch 8. IPv6 Tunnel Error Processing and
> Reporting
> +                 *   Both tunnel header and tunnel packet problems are
> reported
> +                 *   to the tunnel entry-point node.
> +                 */
> +                if (err) {
> +                        goto tx_error_icmp;
> +                }
> +        }
>
>        mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
>        /* TODO IPv6: do we need this check in IPv6? */
>        if (mtu < 1280) {
>                dst_release(&rt->dst);
> @@ -747,11 +776,11 @@
>        iph->payload_len        =       old_iph->payload_len;
>        be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
>        iph->priority           =       old_iph->priority;
>        memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
>        iph->daddr              =       rt->rt6i_dst.addr;
> -       iph->saddr              =       cp->vaddr.in6; /*
> rt->rt6i_src.addr; */
> +       iph->saddr              =       rt->rt6i_src.addr;
>        iph->hop_limit          =       old_iph->hop_limit;
>
>        /* Another hack: avoid icmp_send in ip_fragment */
>        skb->local_df = 1;
> --

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---

diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h linux/include/net/ip_vs.h
--- net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h	2010-09-16 09:03:48.000000000 +0300
+++ linux/include/net/ip_vs.h	2010-09-22 10:50:18.548963467 +0300
@@ -509,6 +509,10 @@ struct ip_vs_dest {
  	spinlock_t		dst_lock;	/* lock of dst_cache */
  	struct dst_entry	*dst_cache;	/* destination cache entry */
  	u32			dst_rtos;	/* RT_TOS(tos) for dst */
+	u32			dst_cookie;
+#ifdef CONFIG_IP_VS_IPV6
+	struct in6_addr		dst_saddr;
+#endif

  	/* for virtual service */
  	struct ip_vs_service	*svc;		/* service it belongs to */
diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c linux/net/netfilter/ipvs/ip_vs_xmit.c
--- net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c	2010-09-16 09:02:25.000000000 +0300
+++ linux/net/netfilter/ipvs/ip_vs_xmit.c	2010-09-22 16:29:43.271964521 +0300
@@ -26,6 +26,7 @@
  #include <net/route.h>                  /* for ip_route_output */
  #include <net/ipv6.h>
  #include <net/ip6_route.h>
+#include <net/addrconf.h>
  #include <linux/icmpv6.h>
  #include <linux/netfilter.h>
  #include <linux/netfilter_ipv4.h>
@@ -37,26 +38,27 @@
   *      Destination cache to speed up outgoing route lookup
   */
  static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
+		u32 dst_cookie)
  {
  	struct dst_entry *old_dst;

  	old_dst = dest->dst_cache;
  	dest->dst_cache = dst;
  	dest->dst_rtos = rtos;
+	dest->dst_cookie = dst_cookie;
  	dst_release(old_dst);
  }

  static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
  {
  	struct dst_entry *dst = dest->dst_cache;

  	if (!dst)
  		return NULL;
-	if ((dst->obsolete
-	     || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
-	    dst->ops->check(dst, cookie) == NULL) {
+	if ((dst->obsolete || rtos != dest->dst_rtos) &&
+	    dst->ops->check(dst, dest->dst_cookie) == NULL) {
  		dest->dst_cache = NULL;
  		dst_release(dst);
  		return NULL;
@@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *des
  }

  static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
  {
+	struct net *net = dev_net(skb->dev);
  	struct rtable *rt;			/* Route to the other host */
  	struct ip_vs_dest *dest = cp->dest;

  	if (dest) {
  		spin_lock(&dest->dst_lock);
  		if (!(rt = (struct rtable *)
-		      __ip_vs_dst_check(dest, rtos, 0))) {
+		      __ip_vs_dst_check(dest, rtos))) {
  			struct flowi fl = {
  				.oif = 0,
  				.nl_u = {
@@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
  						.tos = rtos, } },
  			};

-			if (ip_route_output_key(&init_net, &rt, &fl)) {
+			if (ip_route_output_key(net, &rt, &fl)) {
  				spin_unlock(&dest->dst_lock);
  				IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
  					     &dest->addr.ip);
  				return NULL;
  			}
-			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
+			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
  			IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
  				  &dest->addr.ip,
  				  atomic_read(&rt->dst.__refcnt), rtos);
@@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
  					.tos = rtos, } },
  		};

-		if (ip_route_output_key(&init_net, &rt, &fl)) {
+		if (ip_route_output_key(net, &rt, &fl)) {
  			IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
  				     &cp->daddr.ip);
  			return NULL;
@@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
  }

  #ifdef CONFIG_IP_VS_IPV6
+
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+			struct in6_addr *ret_saddr, int do_xfrm)
+{
+	struct dst_entry *dst;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = *daddr,
+			},
+		},
+	};
+
+	dst = ip6_route_output(net, NULL, &fl);
+	if (dst->error)
+		goto out_err;
+	if (!ret_saddr)
+		return dst;
+	if (ipv6_addr_any(&fl.fl6_src) &&
+	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+			       &fl.fl6_dst, 0, &fl.fl6_src) < 0)
+		goto out_err;
+	if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+		goto out_err;
+	ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+	return dst;
+
+out_err:
+	dst_release(dst);
+	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+	return NULL;
+}
+
  static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		      struct in6_addr *ret_saddr, int do_xfrm)
  {
+	struct net *net = dev_net(skb->dev);
  	struct rt6_info *rt;			/* Route to the other host */
  	struct ip_vs_dest *dest = cp->dest;
+	struct dst_entry *dst;

  	if (dest) {
  		spin_lock(&dest->dst_lock);
-		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
  		if (!rt) {
-			struct flowi fl = {
-				.oif = 0,
-				.nl_u = {
-					.ip6_u = {
-						.daddr = dest->addr.in6,
-						.saddr = {
-							.s6_addr32 =
-								{ 0, 0, 0, 0 },
-						},
-					},
-				},
-			};
+			u32 cookie;

-			rt = (struct rt6_info *)ip6_route_output(&init_net,
-								 NULL, &fl);
-			if (!rt) {
+			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+						      &dest->dst_saddr,
+						      do_xfrm);
+			if (!dst) {
  				spin_unlock(&dest->dst_lock);
-				IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-					     &dest->addr.in6);
  				return NULL;
  			}
-			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
-			IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
-				  &dest->addr.in6,
+			rt = (struct rt6_info *) dst;
+			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+				  &dest->addr.in6, &dest->dst_saddr,
  				  atomic_read(&rt->dst.__refcnt));
  		}
+		if (ret_saddr)
+			ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
  		spin_unlock(&dest->dst_lock);
  	} else {
-		struct flowi fl = {
-			.oif = 0,
-			.nl_u = {
-				.ip6_u = {
-					.daddr = cp->daddr.in6,
-					.saddr = {
-						.s6_addr32 = { 0, 0, 0, 0 },
-					},
-				},
-			},
-		};
-
-		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-		if (!rt) {
-			IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-				     &cp->daddr.in6);
+		dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
+					      do_xfrm);
+		if (!dst)
  			return NULL;
-		}
+		rt = (struct rt6_info *) dst;
  	}

  	return rt;
@@ -248,6 +268,7 @@ int
  ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
  		  struct ip_vs_protocol *pp)
  {
+	struct net *net = dev_net(skb->dev);
  	struct rtable *rt;			/* Route to the other host */
  	struct iphdr  *iph = ip_hdr(skb);
  	u8     tos = iph->tos;
@@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s

  	EnterFunction(10);

-	if (ip_route_output_key(&init_net, &rt, &fl)) {
+	if (ip_route_output_key(net, &rt, &fl)) {
  		IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
  			     __func__, &iph->daddr);
  		goto tx_error_icmp;
@@ -313,25 +334,18 @@ int
  ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
  		     struct ip_vs_protocol *pp)
  {
+	struct net *net = dev_net(skb->dev);
+	struct dst_entry *dst;
  	struct rt6_info *rt;			/* Route to the other host */
  	struct ipv6hdr  *iph = ipv6_hdr(skb);
  	int    mtu;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip6_u = {
-				.daddr = iph->daddr,
-				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
-	};

  	EnterFunction(10);

-	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-	if (!rt) {
-		IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
-			     __func__, &iph->daddr);
+	dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
+	if (!dst)
  		goto tx_error_icmp;
-	}
+	rt = (struct rt6_info *) dst;

  	/* MTU checking */
  	mtu = dst_mtu(&rt->dst);
@@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru
  		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
  	}

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
  		goto tx_error_icmp;

  	/* MTU checking */
@@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, s
  		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
  	}

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
  	if (!rt)
  		goto tx_error_icmp;

@@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
  	struct iphdr  *old_iph = ip_hdr(skb);
  	u8     tos = old_iph->tos;
  	__be16 df = old_iph->frag_off;
-	sk_buff_data_t old_transport_header = skb->transport_header;
  	struct iphdr  *iph;			/* Our new IP header */
  	unsigned int max_headroom;		/* The extra header space needed */
  	int    mtu;
@@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
  		goto tx_error;
  	}

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
  		goto tx_error_icmp;

  	tdev = rt->dst.dev;
@@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
  		old_iph = ip_hdr(skb);
  	}

-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;

  	/* fix old IP header checksum */
  	ip_send_check(old_iph);
@@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
  		     struct ip_vs_protocol *pp)
  {
  	struct rt6_info *rt;		/* Route to the other host */
+	struct in6_addr saddr;		/* Source for tunnel */
  	struct net_device *tdev;	/* Device to other host */
  	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
-	sk_buff_data_t old_transport_header = skb->transport_header;
  	struct ipv6hdr  *iph;		/* Our new IP header */
  	unsigned int max_headroom;	/* The extra header space needed */
  	int    mtu;
@@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
  		goto tx_error;
  	}

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
  	if (!rt)
  		goto tx_error_icmp;

  	tdev = rt->dst.dev;

  	mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
-	/* TODO IPv6: do we need this check in IPv6? */
-	if (mtu < 1280) {
+	if (mtu < IPV6_MIN_MTU) {
  		dst_release(&rt->dst);
-		IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
+		IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+			     IPV6_MIN_MTU);
  		goto tx_error;
  	}
  	if (skb_dst(skb))
@@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
  		old_iph = ipv6_hdr(skb);
  	}

-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;

  	skb_push(skb, sizeof(struct ipv6hdr));
  	skb_reset_network_header(skb);
@@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
  	be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
  	iph->priority		=	old_iph->priority;
  	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
-	iph->daddr		=	rt->rt6i_dst.addr;
-	iph->saddr		=	cp->vaddr.in6; /* rt->rt6i_src.addr; */
+	ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
+	ipv6_addr_copy(&iph->saddr, &saddr);
  	iph->hop_limit		=	old_iph->hop_limit;

  	/* Another hack: avoid icmp_send in ip_fragment */
@@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc

  	EnterFunction(10);

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
  		goto tx_error_icmp;

  	/* MTU checking */
@@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, st

  	EnterFunction(10);

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
  	if (!rt)
  		goto tx_error_icmp;

@@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str
  	 * mangle and send the packet here (only for VS/NAT)
  	 */

-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
  		goto tx_error_icmp;

  	/* MTU checking */
@@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb,
  	 * mangle and send the packet here (only for VS/NAT)
  	 */

-	rt = __ip_vs_get_out_rt_v6(cp);
+	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
  	if (!rt)
  		goto tx_error_icmp;


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC][PATCH] ipvs: IPv6 tunnel mode
  2010-09-22 13:50   ` Julian Anastasov
@ 2010-09-23 16:16     ` Hans Schillstrom
  2010-09-23 22:36       ` Julian Anastasov
  0 siblings, 1 reply; 8+ messages in thread
From: Hans Schillstrom @ 2010-09-23 16:16 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel, Simon Horman, Julius Volz

Hello 
This is a first test report for IPv6 cases 

On Wed, 2010-09-22 at 15:50 +0200, Julian Anastasov wrote:
>         Hello,
> 
> On Mon, 20 Sep 2010, Hans Schillstrom wrote:
> 
> > Tunnel mode for IPv6 doesn't work.
> >
> > IPv6 encapsulation uses a bad source address for the tunnel.
> > i.e. VIP will be used as local-addr and encap. dst addr.
> > Decapsulation will not accept this.
> 
>         Reading http://kb.linuxvirtualserver.org/wiki/IPv6_load_balancing
> may be TUN for IPv6 is not fully tested. Using VIP as saddr
> looks like spoofing because it can be configured on real server
> too. CC-ed Julius Volz to comment about using cp->vaddr as
> source for tunnel. May be it was used because rt6i_src is zero.
> 
> > Example
> > LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100)
> >    (eth0 2003::1:0:1/96)
> > RS  (ethX 2003::1:0:5/96)
> >
> > tcpdump
> > 2003::2:0:100 > 2003::1:0:5:
> > IP6 (hlim 63, next-header TCP (6) payload length: 40)
> >  2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312
> > (correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val
> > 1904932 ecr 0,nop,wscale 3], length 0
> >
> >
> > In Linux IPv6 impl. you can't have a tunnel with an any cast address
> > receiving packets (I have not tried to interpret RFC 2473)
> > To have receive capabilities the tunnel must have:
> >  - Local address set as multicast addr or an unicast addr
> >  - Remote address set as an unicast addr.
> >  - Loop back addres or Link local address are not allowed.
> 
>         Yes, the tunnel device has many options for assigning
> local/remote addresses depending on traffic direction but
> for IPVS we use only unicast->unicast traffic. We even
> do not configure any tunnel parameters in real server
> for IPv4, just adding some address is enough, eg. 0.0.0.0.
> New kernels will need {all,default}/rp_filter=0 together
> with tunl*/rp_filter=0 because MAX(all,dev}/rp_filter is used.
> 
> > This causes us to setup a tunnel in the Real Server with the
> > LVS as the remote address, here you can't use the VIP address since it's
> > used inside the tunnel.
> >
> > Solution
> > Use outgoing interface IPv6 address (match against the destination).
> > i.e. use ipv6_dev_get_saddr(...) to set the source address of the
> > encapsulated package.
> >
> > However this lookup cost some cpu time to do, so there is some speed
> > enhancements to this patch
> 
>         The dst caching should work as for IPv4.
> To properly cache for IPv6 we need to add new fields in
> struct ip_vs_dest: dst_cookie, dst_saddr. We can do it just like
> in net/ipv6/ip6_tunnel.c:ip6_tnl_dst_check(). ip6_tnl_xmit2()
> is a good example for IPv6 tunneling.
> 
> > a) Cache the result
> > b) Static configuration i.e. used a static configured address
> >   (since the tunnel in RS use a static address remote address)
> >
> > Here follows the patch without any enhancements mention above.
> >
> > Signed-off-by: hans.schillstrom@ericsson.com
> >
> > ---
> >
> > --- linux-2.6.35.next/net/netfilter/ipvs/ip_vs_xmit.c       2010-08-13
> > 12:18:12.000000000 +0200
> > +++ linux-2.6.35.x/net/netfilter/ipvs/ip_vs_xmit.c  2010-09-20
> > 11:38:50.000000000 +0200
> >
> > @@ -688,10 +702,25 @@
> >        rt = __ip_vs_get_out_rt_v6(cp);
> >        if (!rt)
> >                goto tx_error_icmp;
> >
> >        tdev = rt->dst.dev;
> > +       /* Lookup source address for the tunnel */
> > +        if (likely(ipv6_addr_any(&rt->rt6i_src.addr))) {
> > +                struct net *net = dev_net(skb->dev);
> > +                int err = ipv6_dev_get_saddr(net, tdev,
> > +                                         &rt->rt6i_dst.addr,
> > +                                         0,
> > +                                         &rt->rt6i_src.addr);
> 
>         After looking at ipv6/ files I'm still not sure
> if ip6_route_output() returns valid rt6i_src in all cases.
> May be rt6i_src is filled only when CONFIG_IPV6_SUBTREES
> is defined.
> 
Made some nice "printk" testing here ip6_route_output()
seldom returns any source address.

>         So, may be your approach to use ipv6_dev_get_saddr
> is correct but I prefer a version that caches the routing
> results. So, I created a new patch for testing. But I don't
> have IPv6 setup to test it. Can you confirm that with this
> patch applied the message "new dst %pI6, src %pI6" shows
> correct results and that it does not appear for every packet.
> The idea is to see this message only once if the dst caching
> works for us. Patch is appended.
IPv6
dst caching works.
Tunnel mode on same physical link as VIP works
Tunnel mode on other interface as VIP also work
and it also works in another network namespace !
More testing will be needed... 

IPv4
No tests made yet, I'll do that tomorrow. 
> 
> > +                /* RFC 2473 Ch 8. IPv6 Tunnel Error Processing and
> > Reporting
> > +                 *   Both tunnel header and tunnel packet problems are
> > reported
> > +                 *   to the tunnel entry-point node.
> > +                 */
> > +                if (err) {
> > +                        goto tx_error_icmp;
> > +                }
> > +        }
> >
> >        mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
> >        /* TODO IPv6: do we need this check in IPv6? */
> >        if (mtu < 1280) {
> >                dst_release(&rt->dst);
> > @@ -747,11 +776,11 @@
> >        iph->payload_len        =       old_iph->payload_len;
> >        be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
> >        iph->priority           =       old_iph->priority;
> >        memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
> >        iph->daddr              =       rt->rt6i_dst.addr;
> > -       iph->saddr              =       cp->vaddr.in6; /*
> > rt->rt6i_src.addr; */
> > +       iph->saddr              =       rt->rt6i_src.addr;
> >        iph->hop_limit          =       old_iph->hop_limit;
> >
> >        /* Another hack: avoid icmp_send in ip_fragment */
> >        skb->local_df = 1;
> > --
> 
> Signed-off-by: Julian Anastasov <ja@ssi.bg>
> ---
> 
> diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h linux/include/net/ip_vs.h
> --- net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h    2010-09-16 09:03:48.000000000 +0300
> +++ linux/include/net/ip_vs.h   2010-09-22 10:50:18.548963467 +0300
> @@ -509,6 +509,10 @@ struct ip_vs_dest {
>         spinlock_t              dst_lock;       /* lock of dst_cache */
>         struct dst_entry        *dst_cache;     /* destination cache entry */
>         u32                     dst_rtos;       /* RT_TOS(tos) for dst */
> +       u32                     dst_cookie;
> +#ifdef CONFIG_IP_VS_IPV6
> +       struct in6_addr         dst_saddr;
> +#endif
> 
>         /* for virtual service */
>         struct ip_vs_service    *svc;           /* service it belongs to */
> diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c linux/net/netfilter/ipvs/ip_vs_xmit.c
> --- net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c        2010-09-16 09:02:25.000000000 +0300
> +++ linux/net/netfilter/ipvs/ip_vs_xmit.c       2010-09-22 16:29:43.271964521 +0300
> @@ -26,6 +26,7 @@
>   #include <net/route.h>                  /* for ip_route_output */
>   #include <net/ipv6.h>
>   #include <net/ip6_route.h>
> +#include <net/addrconf.h>
>   #include <linux/icmpv6.h>
>   #include <linux/netfilter.h>
>   #include <linux/netfilter_ipv4.h>
> @@ -37,26 +38,27 @@
>    *      Destination cache to speed up outgoing route lookup
>    */
>   static inline void
> -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
> +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
> +               u32 dst_cookie)
>   {
>         struct dst_entry *old_dst;
> 
>         old_dst = dest->dst_cache;
>         dest->dst_cache = dst;
>         dest->dst_rtos = rtos;
> +       dest->dst_cookie = dst_cookie;
>         dst_release(old_dst);
>   }
> 
>   static inline struct dst_entry *
> -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
> +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
>   {
>         struct dst_entry *dst = dest->dst_cache;
> 
>         if (!dst)
>                 return NULL;
> -       if ((dst->obsolete
> -            || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
> -           dst->ops->check(dst, cookie) == NULL) {
> +       if ((dst->obsolete || rtos != dest->dst_rtos) &&
> +           dst->ops->check(dst, dest->dst_cookie) == NULL) {
>                 dest->dst_cache = NULL;
>                 dst_release(dst);
>                 return NULL;
> @@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *des
>   }
> 
>   static struct rtable *
> -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
> +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
>   {
> +       struct net *net = dev_net(skb->dev);
>         struct rtable *rt;                      /* Route to the other host */
>         struct ip_vs_dest *dest = cp->dest;
> 
>         if (dest) {
>                 spin_lock(&dest->dst_lock);
>                 if (!(rt = (struct rtable *)
> -                     __ip_vs_dst_check(dest, rtos, 0))) {
> +                     __ip_vs_dst_check(dest, rtos))) {
>                         struct flowi fl = {
>                                 .oif = 0,
>                                 .nl_u = {
> @@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
>                                                 .tos = rtos, } },
>                         };
> 
> -                       if (ip_route_output_key(&init_net, &rt, &fl)) {
> +                       if (ip_route_output_key(net, &rt, &fl)) {
>                                 spin_unlock(&dest->dst_lock);
>                                 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
>                                              &dest->addr.ip);
>                                 return NULL;
>                         }
> -                       __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
> +                       __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
>                         IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
>                                   &dest->addr.ip,
>                                   atomic_read(&rt->dst.__refcnt), rtos);
> @@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
>                                         .tos = rtos, } },
>                 };
> 
> -               if (ip_route_output_key(&init_net, &rt, &fl)) {
> +               if (ip_route_output_key(net, &rt, &fl)) {
>                         IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
>                                      &cp->daddr.ip);
>                         return NULL;
> @@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
>   }
> 
>   #ifdef CONFIG_IP_VS_IPV6
> +
> +static struct dst_entry *
> +__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
> +                       struct in6_addr *ret_saddr, int do_xfrm)
> +{
> +       struct dst_entry *dst;
> +       struct flowi fl = {
> +               .oif = 0,
> +               .nl_u = {
> +                       .ip6_u = {
> +                               .daddr = *daddr,
> +                       },
> +               },
> +       };
> +
> +       dst = ip6_route_output(net, NULL, &fl);
> +       if (dst->error)
> +               goto out_err;
> +       if (!ret_saddr)
> +               return dst;
> +       if (ipv6_addr_any(&fl.fl6_src) &&
> +           ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
> +                              &fl.fl6_dst, 0, &fl.fl6_src) < 0)
> +               goto out_err;
> +       if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
> +               goto out_err;
> +       ipv6_addr_copy(ret_saddr, &fl.fl6_src);
> +       return dst;
> +
> +out_err:
> +       dst_release(dst);
> +       IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
> +       return NULL;
> +}
> +
>   static struct rt6_info *
> -__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
> +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> +                     struct in6_addr *ret_saddr, int do_xfrm)
>   {
> +       struct net *net = dev_net(skb->dev);
>         struct rt6_info *rt;                    /* Route to the other host */
>         struct ip_vs_dest *dest = cp->dest;
> +       struct dst_entry *dst;
> 
>         if (dest) {
>                 spin_lock(&dest->dst_lock);
> -               rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
> +               rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
>                 if (!rt) {
> -                       struct flowi fl = {
> -                               .oif = 0,
> -                               .nl_u = {
> -                                       .ip6_u = {
> -                                               .daddr = dest->addr.in6,
> -                                               .saddr = {
> -                                                       .s6_addr32 =
> -                                                               { 0, 0, 0, 0 },
> -                                               },
> -                                       },
> -                               },
> -                       };
> +                       u32 cookie;
> 
> -                       rt = (struct rt6_info *)ip6_route_output(&init_net,
> -                                                                NULL, &fl);
> -                       if (!rt) {
> +                       dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
> +                                                     &dest->dst_saddr,
> +                                                     do_xfrm);
> +                       if (!dst) {
>                                 spin_unlock(&dest->dst_lock);
> -                               IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
> -                                            &dest->addr.in6);
>                                 return NULL;
>                         }
> -                       __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
> -                       IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
> -                                 &dest->addr.in6,
> +                       rt = (struct rt6_info *) dst;
> +                       cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
> +                       __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
> +                       IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
> +                                 &dest->addr.in6, &dest->dst_saddr,
>                                   atomic_read(&rt->dst.__refcnt));
>                 }
> +               if (ret_saddr)
> +                       ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
>                 spin_unlock(&dest->dst_lock);
>         } else {
> -               struct flowi fl = {
> -                       .oif = 0,
> -                       .nl_u = {
> -                               .ip6_u = {
> -                                       .daddr = cp->daddr.in6,
> -                                       .saddr = {
> -                                               .s6_addr32 = { 0, 0, 0, 0 },
> -                                       },
> -                               },
> -                       },
> -               };
> -
> -               rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
> -               if (!rt) {
> -                       IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
> -                                    &cp->daddr.in6);
> +               dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
> +                                             do_xfrm);
> +               if (!dst)
>                         return NULL;
> -               }
> +               rt = (struct rt6_info *) dst;
>         }
> 
>         return rt;
> @@ -248,6 +268,7 @@ int
>   ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
>                   struct ip_vs_protocol *pp)
>   {
> +       struct net *net = dev_net(skb->dev);
>         struct rtable *rt;                      /* Route to the other host */
>         struct iphdr  *iph = ip_hdr(skb);
>         u8     tos = iph->tos;
> @@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s
> 
>         EnterFunction(10);
> 
> -       if (ip_route_output_key(&init_net, &rt, &fl)) {
> +       if (ip_route_output_key(net, &rt, &fl)) {
>                 IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
>                              __func__, &iph->daddr);
>                 goto tx_error_icmp;
> @@ -313,25 +334,18 @@ int
>   ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
>                      struct ip_vs_protocol *pp)
>   {
> +       struct net *net = dev_net(skb->dev);
> +       struct dst_entry *dst;
>         struct rt6_info *rt;                    /* Route to the other host */
>         struct ipv6hdr  *iph = ipv6_hdr(skb);
>         int    mtu;
> -       struct flowi fl = {
> -               .oif = 0,
> -               .nl_u = {
> -                       .ip6_u = {
> -                               .daddr = iph->daddr,
> -                               .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
> -       };
> 
>         EnterFunction(10);
> 
> -       rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
> -       if (!rt) {
> -               IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
> -                            __func__, &iph->daddr);
> +       dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
> +       if (!dst)
>                 goto tx_error_icmp;
> -       }
> +       rt = (struct rt6_info *) dst;
> 
>         /* MTU checking */
>         mtu = dst_mtu(&rt->dst);
> @@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru
>                 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
>         }
> 
> -       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
> +       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
>                 goto tx_error_icmp;
> 
>         /* MTU checking */
> @@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, s
>                 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
>         }
> 
> -       rt = __ip_vs_get_out_rt_v6(cp);
> +       rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
>         if (!rt)
>                 goto tx_error_icmp;
> 
> @@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
>         struct iphdr  *old_iph = ip_hdr(skb);
>         u8     tos = old_iph->tos;
>         __be16 df = old_iph->frag_off;
> -       sk_buff_data_t old_transport_header = skb->transport_header;
>         struct iphdr  *iph;                     /* Our new IP header */
>         unsigned int max_headroom;              /* The extra header space needed */
>         int    mtu;
> @@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
>                 goto tx_error;
>         }
> 
> -       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
> +       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
>                 goto tx_error_icmp;
> 
>         tdev = rt->dst.dev;
> @@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
>                 old_iph = ip_hdr(skb);
>         }
> 
> -       skb->transport_header = old_transport_header;
> +       skb->transport_header = skb->network_header;
> 
>         /* fix old IP header checksum */
>         ip_send_check(old_iph);
> @@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
>                      struct ip_vs_protocol *pp)
>   {
>         struct rt6_info *rt;            /* Route to the other host */
> +       struct in6_addr saddr;          /* Source for tunnel */
>         struct net_device *tdev;        /* Device to other host */
>         struct ipv6hdr  *old_iph = ipv6_hdr(skb);
> -       sk_buff_data_t old_transport_header = skb->transport_header;
>         struct ipv6hdr  *iph;           /* Our new IP header */
>         unsigned int max_headroom;      /* The extra header space needed */
>         int    mtu;
> @@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
>                 goto tx_error;
>         }
> 
> -       rt = __ip_vs_get_out_rt_v6(cp);
> +       rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
>         if (!rt)
>                 goto tx_error_icmp;
> 
>         tdev = rt->dst.dev;
> 
>         mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
> -       /* TODO IPv6: do we need this check in IPv6? */
> -       if (mtu < 1280) {
> +       if (mtu < IPV6_MIN_MTU) {
>                 dst_release(&rt->dst);
> -               IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
> +               IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
> +                            IPV6_MIN_MTU);
>                 goto tx_error;
>         }
>         if (skb_dst(skb))
> @@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
>                 old_iph = ipv6_hdr(skb);
>         }
> 
> -       skb->transport_header = old_transport_header;
> +       skb->transport_header = skb->network_header;
> 
>         skb_push(skb, sizeof(struct ipv6hdr));
>         skb_reset_network_header(skb);
> @@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
>         be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
>         iph->priority           =       old_iph->priority;
>         memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
> -       iph->daddr              =       rt->rt6i_dst.addr;
> -       iph->saddr              =       cp->vaddr.in6; /* rt->rt6i_src.addr; */
> +       ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
> +       ipv6_addr_copy(&iph->saddr, &saddr);
>         iph->hop_limit          =       old_iph->hop_limit;
> 
>         /* Another hack: avoid icmp_send in ip_fragment */
> @@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc
> 
>         EnterFunction(10);
> 
> -       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
> +       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
>                 goto tx_error_icmp;
> 
>         /* MTU checking */
> @@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, st
> 
>         EnterFunction(10);
> 
> -       rt = __ip_vs_get_out_rt_v6(cp);
> +       rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
>         if (!rt)
>                 goto tx_error_icmp;
> 
> @@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str
>          * mangle and send the packet here (only for VS/NAT)
>          */
> 
> -       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
> +       if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
>                 goto tx_error_icmp;
> 
>         /* MTU checking */
> @@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb,
>          * mangle and send the packet here (only for VS/NAT)
>          */
> 
> -       rt = __ip_vs_get_out_rt_v6(cp);
> +       rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
>         if (!rt)
>                 goto tx_error_icmp;
> 


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC][PATCH] ipvs: IPv6 tunnel mode
  2010-09-23 16:16     ` Hans Schillstrom
@ 2010-09-23 22:36       ` Julian Anastasov
  2010-09-24 12:06         ` Hans Schillstrom
  0 siblings, 1 reply; 8+ messages in thread
From: Julian Anastasov @ 2010-09-23 22:36 UTC (permalink / raw)
  To: Hans Schillstrom; +Cc: lvs-devel, Simon Horman, Julius Volz


 	Hello,

On Thu, 23 Sep 2010, Hans Schillstrom wrote:

> IPv6
> dst caching works.
> Tunnel mode on same physical link as VIP works
> Tunnel mode on other interface as VIP also work
> and it also works in another network namespace !
> More testing will be needed...

 	Thanks! Should it work in another namespace ? :)
I'm sure your namespace work will make it possible.

> IPv4
> No tests made yet, I'll do that tomorrow.

 	The changes for IPv4 are little, I tested TUN for v4.
Let me know if you need more changes in this patch or
if I should push it as-is with your Signed-off-by line.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC][PATCH] ipvs: IPv6 tunnel mode
  2010-09-23 22:36       ` Julian Anastasov
@ 2010-09-24 12:06         ` Hans Schillstrom
  0 siblings, 0 replies; 8+ messages in thread
From: Hans Schillstrom @ 2010-09-24 12:06 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel, Simon Horman, Julius Volz

Hi Julian
I'm ready with my tests no bugs, so you can push it

Regads 
Hans Schillstom <hans.schillstom@ericsson.com>

On Fri, 2010-09-24 at 00:36 +0200, Julian Anastasov wrote:
>  	Hello,
> 
> On Thu, 23 Sep 2010, Hans Schillstrom wrote:
> 
> > IPv6
> > dst caching works.
> > Tunnel mode on same physical link as VIP works
> > Tunnel mode on other interface as VIP also work
> > and it also works in another network namespace !
> > More testing will be needed...
> 
>  	Thanks! Should it work in another namespace ? :)
> I'm sure your namespace work will make it possible.
> 
> > IPv4
> > No tests made yet, I'll do that tomorrow.
> 
>  	The changes for IPv4 are little, I tested TUN for v4.
> Let me know if you need more changes in this patch or
> if I should push it as-is with your Signed-off-by line.
> 
> Regards
> 
> --
> Julian Anastasov <ja@ssi.bg>


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2010-09-24 12:06 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-18 14:18 [PATCH] ipvs: changes related to service usecnt Julian Anastasov
2010-09-19 11:48 ` Simon Horman
2010-09-21 16:15   ` Patrick McHardy
2010-09-20 10:13 ` [RFC][PATCH] ipvs: IPv6 tunnel mode Hans Schillstrom
2010-09-22 13:50   ` Julian Anastasov
2010-09-23 16:16     ` Hans Schillstrom
2010-09-23 22:36       ` Julian Anastasov
2010-09-24 12:06         ` Hans Schillstrom

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.