* [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
@ 2013-05-24 12:09 Alexander Frolkin
  2013-05-24 15:05 ` Julian Anastasov
  0 siblings, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-05-24 12:09 UTC (permalink / raw)
  To: lvs-devel

[-- Attachment #1: Type: text/plain, Size: 1231 bytes --]

Hi,

I've added some features to LVS that we needed for our purposes, and I'd
like to submit a patch in case they might be useful to other users.

The patch is against the Ubuntu 12.04 kernel (3.2.0).

The patch adds three features:

1.  Sloppy TCP handling.  When enabled (net.ipv4.vs.sloppy_tcp=1,
default 0), it allows IPVS to create a TCP connection state on any TCP
packet, not just a SYN.  This allows connections to fail over to a
different director (our plan is to run multiple directors active-active)
without being reset.

2. SH rebalancing.  When enabled (net.ipv4.vs.sh_rebalance=1, default
0), virtual servers using SH (or SHP --- see below) scheduling will
retry the realserver selection if the realserver selected the first time
round is unavailable (e.g., because it has weight 0).  This allows
realservers to be paused on SH(P) virtual servers by setting the weight
to 0.

3. SHP (SH + port) scheduler.  This is a clone of the SH code, but
hacked to also take the port number (TCP, UDP, SCTP) into account.  This
may seem no different to round-robin, but in our scenario, if a
connection is failed over to a different director, this guarantees that
it will continue being forwarded to the same realserver.
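
To illustrate feature 3 before the full patch below: the SHP hash is
just the SH golden-ratio hash with the source port folded in.  Here is
a tiny stand-alone (userspace) sketch, where 255 stands in for
IP_VS_SHP_TAB_MASK with the default 8-bit table and the sample address
is arbitrary:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* Illustration only, not kernel code: hash on source address + port. */
static unsigned int shp_hashkey(uint32_t saddr_be, uint16_t sport, int offset)
{
	return (offset + (sport + ntohl(saddr_be)) * 2654435761UL) & 255;
}

int main(void)
{
	uint32_t saddr = htonl(0xc0a80001);	/* 192.168.0.1 */

	/* the same client usually lands in different buckets for
	 * different source ports, unlike plain SH */
	printf("%u %u\n", shp_hashkey(saddr, 40000, 0),
	       shp_hashkey(saddr, 40001, 0));
	return 0;
}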


Alex


[-- Attachment #2: ipvs.patch --]
[-- Type: text/plain, Size: 14877 bytes --]

diff -Nupr linux-source-3.2.0/include/net/ip_vs.h linux-source-3.2.0-patched/include/net/ip_vs.h
--- linux-source-3.2.0/include/net/ip_vs.h	2012-01-04 23:55:44.000000000 +0000
+++ linux-source-3.2.0-patched/include/net/ip_vs.h	2013-05-17 14:44:58.000000000 +0100
@@ -871,6 +871,8 @@ struct netns_ipvs {
 	int			sysctl_sync_ver;
 	int			sysctl_cache_bypass;
 	int			sysctl_expire_nodest_conn;
+	int			sysctl_sloppy_tcp;
+	int			sysctl_sh_rebalance;
 	int			sysctl_expire_quiescent_template;
 	int			sysctl_sync_threshold[2];
 	int			sysctl_nat_icmp_send;
@@ -911,6 +913,8 @@ struct netns_ipvs {
 #define DEFAULT_SYNC_THRESHOLD	3
 #define DEFAULT_SYNC_PERIOD	50
 #define DEFAULT_SYNC_VER	1
+#define DEFAULT_SLOPPY_TCP	0
+#define DEFAULT_SH_REBALANCE	0
 
 #ifdef CONFIG_SYSCTL
 
@@ -929,6 +933,16 @@ static inline int sysctl_sync_ver(struct
 	return ipvs->sysctl_sync_ver;
 }
 
+static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs)
+{
+	return ipvs->sysctl_sloppy_tcp;
+}
+
+static inline int sysctl_sh_rebalance(struct netns_ipvs *ipvs)
+{
+	return ipvs->sysctl_sh_rebalance;
+}
+
 #else
 
 static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
@@ -946,6 +960,16 @@ static inline int sysctl_sync_ver(struct
 	return DEFAULT_SYNC_VER;
 }
 
+static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs)
+{
+	return DEFAULT_SLOPPY_TCP;
+}
+
+static inline int sysctl_sh_rebalance(struct netns_ipvs *ipvs)
+{
+	return DEFAULT_SH_REBALANCE;
+}
+
 #endif
 
 /*
diff -Nupr linux-source-3.2.0/net/netfilter/ipvs/Kconfig linux-source-3.2.0-patched/net/netfilter/ipvs/Kconfig
--- linux-source-3.2.0/net/netfilter/ipvs/Kconfig	2012-01-04 23:55:44.000000000 +0000
+++ linux-source-3.2.0-patched/net/netfilter/ipvs/Kconfig	2013-05-17 14:44:58.000000000 +0100
@@ -206,6 +206,16 @@ config	IP_VS_SH
 	  If you want to compile it in kernel, say Y. To compile it as a
 	  module, choose M here. If unsure, say N.
 
+config	IP_VS_SHP
+	tristate "layer 4 source hashing scheduling"
+	---help---
+	  The source hashing scheduling algorithm assigns network
+	  connections to the servers through looking up a statically assigned
+	  hash table by their source IP addresses and ports.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
 config	IP_VS_SED
 	tristate "shortest expected delay scheduling"
 	---help---
diff -Nupr linux-source-3.2.0/net/netfilter/ipvs/Makefile linux-source-3.2.0-patched/net/netfilter/ipvs/Makefile
--- linux-source-3.2.0/net/netfilter/ipvs/Makefile	2012-01-04 23:55:44.000000000 +0000
+++ linux-source-3.2.0-patched/net/netfilter/ipvs/Makefile	2013-05-17 14:44:58.000000000 +0100
@@ -30,6 +30,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
 obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
 obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
 obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_SHP) += ip_vs_shp.o
 obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
 obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
 
diff -Nupr linux-source-3.2.0/net/netfilter/ipvs/ip_vs_ctl.c linux-source-3.2.0-patched/net/netfilter/ipvs/ip_vs_ctl.c
--- linux-source-3.2.0/net/netfilter/ipvs/ip_vs_ctl.c	2012-09-26 22:32:28.000000000 +0100
+++ linux-source-3.2.0-patched/net/netfilter/ipvs/ip_vs_ctl.c	2013-05-17 14:44:58.000000000 +0100
@@ -1730,6 +1730,18 @@ static struct ctl_table vs_vars[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "sloppy_tcp",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sh_rebalance",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "expire_quiescent_template",
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
@@ -3657,6 +3669,8 @@ int __net_init ip_vs_control_net_init_sy
 	tbl[idx++].data = &ipvs->sysctl_sync_ver;
 	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
 	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+	tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
+	tbl[idx++].data = &ipvs->sysctl_sh_rebalance;
 	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
 	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
 	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
diff -Nupr linux-source-3.2.0/net/netfilter/ipvs/ip_vs_proto_tcp.c linux-source-3.2.0-patched/net/netfilter/ipvs/ip_vs_proto_tcp.c
--- linux-source-3.2.0/net/netfilter/ipvs/ip_vs_proto_tcp.c	2012-01-04 23:55:44.000000000 +0000
+++ linux-source-3.2.0-patched/net/netfilter/ipvs/ip_vs_proto_tcp.c	2013-05-17 14:44:58.000000000 +0100
@@ -49,7 +49,7 @@ tcp_conn_schedule(int af, struct sk_buff
 	}
 	net = skb_net(skb);
 	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
-	if (th->syn &&
+	if ((sysctl_sloppy_tcp(net_ipvs(net)) || th->syn) &&
 	    (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
 				     &iph.daddr, th->dest))) {
 		int ignored;
diff -Nupr linux-source-3.2.0/net/netfilter/ipvs/ip_vs_sh.c linux-source-3.2.0-patched/net/netfilter/ipvs/ip_vs_sh.c
--- linux-source-3.2.0/net/netfilter/ipvs/ip_vs_sh.c	2012-01-04 23:55:44.000000000 +0000
+++ linux-source-3.2.0-patched/net/netfilter/ipvs/ip_vs_sh.c	2013-05-17 14:44:58.000000000 +0100
@@ -65,7 +65,7 @@ struct ip_vs_sh_bucket {
 /*
  *	Returns hash value for IPVS SH entry
  */
-static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, int offset)
 {
 	__be32 addr_fold = addr->ip;
 
@@ -74,7 +74,7 @@ static inline unsigned ip_vs_sh_hashkey(
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
+	return (offset + ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
 }
 
 
@@ -83,9 +83,9 @@ static inline unsigned ip_vs_sh_hashkey(
  */
 static inline struct ip_vs_dest *
 ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl,
-	     const union nf_inet_addr *addr)
+	     const union nf_inet_addr *addr, int offset)
 {
-	return (tbl[ip_vs_sh_hashkey(af, addr)]).dest;
+	return (tbl[ip_vs_sh_hashkey(af, addr, offset)]).dest;
 }
 
 
@@ -211,14 +211,36 @@ ip_vs_sh_schedule(struct ip_vs_service *
 	struct ip_vs_dest *dest;
 	struct ip_vs_sh_bucket *tbl;
 	struct ip_vs_iphdr iph;
+	int offset;
+	int found;
 
 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 
 	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
 
 	tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
-	dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr);
-	if (!dest
+	if(sysctl_sh_rebalance(net_ipvs(svc->net))) {
+		found = 0;
+		for(offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+			dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr, offset);
+			if(!dest
+			   || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+			   || atomic_read(&dest->weight) <= 0
+			   || is_overloaded(dest)) {
+				IP_VS_DBG_BUF(6, "SH: Selected unavailable server %s:%d, retrying with offset %d\n",
+		      			      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+					      ntohs(dest->port), offset);
+			} else {
+				found = 1;
+				break;
+			}
+		}
+	} else {
+		dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr, 0);
+		found = 1;
+	}
+	if (!found
+	    || !dest
 	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
 	    || atomic_read(&dest->weight) <= 0
 	    || is_overloaded(dest)) {
diff -Nupr linux-source-3.2.0/net/netfilter/ipvs/ip_vs_shp.c linux-source-3.2.0-patched/net/netfilter/ipvs/ip_vs_shp.c
--- linux-source-3.2.0/net/netfilter/ipvs/ip_vs_shp.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-source-3.2.0-patched/net/netfilter/ipvs/ip_vs_shp.c	2013-05-17 14:45:09.000000000 +0100
@@ -0,0 +1,299 @@
+/*
+ * IPVS:        SHP scheduling module
+ *
+ * Authors:     Alexander Frolkin <avf@eldamar.org.uk>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * This is simply the SH module but hacked to also include the source port
+ * in the hash calculation.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
+/*
+ *      IPVS SHP bucket
+ */
+struct ip_vs_shp_bucket {
+	struct ip_vs_dest       *dest;          /* real server (cache) */
+};
+
+/*
+ *     for IPVS SHP entry hash table
+ */
+#ifndef CONFIG_IP_VS_SHP_TAB_BITS
+#define CONFIG_IP_VS_SHP_TAB_BITS        8
+#endif
+#define IP_VS_SHP_TAB_BITS               CONFIG_IP_VS_SHP_TAB_BITS
+#define IP_VS_SHP_TAB_SIZE               (1 << IP_VS_SHP_TAB_BITS)
+#define IP_VS_SHP_TAB_MASK               (IP_VS_SHP_TAB_SIZE - 1)
+
+
+/*
+ *	Returns hash value for IPVS SHP entry
+ */
+static inline unsigned ip_vs_shp_hashkey(int af, const union nf_inet_addr *addr, unsigned int port, int offset)
+{
+	__be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		addr_fold = addr->ip6[0]^addr->ip6[1]^
+			    addr->ip6[2]^addr->ip6[3];
+#endif
+	return (offset + (port + ntohl(addr_fold))*2654435761UL) & IP_VS_SHP_TAB_MASK;
+}
+
+
+/*
+ *      Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_shp_get(int af, struct ip_vs_shp_bucket *tbl,
+	     const union nf_inet_addr *addr, unsigned int port, int offset)
+{
+	return (tbl[ip_vs_shp_hashkey(af, addr, port, offset)]).dest;
+}
+
+
+/*
+ *      Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_shp_assign(struct ip_vs_shp_bucket *tbl, struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_shp_bucket *b;
+	struct list_head *p;
+	struct ip_vs_dest *dest;
+
+	b = tbl;
+	p = &svc->destinations;
+	for (i=0; i<IP_VS_SHP_TAB_SIZE; i++) {
+		if (list_empty(p)) {
+			b->dest = NULL;
+		} else {
+			if (p == &svc->destinations)
+				p = p->next;
+
+			dest = list_entry(p, struct ip_vs_dest, n_list);
+			atomic_inc(&dest->refcnt);
+			b->dest = dest;
+
+			p = p->next;
+		}
+		b++;
+	}
+	return 0;
+}
+
+
+/*
+ *      Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_shp_flush(struct ip_vs_shp_bucket *tbl)
+{
+	int i;
+	struct ip_vs_shp_bucket *b;
+
+	b = tbl;
+	for (i=0; i<IP_VS_SHP_TAB_SIZE; i++) {
+		if (b->dest) {
+			atomic_dec(&b->dest->refcnt);
+			b->dest = NULL;
+		}
+		b++;
+	}
+}
+
+
+static int ip_vs_shp_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_shp_bucket *tbl;
+
+	/* allocate the SHP table for this service */
+	tbl = kmalloc(sizeof(struct ip_vs_shp_bucket)*IP_VS_SHP_TAB_SIZE,
+		      GFP_ATOMIC);
+	if (tbl == NULL)
+		return -ENOMEM;
+
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "SHP hash table (memory=%Zdbytes) allocated for "
+		  "current service\n",
+		  sizeof(struct ip_vs_shp_bucket)*IP_VS_SHP_TAB_SIZE);
+
+	/* assign the hash buckets with the updated service */
+	ip_vs_shp_assign(tbl, svc);
+
+	return 0;
+}
+
+
+static int ip_vs_shp_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_shp_bucket *tbl = svc->sched_data;
+
+	/* got to clean up hash buckets here */
+	ip_vs_shp_flush(tbl);
+
+	/* release the table itself */
+	kfree(svc->sched_data);
+	IP_VS_DBG(6, "SHP hash table (memory=%Zdbytes) released\n",
+		  sizeof(struct ip_vs_shp_bucket)*IP_VS_SHP_TAB_SIZE);
+
+	return 0;
+}
+
+
+static int ip_vs_shp_update_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_shp_bucket *tbl = svc->sched_data;
+
+	/* got to clean up hash buckets here */
+	ip_vs_shp_flush(tbl);
+
+	/* assign the hash buckets with the updated service */
+	ip_vs_shp_assign(tbl, svc);
+
+	return 0;
+}
+
+
+/*
+ *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
+ *      consider that the server is overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ *      Source Hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_shp_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_shp_bucket *tbl;
+	struct ip_vs_iphdr iph;
+	struct tcphdr _tcph, *th;
+	struct udphdr _udph, *uh;
+	sctp_sctphdr_t _sctph, *sh;
+	unsigned int port;
+	int offset;
+	int found;
+
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+
+	IP_VS_DBG(6, "ip_vs_shp_schedule(): Scheduling...\n");
+
+	switch(svc->protocol) {
+		case IPPROTO_TCP:
+			th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
+			port = ntohs(th->source);
+			break;
+		case IPPROTO_UDP:
+			uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
+			port = ntohs(uh->source);
+			break;
+		case IPPROTO_SCTP:
+			sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph);
+			port = ntohs(sh->source);
+			break;
+		default:
+			port = 0;
+	}
+
+	tbl = (struct ip_vs_shp_bucket *)svc->sched_data;
+	if(sysctl_sh_rebalance(net_ipvs(svc->net))) {
+		found = 0;
+		for(offset = 0; offset < IP_VS_SHP_TAB_SIZE; offset++) {
+			dest = ip_vs_shp_get(svc->af, tbl, &iph.saddr, port, offset);
+			if(!dest
+			   || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+			   || atomic_read(&dest->weight) <= 0
+			   || is_overloaded(dest)) {
+				IP_VS_DBG_BUF(6, "SHP: Selected unavailable server %s:%d, retrying with offset %d\n",
+		      			      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+					      ntohs(dest->port), offset);
+			} else {
+				found = 1;
+				break;
+			}
+		}
+	} else {
+		dest = ip_vs_shp_get(svc->af, tbl, &iph.saddr, port, 0);
+		found = 1;
+	}
+	if (!found
+	    || !dest
+	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+	    || atomic_read(&dest->weight) <= 0
+	    || is_overloaded(dest)) {
+		ip_vs_scheduler_err(svc, "no destination available");
+		return NULL;
+	}
+
+	IP_VS_DBG_BUF(6, "SHP: source IP address %s --> server %s:%d\n",
+		      IP_VS_DBG_ADDR(svc->af, &iph.saddr),
+		      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+		      ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS SHP Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_shp_scheduler =
+{
+	.name =			"shp",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list	 =		LIST_HEAD_INIT(ip_vs_shp_scheduler.n_list),
+	.init_service =		ip_vs_shp_init_svc,
+	.done_service =		ip_vs_shp_done_svc,
+	.update_service =	ip_vs_shp_update_svc,
+	.schedule =		ip_vs_shp_schedule,
+};
+
+
+static int __init ip_vs_shp_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_shp_scheduler);
+}
+
+
+static void __exit ip_vs_shp_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_shp_scheduler);
+}
+
+
+module_init(ip_vs_shp_init);
+module_exit(ip_vs_shp_cleanup);
+MODULE_LICENSE("GPL");

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-05-24 12:09 [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling Alexander Frolkin
@ 2013-05-24 15:05 ` Julian Anastasov
  2013-05-24 15:14   ` Alexander Frolkin
  0 siblings, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-05-24 15:05 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Fri, 24 May 2013, Alexander Frolkin wrote:

> Hi,
> 
> I've added some features that I needed for our purposes to LVS, and I'd
> like to submit a patch, in case they might be useful to other users.
> 
> The patch is against the Ubuntu 12.04 kernel (3.2.0).

	I assume this is not intended for kernel inclusion :)

> The patch adds three features:
> 
> 1.  Sloppy TCP handling.  When enabled (net.ipv4.vs.sloppy_tcp=1,
> default 0), it allows IPVS to create a TCP connection state on any TCP
> packet, not just a SYN.  This allows connections to fail over to a
> different director (our plan is to run multiple directors active-active)
> without being reset.

	For most of the connections the backup server
should get sync messages in time, so it should be able
to find the existing connection in the correct state, usually
established. By using persistence the chances of
hitting the right real server on the backup are increased.

> 2. SH rebalancing.  When enabled (net.ipv4.vs.sh_rebalance=1, default
> 0), virtual servers using SH (or SHP --- see below) scheduling will
> retry the realserver selection if the realserver selected the first time
> round is unavailable (e.g., because it has weight 0).  This allows
> realservers to be paused on SH(P) virtual servers by setting the weight
> to 0.

	The SH authors decided to change the mapping of
destinations in the SH table only when a dest is added or
removed, but not when a weight is set to 0. It is better not to
complicate the SH scheduler, especially when more schedulers
can be created.
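
	For reference, the stock ip_vs_sh_schedule() (visible in the
context lines of your patch) simply gives up when the hashed bucket
points to an unavailable or weight-0 dest, roughly:

	dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr);
	if (!dest
	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
	    || atomic_read(&dest->weight) <= 0
	    || is_overloaded(dest)) {
		ip_vs_scheduler_err(svc, "no destination available");
		return NULL;
	}

so the client connection is refused instead of being rehashed to
another available server.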

> 3. SHP (SH + port) scheduler.  This is a clone of the SH code, but
> hacked to also take the port number (TCP, UDP, SCTP) into account.  This
> may seem no different to round-robin, but in our scenario, if a
> connection is failed over to a different director, this guarantees that
> it will continue being forwarded to the same realserver.

	Is it a scenario where one client IP/net creates
so many connections that it can influence the balancing, so
that persistence can cause imbalance? Isn't persistence
suitable? IIRC, it can do failover when expire_quiescent_template
is enabled:

Documentation/networking/ipvs-sysctl.txt

Regards

--
Julian Anastasov <ja@ssi.bg>

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-05-24 15:05 ` Julian Anastasov
@ 2013-05-24 15:14   ` Alexander Frolkin
  2013-05-24 16:18     ` Aleksey Chudov
  2013-05-27 21:11     ` [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling Julian Anastasov
  0 siblings, 2 replies; 52+ messages in thread
From: Alexander Frolkin @ 2013-05-24 15:14 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> > 1.  Sloppy TCP handling.  When enabled (net.ipv4.vs.sloppy_tcp=1,
> > default 0), it allows IPVS to create a TCP connection state on any TCP
> > packet, not just a SYN.  This allows connections to fail over to a
> > different director (our plan is to run multiple directors active-active)
> > without being reset.
> 	For most of the connections the backup server
> should get sync messages in time, so it should be able
> to find the existing connection in the correct state, usually
> established. By using persistence the chances of
> hitting the right real server on the backup are increased.

We have a number of directors in active-active mode, we don't have any
kind of state sync.  My understanding is that the state sync daemon only
supports an active-backup configuration.  In our configuration it would
have to be sending out updates and receiving updates from other servers
at the same time.  Even if this works, we don't want a connection on one
server creating state on all the servers in the cluster, because that
would be a waste of memory most of the time.  Also, state sync
introduces a race condition which doesn't exist without state sync.

> 	The SH authors decided to change the mapping of
> destinations in the SH table only when a dest is added or
> removed, but not when a weight is set to 0. It is better not to
> complicate the SH scheduler, especially when more schedulers
> can be created.

Fair enough.  So if I create a new scheduler instead of hacking SH,
would that be more likely to be accepted?

> > 3. SHP (SH + port) scheduler.  This is a clone of the SH code, but
> > hacked to also take the port number (TCP, UDP, SCTP) into account.  This
> > may seem no different to round-robin, but in our scenario, if a
> > connection is failed over to a different director, this guarantees that
> > it will continue being forwarded to the same realserver.
> 	Is it a scenario where one client IP/net creates
> so many connections that it can influence the balancing, so
> that persistence can cause imbalance? Isn't persistence
> suitable? IIRC, it can do failover when expire_quiescent_template
> is enabled:

It's not about imbalance, it's just about running a number of
independent directors, with no state sync, but with the ability to fail
over from one to another.


Alex


* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-05-24 15:14   ` Alexander Frolkin
@ 2013-05-24 16:18     ` Aleksey Chudov
  2013-05-27 21:31       ` Julian Anastasov
  2013-05-27 21:11     ` [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling Julian Anastasov
  1 sibling, 1 reply; 52+ messages in thread
From: Aleksey Chudov @ 2013-05-24 16:18 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: Julian Anastasov, lvs-devel

On 24.05.2013 18:14, Alexander Frolkin wrote:
> Hi,
>
>>> 1.  Sloppy TCP handling.  When enabled (net.ipv4.vs.sloppy_tcp=1,
>>> default 0), it allows IPVS to create a TCP connection state on any TCP
>>> packet, not just a SYN.  This allows connections to fail over to a
>>> different director (our plan is to run multiple directors active-active)
>>> without being reset.
>> 	For most of the connections the backup server
>> should get sync messages in time, so it should be able
>> to find the existing connection in the correct state, usually
>> established. By using persistence the chances of
>> hitting the right real server on the backup are increased.
> We have a number of directors in active-active mode, we don't have any
> kind of state sync.  My understanding is that the state sync daemon only
> supports an active-backup configuration.  In our configuration it would
> have to be sending out updates and receiving updates from other servers
> at the same time.  Even if this works, we don't want a connection on one
> server creating state on all the servers in the cluster, because that
> would be a waste of memory most of the time.  Also, state sync
> introduces a race condition which doesn't exist without state sync.

I'm sorry for interrupting your conversation. Actually, the sync
daemon sends updates via multicast, so it is enough to run two
processes on each server: one in Master mode and a second in Backup
mode. In theory it is possible to synchronize a large number of
servers. In practice, in our experience, it is very dangerous to
synchronize a 16-node LVS cluster: during a typical SYN flood all
servers will run out of memory unless each node has 512GB of RAM.
For example, we observed more than 30 GB of memory consumed on each
server during a SYN flood (without connection sync). Unfortunately,
syncing more than three or four servers with each other is very
expensive.

> It's not about imbalance, it's just about running a number of
> independent directors, with no state sync, but with the ability to fail
> over from one to another.
>

May be better to modify the sync algorithm to synchronize only 
persistence templates for these specific cases? Is it possible at all?


Aleksey

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-05-24 15:14   ` Alexander Frolkin
  2013-05-24 16:18     ` Aleksey Chudov
@ 2013-05-27 21:11     ` Julian Anastasov
  2013-06-07  8:12       ` Alexander Frolkin
  2013-06-10 15:12       ` Alexander Frolkin
  1 sibling, 2 replies; 52+ messages in thread
From: Julian Anastasov @ 2013-05-27 21:11 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Fri, 24 May 2013, Alexander Frolkin wrote:

> Hi,
> 
> > > 1.  Sloppy TCP handling.  When enabled (net.ipv4.vs.sloppy_tcp=1,
> > > default 0), it allows IPVS to create a TCP connection state on any TCP
> > > packet, not just a SYN.  This allows connections to fail over to a
> > > different director (our plan is to run multiple directors active-active)
> > > without being reset.
> > 	For most of the connections the backup server
> > should get sync messages in time, so it should be able
> > to find the existing connection in the correct state, usually
> > established. By using persistence the chances of
> > hitting the right real server on the backup are increased.
> 
> We have a number of directors in active-active mode, we don't have any
> kind of state sync.  My understanding is that the state sync daemon only
> supports an active-backup configuration.  In our configuration it would
> have to be sending out updates and receiving updates from other servers
> at the same time.  Even if this works, we don't want a connection on one
> server creating state on all the servers in the cluster, because that
> would be a waste of memory most of the time.  Also, state sync
> introduces a race condition which doesn't exist without state sync.

	ok, I have to think for a few more days about the
effects of sloppy_tcp. May be this logic is useful
for SCTP as well.

> > 	The SH authors decided to change the mapping of
> > destinations in the SH table only when a dest is added or
> > removed, but not when a weight is set to 0. It is better not to
> > complicate the SH scheduler, especially when more schedulers
> > can be created.
> 
> Fair enough.  So if I create a new scheduler instead of hacking SH,
> would that be more likely to be accepted?

	OTOH, the difference is very small: the port.
The problem is that we add only global controls, it
would be good if we can configure such parameters
per virtual service:

- use port in source hash
- use source netmask for source address - similar to the
netmask used by persistence

	Not sure what solution is better. May be we
can add some IP_VS_SVC_F_SCHED1..N definitions to
parametrize the schedulers. As for the netmask, one
variant is to reuse the persistent mask/plen. For
example:

IP_VS_SVC_F_OPEN (or other name): sloppy_tcp/sloppy_sctp

	The problem here is that we call ip_vs_service_find()
after checking th->syn. So, may be it is better to have
global sysctl flag here, as in your patch.

IP_VS_SVC_F_SCHED1: scheduler flag 1 (SH: fallback to other dest if 
weight=0), i.e. the sh_rebalance flag

IP_VS_SVC_F_SCHED2: scheduler flag 2 (SH: add port in hash)

IP_VS_SVC_F_SCHED3: scheduler flag 3 (SH: consider mask/plen)

	Note that latest SH version supports weights and
RCU, you have to consider it for next patch versions.

	sh_rebalance can become sh_fallback if not
done with IP_VS_SVC_F_SCHED1. May be SHP is not needed
if SH is parametrized.
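
	As a rough sketch (every name and bit value below is only for
illustration, nothing is defined yet), a parametrized SH could test
such per-service flags like this:

	/* hypothetical flag bits, for illustration only */
	#define IP_VS_SVC_F_SCHED1	0x0008	/* SH: fallback if weight 0 */
	#define IP_VS_SVC_F_SCHED2	0x0010	/* SH: include port in hash */
	#define IP_VS_SVC_F_SCHED3	0x0020	/* SH: consider mask/plen */

	static inline bool ip_vs_sh_use_port(struct ip_vs_service *svc)
	{
		return !!(svc->flags & IP_VS_SVC_F_SCHED2);
	}

	static inline bool ip_vs_sh_fallback(struct ip_vs_service *svc)
	{
		return !!(svc->flags & IP_VS_SVC_F_SCHED1);
	}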

	Comments?

	Also, you have to check the coding style rules:
Documentation/CodingStyle

- there are lines above 80 chars
- '||' must not be first in line

# scripts/checkpatch.pl /tmp/ocado-ipvs.patch
gives more warnings.

Regards

--
Julian Anastasov <ja@ssi.bg>

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-05-24 16:18     ` Aleksey Chudov
@ 2013-05-27 21:31       ` Julian Anastasov
  2013-05-28 13:41         ` Aleksey Chudov
  0 siblings, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-05-27 21:31 UTC (permalink / raw)
  To: Aleksey Chudov; +Cc: Alexander Frolkin, lvs-devel


	Hello,

On Fri, 24 May 2013, Aleksey Chudov wrote:

> May be better to modify the sync algorithm to synchronize only persistence
> templates for these specific cases? Is it possible at all?

	May be, with some flag and also sloppy_tcp. Then the
parametrized SH with netmask will do the same - we can avoid
the sync messages. Of course, with SH there is more risk
for imbalance and it can be exploited. Also SH requires
equal configuration for the real servers.

Regards

--
Julian Anastasov <ja@ssi.bg>

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-05-27 21:31       ` Julian Anastasov
@ 2013-05-28 13:41         ` Aleksey Chudov
  2013-05-30  6:37           ` Julian Anastasov
  2013-06-19  9:03           ` Julian Anastasov
  0 siblings, 2 replies; 52+ messages in thread
From: Aleksey Chudov @ 2013-05-28 13:41 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

On 28.05.2013 0:31, Julian Anastasov wrote:
> On Fri, 24 May 2013, Aleksey Chudov wrote:
>
>> May be better to modify the sync algorithm to synchronize only persistence
>> templates for these specific cases? Is it possible at all?
> 	May be, with some flag and also sloppy_tcp. Then the
> parametrized SH with netmask will do the same - we can avoid
> the sync messages. Of course, with SH there is more risk
> for imbalance and it can be exploited. Also SH requires
> equal configuration for the real servers.
>

Currently we are using multiple active / standby server pairs and
synchronizing them with each other, so half of the servers are
constantly doing nothing. We are looking into how to use all the
servers in active / active mode while maintaining high availability
and session persistence if one of the load balancers fails.
Unfortunately the proposed stateless scheme with the SH scheduler and
Sloppy TCP is not suitable for us, since we are using the WLC and WRR
schedulers. As you mentioned, the SH scheduler has several drawbacks
because of which we cannot use it. Also, we cannot synchronize all
connections between all servers, since it would require a lot of
memory and the search in such a huge connection table is likely to be
slower.

But we can solve the sync problem in the same way as conntrackd does,
which allows filtering by flow state. The easiest option is to filter
only on the IP_VS_CONN_F_TEMPLATE state. Thus, if all the load
balancers sync persistence templates with each other, then even if one
of the load balancers fails most users will remain on the same real
servers. Of course, without the full sync clients must re-establish
their TCP connections, but for this case we can use Sloppy TCP to
create a TCP connection state on any TCP packet.
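
A rough sketch of the filtering idea (purely illustrative: the helper
and its "templates only" switch are made up, only IP_VS_CONN_F_TEMPLATE
is an existing connection flag):

static bool ip_vs_sync_conn_wanted(const struct ip_vs_conn *cp,
				   int persist_only)
{
	/* with the hypothetical "templates only" switch enabled,
	 * skip everything except persistence templates */
	if (persist_only)
		return cp->flags & IP_VS_CONN_F_TEMPLATE;
	return true;
}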

What do you think of this idea?



Regards,
Aleksey

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-05-28 13:41         ` Aleksey Chudov
@ 2013-05-30  6:37           ` Julian Anastasov
  2013-06-07  7:53             ` Alexander Frolkin
  2013-06-19  9:03           ` Julian Anastasov
  1 sibling, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-05-30  6:37 UTC (permalink / raw)
  To: Aleksey Chudov; +Cc: lvs-devel


	Hello,

On Tue, 28 May 2013, Aleksey Chudov wrote:

> On 28.05.2013 0:31, Julian Anastasov wrote:
> > On Fri, 24 May 2013, Aleksey Chudov wrote:
> > 
> > > May be better to modify the sync algorithm to synchronize only persistence
> > > templates for these specific cases? Is it possible at all?
> > 	May be, with some flag and also sloppy_tcp. Then the
> > parametrized SH with netmask will do the same - we can avoid
> > the sync messages. Of course, with SH there is more risk
> > for imbalance and it can be exploited. Also SH requires
> > equal configuration for the real servers.
> > 
> 
> Currently we are using multiple active / standby server pairs and
> synchronizing them with each other, so half of the servers are
> constantly doing nothing. We are looking into how to use all the
> servers in active / active mode while maintaining high availability
> and session persistence if one of the load balancers fails.
> Unfortunately the proposed stateless scheme with the SH scheduler and
> Sloppy TCP is not suitable for us, since we are using the WLC and WRR
> schedulers. As you mentioned, the SH scheduler has several drawbacks
> because of which we cannot use it. Also, we cannot synchronize all
> connections between all servers, since it would require a lot of
> memory and the search in such a huge connection table is likely to be
> slower.
> 
> But we can solve the sync problem in the same way as conntrackd does,
> which allows filtering by flow state. The easiest option is to filter
> only on the IP_VS_CONN_F_TEMPLATE state. Thus, if all the load
> balancers sync persistence templates with each other, then even if one
> of the load balancers fails most users will remain on the same real
> servers. Of course, without the full sync clients must re-establish
> their TCP connections, but for this case we can use Sloppy TCP to
> create a TCP connection state on any TCP packet.
> 
> What do you think of this idea?

	Agreed. If we don't find big problems with the
Sloppy TCP mode, the only remaining problem will be what happens
with netfilter conntracks. But it is a problem even now: even
if we create a synced conn in the backup, we do not provide any
information to netfilter about such a connection, and it
would be expected to see packets in the INVALID state.

	Looking at the code I don't see problems with
enabling Sloppy TCP mode: ip_vs_out is called
before ip_vs_in in every hook, so there is no chance
of creating a connection in the wrong direction. Of course,
we have to do some tests, especially on the loopback device.

	May be the patch for Sloppy TCP mode should be
extended to assume that old state is sSR if packet that
creates the connection has no RST flag. This will allow
connection to enter sES state if need, it will not stay
always in sCL state. As for the initial check, it should be:

	if ((sysctl_sloppy_tcp(net_ipvs(net)) || th->syn) &&
+	    !th->rst &&

	Also, one can enable Sloppy TCP mode for short
time during switchover, it should be safer to run with
disabled mode.

Regards

--
Julian Anastasov <ja@ssi.bg>

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-05-30  6:37           ` Julian Anastasov
@ 2013-06-07  7:53             ` Alexander Frolkin
  0 siblings, 0 replies; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-07  7:53 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: Aleksey Chudov, lvs-devel

Hi,

> 	May be the patch for Sloppy TCP mode should be
> extended to assume that old state is sSR if packet that
> creates the connection has no RST flag. This will allow
> connection to enter sES state if need, it will not stay
> always in sCL state.

I'll have to read the code a bit more to completely understand that, but
it seems to make sense!

> As for the initial check, it should be:
> 	if ((sysctl_sloppy_tcp(net_ipvs(net)) || th->syn) &&
> +	    !th->rst &&

Makes sense.

> 	Also, one can enable Sloppy TCP mode for short
> time during switchover, it should be safer to run with
> disabled mode.

The problem is that switchover won't necessarily be controlled if a
server fails.


Alex


* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-05-27 21:11     ` [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling Julian Anastasov
@ 2013-06-07  8:12       ` Alexander Frolkin
  2013-06-10 19:31         ` Julian Anastasov
  2013-06-10 15:12       ` Alexander Frolkin
  1 sibling, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-07  8:12 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> 	OTOH, the difference is very small: the port.
> The problem is that we add only global controls, it
> would be good if we can configure such parameters
> per virtual service:
> - use port in source hash

Well, this one can be configured per service by changing the scheduler.
Or are you concerned about the fact that the code for SHP and SH is
essentially the same and should be merged?

> 	The problem here is that we call ip_vs_service_find()
> after checking th->syn. So, may be it is better to have
> global sysctl flag here, as in your patch.

I don't think a global sysctl is a problem for sloppy TCP (SCTP).  I
think it's unlikely that you'll want to enable it on one service but not
on another.

> IP_VS_SVC_F_SCHED1: scheduler flag 1 (SH: fallback to other dest if 
> weight=0), i.e. the sh_rebalance flag
> > IP_VS_SVC_F_SCHED2: scheduler flag 2 (SH: add port in hash)
> > IP_VS_SVC_F_SCHED3: scheduler flag 3 (SH: consider mask/plen)

This isn't a bad idea, and it will probably find other uses, too.

Is there a reason why the SH fallback behaviour shouldn't be default?
That is, is there a reason why the current behaviour (client connection
gets reset if it is directed to a realserver with weight 0) is
desirable?

> 	Note that latest SH version supports weights and
> RCU, you have to consider it for next patch versions.

I'll take a look at the latest version.


Alex


* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-05-27 21:11     ` [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling Julian Anastasov
  2013-06-07  8:12       ` Alexander Frolkin
@ 2013-06-10 15:12       ` Alexander Frolkin
  2013-06-10 16:03         ` Alexander Frolkin
  2013-06-10 20:52         ` Julian Anastasov
  1 sibling, 2 replies; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-10 15:12 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

[-- Attachment #1: Type: text/plain, Size: 458 bytes --]

Hi,

Attached is a patch for sloppy TCP and SCTP against the upstream kernel.

checkpatch.pl throws up errors, but they refer to stuff that was there
before, not my changes.

I've added a bit of code to set the initial state for a sloppy TCP
connection.  I have a template (in the patch) for a similar bit of code
for SCTP, but I'm hoping that someone can help me work out what the
initial state should be, since I know next to nothing about SCTP...


Alex


[-- Attachment #2: sloppy.patch --]
[-- Type: text/plain, Size: 6204 bytes --]

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4c062cc..49a93bb 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1002,6 +1002,8 @@ struct netns_ipvs {
 	int			sysctl_sync_sock_size;
 	int			sysctl_cache_bypass;
 	int			sysctl_expire_nodest_conn;
+	int			sysctl_sloppy_tcp;
+	int			sysctl_sloppy_sctp;
 	int			sysctl_expire_quiescent_template;
 	int			sysctl_sync_threshold[2];
 	unsigned int		sysctl_sync_refresh_period;
@@ -1044,6 +1046,8 @@ struct netns_ipvs {
 #define DEFAULT_SYNC_THRESHOLD	3
 #define DEFAULT_SYNC_PERIOD	50
 #define DEFAULT_SYNC_VER	1
+#define DEFAULT_SLOPPY_TCP	0
+#define DEFAULT_SLOPPY_SCTP	0
 #define DEFAULT_SYNC_REFRESH_PERIOD	(0U * HZ)
 #define DEFAULT_SYNC_RETRIES		0
 #define IPVS_SYNC_WAKEUP_RATE	8
@@ -1080,6 +1084,16 @@ static inline int sysctl_sync_ver(struct netns_ipvs *ipvs)
 	return ipvs->sysctl_sync_ver;
 }
 
+static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs)
+{
+	return ipvs->sysctl_sloppy_tcp;
+}
+
+static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs)
+{
+	return ipvs->sysctl_sloppy_sctp;
+}
+
 static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
 {
 	return ACCESS_ONCE(ipvs->sysctl_sync_ports);
@@ -1133,6 +1147,16 @@ static inline int sysctl_sync_ver(struct netns_ipvs *ipvs)
 	return DEFAULT_SYNC_VER;
 }
 
+static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs)
+{
+	return DEFAULT_SLOPPY_TCP;
+}
+
+static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs)
+{
+	return DEFAULT_SLOPPY_SCTP;
+}
+
 static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
 {
 	return 1;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5b142fb..1e68c4f 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1739,6 +1739,18 @@ static struct ctl_table vs_vars[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "sloppy_tcp",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sloppy_sctp",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "expire_quiescent_template",
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
@@ -3722,6 +3734,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
 	tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
 	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
 	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+	tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
+	tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
 	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
 	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
 	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 8646488..c7f6e98 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -15,6 +15,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 {
 	struct net *net;
 	struct ip_vs_service *svc;
+	struct netns_ipvs *ipvs;
 	sctp_chunkhdr_t _schunkh, *sch;
 	sctp_sctphdr_t *sh, _sctph;
 
@@ -27,13 +28,15 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 	if (sch == NULL)
 		return 0;
 	net = skb_net(skb);
+	ipvs = net_ipvs(net);
 	rcu_read_lock();
-	if ((sch->type == SCTP_CID_INIT) &&
+	if ((sysctl_sloppy_sctp(ipvs) ||
+	    (sch->type == SCTP_CID_INIT)) &&
 	    (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
 				      &iph->daddr, sh->dest))) {
 		int ignored;
 
-		if (ip_vs_todrop(net_ipvs(net))) {
+		if (ip_vs_todrop(ipvs)) {
 			/*
 			 * It seems that we are very loaded.
 			 * We have to drop this packet :(
@@ -55,6 +58,21 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 			rcu_read_unlock();
 			return 0;
 		}
+
+		/* If we create the state in the middle of a conversation
+		 * (sloppy SCTP mode), then set the initial state to ...?
+		 */
+		if (sch->type != SCTP_CID_INIT) {
+			/* (*cpp)->state = IP_VS_SCTP_S_...?; */
+
+			IP_VS_DBG_BUF(8, "%s  %s:%d->"
+				"%s:%d new sloppy state\n"
+				pd->pp->name,
+				IP_VS_DBG_ADDR((*cpp)->af, &(*cpp)->daddr),
+				ntohs((*cpp)->dport),
+				IP_VS_DBG_ADDR((*cpp)->af, &(*cpp)->caddr),
+				ntohs((*cpp)->cport));
+		}
 	}
 	rcu_read_unlock();
 	/* NF_ACCEPT */
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 50a1594..e75bfff 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -39,6 +39,7 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 	struct net *net;
 	struct ip_vs_service *svc;
 	struct tcphdr _tcph, *th;
+	struct netns_ipvs *ipvs;
 
 	th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
 	if (th == NULL) {
@@ -46,14 +47,15 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		return 0;
 	}
 	net = skb_net(skb);
+	ipvs = net_ipvs(net);
 	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
 	rcu_read_lock();
-	if (th->syn &&
+	if ((sysctl_sloppy_tcp(ipvs) || th->syn) && !th->rst &&
 	    (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
 				      &iph->daddr, th->dest))) {
 		int ignored;
 
-		if (ip_vs_todrop(net_ipvs(net))) {
+		if (ip_vs_todrop(ipvs)) {
 			/*
 			 * It seems that we are very loaded.
 			 * We have to drop this packet :(
@@ -76,6 +78,25 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 			rcu_read_unlock();
 			return 0;
 		}
+
+		/* If we create the state in the middle of a conversation
+		 * (sloppy TCP mode), then set the initial state to sSR
+		 */
+		if (!th->syn) {
+			(*cpp)->state = IP_VS_TCP_S_SYN_RECV;
+
+			IP_VS_DBG_BUF(8, "%s [%c%c%c%c] %s:%d->"
+				"%s:%d: new sloppy state\n",
+				pd->pp->name,
+				th->syn ? 'S' : '.',
+				th->fin ? 'F' : '.',
+				th->ack ? 'A' : '.',
+				th->rst ? 'R' : '.',
+				IP_VS_DBG_ADDR((*cpp)->af, &(*cpp)->daddr),
+				ntohs((*cpp)->dport),
+				IP_VS_DBG_ADDR((*cpp)->af, &(*cpp)->caddr),
+				ntohs((*cpp)->cport));
+		}
 	}
 	rcu_read_unlock();
 	/* NF_ACCEPT */

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-10 15:12       ` Alexander Frolkin
@ 2013-06-10 16:03         ` Alexander Frolkin
  2013-06-10 20:52         ` Julian Anastasov
  1 sibling, 0 replies; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-10 16:03 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

[-- Attachment #1: Type: text/plain, Size: 117 bytes --]

Hi,

> Attached is a patch for sloppy TCP and SCTP against the upstream kernel.

Oops, fixed patch attached.


Alex


[-- Attachment #2: sloppy.patch --]
[-- Type: text/plain, Size: 6205 bytes --]

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4c062cc..49a93bb 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1002,6 +1002,8 @@ struct netns_ipvs {
 	int			sysctl_sync_sock_size;
 	int			sysctl_cache_bypass;
 	int			sysctl_expire_nodest_conn;
+	int			sysctl_sloppy_tcp;
+	int			sysctl_sloppy_sctp;
 	int			sysctl_expire_quiescent_template;
 	int			sysctl_sync_threshold[2];
 	unsigned int		sysctl_sync_refresh_period;
@@ -1044,6 +1046,8 @@ struct netns_ipvs {
 #define DEFAULT_SYNC_THRESHOLD	3
 #define DEFAULT_SYNC_PERIOD	50
 #define DEFAULT_SYNC_VER	1
+#define DEFAULT_SLOPPY_TCP	0
+#define DEFAULT_SLOPPY_SCTP	0
 #define DEFAULT_SYNC_REFRESH_PERIOD	(0U * HZ)
 #define DEFAULT_SYNC_RETRIES		0
 #define IPVS_SYNC_WAKEUP_RATE	8
@@ -1080,6 +1084,16 @@ static inline int sysctl_sync_ver(struct netns_ipvs *ipvs)
 	return ipvs->sysctl_sync_ver;
 }
 
+static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs)
+{
+	return ipvs->sysctl_sloppy_tcp;
+}
+
+static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs)
+{
+	return ipvs->sysctl_sloppy_sctp;
+}
+
 static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
 {
 	return ACCESS_ONCE(ipvs->sysctl_sync_ports);
@@ -1133,6 +1147,16 @@ static inline int sysctl_sync_ver(struct netns_ipvs *ipvs)
 	return DEFAULT_SYNC_VER;
 }
 
+static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs)
+{
+	return DEFAULT_SLOPPY_TCP;
+}
+
+static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs)
+{
+	return DEFAULT_SLOPPY_SCTP;
+}
+
 static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
 {
 	return 1;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5b142fb..1e68c4f 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1739,6 +1739,18 @@ static struct ctl_table vs_vars[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "sloppy_tcp",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sloppy_sctp",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "expire_quiescent_template",
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
@@ -3722,6 +3734,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
 	tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
 	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
 	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+	tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
+	tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
 	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
 	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
 	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 8646488..c7f6e98 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -15,6 +15,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 {
 	struct net *net;
 	struct ip_vs_service *svc;
+	struct netns_ipvs *ipvs;
 	sctp_chunkhdr_t _schunkh, *sch;
 	sctp_sctphdr_t *sh, _sctph;
 
@@ -27,13 +28,15 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 	if (sch == NULL)
 		return 0;
 	net = skb_net(skb);
+	ipvs = net_ipvs(net);
 	rcu_read_lock();
-	if ((sch->type == SCTP_CID_INIT) &&
+	if ((sysctl_sloppy_sctp(ipvs) ||
+	    (sch->type == SCTP_CID_INIT)) &&
 	    (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
 				      &iph->daddr, sh->dest))) {
 		int ignored;
 
-		if (ip_vs_todrop(net_ipvs(net))) {
+		if (ip_vs_todrop(ipvs)) {
 			/*
 			 * It seems that we are very loaded.
 			 * We have to drop this packet :(
@@ -55,6 +58,21 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 			rcu_read_unlock();
 			return 0;
 		}
+
+		/* If we create the state in the middle of a conversation
+		 * (sloppy SCTP mode), then set the initial state to ...?
+		 */
+		if (sch->type != SCTP_CID_INIT) {
+			/* (*cpp)->state = IP_VS_SCTP_S_...?; */
+
+			IP_VS_DBG_BUF(8, "%s  %s:%d->"
+				"%s:%d new sloppy state\n",
+				pd->pp->name,
+				IP_VS_DBG_ADDR((*cpp)->af, &(*cpp)->daddr),
+				ntohs((*cpp)->dport),
+				IP_VS_DBG_ADDR((*cpp)->af, &(*cpp)->caddr),
+				ntohs((*cpp)->cport));
+		}
 	}
 	rcu_read_unlock();
 	/* NF_ACCEPT */
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 50a1594..e75bfff 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -39,6 +39,7 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 	struct net *net;
 	struct ip_vs_service *svc;
 	struct tcphdr _tcph, *th;
+	struct netns_ipvs *ipvs;
 
 	th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
 	if (th == NULL) {
@@ -46,14 +47,15 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		return 0;
 	}
 	net = skb_net(skb);
+	ipvs = net_ipvs(net);
 	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
 	rcu_read_lock();
-	if (th->syn &&
+	if ((sysctl_sloppy_tcp(ipvs) || th->syn) && !th->rst &&
 	    (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
 				      &iph->daddr, th->dest))) {
 		int ignored;
 
-		if (ip_vs_todrop(net_ipvs(net))) {
+		if (ip_vs_todrop(ipvs)) {
 			/*
 			 * It seems that we are very loaded.
 			 * We have to drop this packet :(
@@ -76,6 +78,25 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 			rcu_read_unlock();
 			return 0;
 		}
+
+		/* If we create the state in the middle of a conversation
+		 * (sloppy TCP mode), then set the initial state to sSR
+		 */
+		if (!th->syn) {
+			(*cpp)->state = IP_VS_TCP_S_SYN_RECV;
+
+			IP_VS_DBG_BUF(8, "%s [%c%c%c%c] %s:%d->"
+				"%s:%d: new sloppy state\n",
+				pd->pp->name,
+				th->syn ? 'S' : '.',
+				th->fin ? 'F' : '.',
+				th->ack ? 'A' : '.',
+				th->rst ? 'R' : '.',
+				IP_VS_DBG_ADDR((*cpp)->af, &(*cpp)->daddr),
+				ntohs((*cpp)->dport),
+				IP_VS_DBG_ADDR((*cpp)->af, &(*cpp)->caddr),
+				ntohs((*cpp)->cport));
+		}
 	}
 	rcu_read_unlock();
 	/* NF_ACCEPT */

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-07  8:12       ` Alexander Frolkin
@ 2013-06-10 19:31         ` Julian Anastasov
  2013-06-11  8:38           ` Alexander Frolkin
  0 siblings, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-06-10 19:31 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Fri, 7 Jun 2013, Alexander Frolkin wrote:

> Hi,
> 
> > 	OTOH, the difference is very small: the port.
> > The problem is that we add only global controls, it
> > would be good if we can configure such parameters
> > per virtual service:
> > - use port in source hash
> 
> Well, this one can be configured per service by changing the scheduler.
> Or are you concerned about the fact that the code for SHP and SH is
> essentially the same and should be merged?

	Yes, if we find a way to configure SH, there is
no need for separate SHP.

> > 	The problem here is that we call ip_vs_service_find()
> > after checking th->syn. So, may be it is better to have
> > global sysctl flag here, as in your patch.
> 
> I don't think a global sysctl is a problem for sloppy TCP (SCTP).  I
> think it's unlikely that you'll want to enable it on one service but not
> on another.

	Agreed.

> > IP_VS_SVC_F_SCHED1: scheduler flag 1 (SH: fallback to other dest if 
> > weight=0), i.e. the sh_rebalance flag
> > IP_VS_SVC_F_SCHED2: scheduler flag 2 (SH: add port in hash)
> > IP_VS_SVC_F_SCHED3: scheduler flag 3 (SH: consider mask/plen)
> 
> This isn't a bad idea, and it will probably find other uses, too.
> 
> Is there a reason why the SH fallback behaviour shouldn't be default?
> That is, is there a reason why the current behaviour (client connection
> gets reset if it is directed to a realserver with weight 0) is
> desirable?

	I don't know, the authors preferred this behaviour.

> > 	Note that latest SH version supports weights and
> > RCU, you have to consider it for next patch versions.
> 
> I'll take a look at the latest version.

Regards

--
Julian Anastasov <ja@ssi.bg>

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-10 15:12       ` Alexander Frolkin
  2013-06-10 16:03         ` Alexander Frolkin
@ 2013-06-10 20:52         ` Julian Anastasov
  2013-06-11 12:38           ` Alexander Frolkin
  1 sibling, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-06-10 20:52 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Mon, 10 Jun 2013, Alexander Frolkin wrote:

> Hi,
> 
> Attached is a patch for sloppy TCP and SCTP against the upstream kernel.

	Please post patches inline, not as attached file.
Refer to Documentation/email-clients.txt for details about
your email client.

	The check for initial sSR state should be in
set_tcp_state() and set_sctp_state() because ip_vs_set_state()
can be called also from ip_vs_leave(). For example:

	int old_state;

	old_state = (cp->state != IP_VS_TCP_S_NONE) ?
		    cp->state : IP_VS_TCP_S_SYN_RECV;
	new_state = 
		pd->tcp_state_table[state_off+state_idx].next_state[old_state];

	But may be we can do it in a simpler way: in both
tables, tcp_states[] and tcp_states_dos[], the sNO column for
INPUT and INPUT-ONLY can be changed to contain
sES for the "ack" case, which happens only in sloppy mode:

diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 50a1594..6b2c6d6 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -401,7 +401,7 @@ static struct tcp_states_t tcp_states [] = {
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
 
 /*	OUTPUT */
@@ -415,7 +415,7 @@ static struct tcp_states_t tcp_states [] = {
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 };
 
@@ -424,7 +424,7 @@ static struct tcp_states_t tcp_states_dos [] = {
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
-/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 
 /*	OUTPUT */
@@ -438,7 +438,7 @@ static struct tcp_states_t tcp_states_dos [] = {
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
 /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 };
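
	To see the effect, here is a small userspace model of that lookup
(a sketch only, not kernel code; the state/event indexes are just
assumptions for the example). With the patched INPUT table, a bare ACK
arriving while the connection is still in sNO now yields sES instead
of sCL:

#include <stdio.h>

enum { sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA, S_LAST };
enum { EV_SYN, EV_FIN, EV_ACK, EV_RST };

static const char *names[S_LAST] = {
	"sNO", "sES", "sSS", "sSR", "sFW", "sTW",
	"sCL", "sCW", "sLA", "sLI", "sSA"
};

/* INPUT direction with the patched sNO/ack entry (sCL -> sES) */
static const int input[4][S_LAST] = {
/*        sNO  sES  sSS  sSR  sFW  sTW  sCL  sCW  sLA  sLI  sSA */
/*syn*/	{ sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR },
/*fin*/	{ sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW },
/*ack*/	{ sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES },
/*rst*/	{ sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR },
};

int main(void)
{
	/* sloppy mode created the connection on a bare ACK: state is sNO */
	int next = input[EV_ACK][sNO];

	printf("sNO + ack -> %s\n", names[next]);	/* prints sES */
	return 0;
}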
 

	When you post the patch inline I'll comment
about some styling problems.

> checkpatch.pl throws up errors, but they refer to stuff that was there
> before, not my changes.
> 
> I've added a bit of code to set the initial state for a sloppy TCP
> connection.  I have a template (in the patch) for a similar bit of code
> for SCTP, but I'm hoping that someone can help me work out what the
> initial state should be, since I know next to nothing about SCTP...

	We have to check what states should be changed
for SCTP; I'll think more about it tomorrow. There should be changes
for the 'STATE : IP_VS_SCTP_S_NONE' case.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-10 19:31         ` Julian Anastasov
@ 2013-06-11  8:38           ` Alexander Frolkin
  2013-06-11 19:57             ` Julian Anastasov
  0 siblings, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-11  8:38 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> > Is there a reason why the SH fallback behaviour shouldn't be default?
> > That is, is there a reason why the current behaviour (client connection
> > gets reset if it is directed to a realserver with weight 0) is
> > desirable?
> I don't know, the authors preferred this behaviour.

Is it worth looking at changing this?  Or is this going to be too
difficult a change to push through?

I just don't understand why rejecting a client connection when there are
servers available is desirable behaviour.


Alex


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-10 20:52         ` Julian Anastasov
@ 2013-06-11 12:38           ` Alexander Frolkin
  2013-06-11 20:13             ` Julian Anastasov
  0 siblings, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-11 12:38 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> 	When you post the patch inline I'll comment
> about some styling problems.

Updated patch, including your changes from the last email:

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4405886..22bea5d 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1002,6 +1002,8 @@ struct netns_ipvs {
 	int			sysctl_sync_sock_size;
 	int			sysctl_cache_bypass;
 	int			sysctl_expire_nodest_conn;
+	int			sysctl_sloppy_tcp;
+	int			sysctl_sloppy_sctp;
 	int			sysctl_expire_quiescent_template;
 	int			sysctl_sync_threshold[2];
 	unsigned int		sysctl_sync_refresh_period;
@@ -1044,6 +1046,8 @@ struct netns_ipvs {
 #define DEFAULT_SYNC_THRESHOLD	3
 #define DEFAULT_SYNC_PERIOD	50
 #define DEFAULT_SYNC_VER	1
+#define DEFAULT_SLOPPY_TCP	0
+#define DEFAULT_SLOPPY_SCTP	0
 #define DEFAULT_SYNC_REFRESH_PERIOD	(0U * HZ)
 #define DEFAULT_SYNC_RETRIES		0
 #define IPVS_SYNC_WAKEUP_RATE	8
@@ -1080,6 +1084,16 @@ static inline int sysctl_sync_ver(struct netns_ipvs *ipvs)
 	return ipvs->sysctl_sync_ver;
 }
 
+static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs)
+{
+	return ipvs->sysctl_sloppy_tcp;
+}
+
+static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs)
+{
+	return ipvs->sysctl_sloppy_sctp;
+}
+
 static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
 {
 	return ACCESS_ONCE(ipvs->sysctl_sync_ports);
@@ -1133,6 +1147,16 @@ static inline int sysctl_sync_ver(struct netns_ipvs *ipvs)
 	return DEFAULT_SYNC_VER;
 }
 
+static inline int sysctl_sloppy_tcp(struct netns_ipvs *ipvs)
+{
+	return DEFAULT_SLOPPY_TCP;
+}
+
+static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs)
+{
+	return DEFAULT_SLOPPY_SCTP;
+}
+
 static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
 {
 	return 1;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 7014649..04f8cbc 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1739,6 +1739,18 @@ static struct ctl_table vs_vars[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "sloppy_tcp",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sloppy_sctp",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "expire_quiescent_template",
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
@@ -3722,6 +3734,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
 	tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
 	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
 	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+	tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
+	tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
 	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
 	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
 	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 8646488..9b1867b 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -15,6 +15,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 {
 	struct net *net;
 	struct ip_vs_service *svc;
+	struct netns_ipvs *ipvs;
 	sctp_chunkhdr_t _schunkh, *sch;
 	sctp_sctphdr_t *sh, _sctph;
 
@@ -27,13 +28,15 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 	if (sch == NULL)
 		return 0;
 	net = skb_net(skb);
+	ipvs = net_ipvs(net);
 	rcu_read_lock();
-	if ((sch->type == SCTP_CID_INIT) &&
+	if ((sysctl_sloppy_sctp(ipvs) ||
+	    (sch->type == SCTP_CID_INIT)) &&
 	    (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
 				      &iph->daddr, sh->dest))) {
 		int ignored;
 
-		if (ip_vs_todrop(net_ipvs(net))) {
+		if (ip_vs_todrop(ipvs)) {
 			/*
 			 * It seems that we are very loaded.
 			 * We have to drop this packet :(
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 50a1594..3fd23fa 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -39,6 +39,7 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 	struct net *net;
 	struct ip_vs_service *svc;
 	struct tcphdr _tcph, *th;
+	struct netns_ipvs *ipvs;
 
 	th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
 	if (th == NULL) {
@@ -46,14 +47,15 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		return 0;
 	}
 	net = skb_net(skb);
+	ipvs = net_ipvs(net);
 	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
 	rcu_read_lock();
-	if (th->syn &&
+	if ((sysctl_sloppy_tcp(ipvs) || th->syn) && !th->rst &&
 	    (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
 				      &iph->daddr, th->dest))) {
 		int ignored;
 
-		if (ip_vs_todrop(net_ipvs(net))) {
+		if (ip_vs_todrop(ipvs)) {
 			/*
 			 * It seems that we are very loaded.
 			 * We have to drop this packet :(
@@ -401,7 +403,7 @@ static struct tcp_states_t tcp_states [] = {
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
 
 /*	OUTPUT */
@@ -415,7 +417,7 @@ static struct tcp_states_t tcp_states [] = {
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 };
 
@@ -424,7 +426,7 @@ static struct tcp_states_t tcp_states_dos [] = {
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
-/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 
 /*	OUTPUT */
@@ -438,7 +440,7 @@ static struct tcp_states_t tcp_states_dos [] = {
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
 /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 };
 
I haven't touched the SCTP state tables yet.


Alex


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-11  8:38           ` Alexander Frolkin
@ 2013-06-11 19:57             ` Julian Anastasov
  2013-06-12 14:10               ` Alexander Frolkin
  0 siblings, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-06-11 19:57 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Tue, 11 Jun 2013, Alexander Frolkin wrote:

> Hi,
> 
> > > Is there a reason why the SH fallback behaviour shouldn't be default?
> > > That is, is there a reason why the current behaviour (client connection
> > > gets reset if it is directed to a realserver with weight 0) is
> > > desirable?
> > I don't know, the authors preferred this behaviour.
> 
> Is it worth looking at changing this?  Or is this going to be too
> difficult a change to push through?

	I'm not sure how SH is used; maybe failed
dests are removed from the list to avoid connection
failures.

> I just don't understand why rejecting a client connection when there are
> servers available is desirable behaviour.

	The problem is that every move leads to problems:

- add/remove destination => mapping is changed for all dests

- set weight to 0 and allow fallback => mapping is changed for
	two connections from same IP

	As a result, it is a bad idea to remove dests that have
failed. In that case, without fallback, some clients are never
served. But with fallback we can break the
implicit persistence. I see two kinds of uses:

- persistence implemented with SH => fallback is risky. Usually,
we use expire_quiescent_template for such cases when persistence
is used.

- same mapping for many directors => fallback is desired when
config is same on all directors and persistence behaviour is
not desired.

	So, it really depends on what our goals are when using SH.
Not sure if we can apply the expire_quiescent_template flag to
the SH scheduler to control fallback.
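
	As a rough illustration of the difference (a toy model only; the
real bucket assignment walks the weighted dest list, but the effect is
similar): removing a dest remaps a large part of the bucket table, while
draining a dest to weight 0 and falling back only moves the buckets that
dest owned, and only for those clients is the implicit persistence broken.

/* Toy model: NBUCKETS hash buckets assigned to dests round-robin.
 * Sketch only; not the real IPVS SH bucket assignment code. */
#include <stdio.h>

#define NBUCKETS 8

static void fill(int table[NBUCKETS], int ndests)
{
	int i;

	for (i = 0; i < NBUCKETS; i++)
		table[i] = i % ndests;		/* dest owning bucket i */
}

int main(void)
{
	int before[NBUCKETS], after[NBUCKETS];
	int i, moved = 0;

	fill(before, 3);			/* three dests */
	fill(after, 2);				/* one dest removed */
	for (i = 0; i < NBUCKETS; i++)
		moved += (before[i] != after[i]);
	printf("remove dest: %d/%d buckets remapped\n", moved, NBUCKETS);

	moved = 0;				/* dest 2 drained to weight 0 */
	for (i = 0; i < NBUCKETS; i++)
		moved += (before[i] == 2);	/* only these fall back */
	printf("drain dest:  %d/%d buckets remapped\n", moved, NBUCKETS);
	return 0;
}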

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-11 12:38           ` Alexander Frolkin
@ 2013-06-11 20:13             ` Julian Anastasov
  2013-06-12 10:49               ` Alexander Frolkin
  0 siblings, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-06-11 20:13 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Tue, 11 Jun 2013, Alexander Frolkin wrote:

> diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> index 8646488..9b1867b 100644
> --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
> +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> @@ -15,6 +15,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
>  {
>  	struct net *net;
>  	struct ip_vs_service *svc;
> +	struct netns_ipvs *ipvs;
>  	sctp_chunkhdr_t _schunkh, *sch;
>  	sctp_sctphdr_t *sh, _sctph;
>  
> @@ -27,13 +28,15 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
>  	if (sch == NULL)
>  		return 0;
>  	net = skb_net(skb);
> +	ipvs = net_ipvs(net);
>  	rcu_read_lock();
> -	if ((sch->type == SCTP_CID_INIT) &&
> +	if ((sysctl_sloppy_sctp(ipvs) ||
> +	    (sch->type == SCTP_CID_INIT)) &&

	The above change can be (all on one line):

-	if ((sch->type == SCTP_CID_INIT) &&
+	if ((sysctl_sloppy_sctp(ipvs) || sch->type == SCTP_CID_INIT) &&

> I haven't touched the SCTP state tables yet.

	Here is the SCTP part we can use:

diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 8646488..ae61212 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -232,21 +234,21 @@ static struct ipvs_sctp_nextstate
 	 * STATE : IP_VS_SCTP_S_NONE
 	 */
 	/*next state *//*event */
-	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	{{IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_CLI */ },
 	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
 	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
 	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
-	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
 	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
-	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
 	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
-	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
 	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
 	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
 	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
-	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
 	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
-	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
 	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
 	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
 	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ },
 
	You can continue with an official submission with
all these changes. The subject should be in this format:

[PATCH] ipvs: ...

	You can check Documentation/SubmittingPatches

	For example:

[PATCH] ipvs: sloppy TCP and SCTP

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-11 20:13             ` Julian Anastasov
@ 2013-06-12 10:49               ` Alexander Frolkin
  0 siblings, 0 replies; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-12 10:49 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> 	You can continue with an official submission with
> all these changes.

I've incorporated your changes and sent a separate email with the patch.

Thanks for your help!


Alex


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-11 19:57             ` Julian Anastasov
@ 2013-06-12 14:10               ` Alexander Frolkin
  2013-06-12 20:47                 ` Julian Anastasov
  0 siblings, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-12 14:10 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> > I just don't understand why rejecting a client connection when there are
> > servers available is desirable behaviour.
> 	The problem is that every move leads to problems:
> - add/remove destination => mapping is changed for all dests
> - set weight to 0 and allow fallback => mapping is changed for
> 	two connections from same IP

Fair enough, although I would guess two connections from the same IP
going to different servers wouldn't be an issue in many cases.

> - persistence implemented with SH => fallback is risky. Usually,
> we use expire_quiescent_template for such cases when persistence
> is used.

Can you elaborate on what you mean by "risky" here?

> - same mapping for many directors => fallback is desired when
> config is same on all directors and persistence behaviour is
> not desired.

Indeed.

> Not sure if we can apply the expire_quiescent_template flag to
> the SH scheduler to control fallback.

But then it's controlled by a sysctl, not per virtual server, which is
something we didn't want, I believe.

Are you happy for me to go ahead and add the per-service scheduler
flags (IP_VS_SVC_F_SCHED1, etc.), like you suggested previously?

At the moment, the patch looks like this (pending a decision on how to
enable the features):

diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 0df269d..abd8ed6 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -48,6 +48,10 @@
 
 #include <net/ip_vs.h>
 
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
 
 /*
  *      IPVS SH bucket
@@ -74,7 +78,9 @@ struct ip_vs_sh_state {
 /*
  *	Returns hash value for IPVS SH entry
  */
-static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned int ip_vs_sh_hashkey(int af,
+	const union nf_inet_addr *addr, __be16 port,
+	unsigned int offset)
 {
 	__be32 addr_fold = addr->ip;
 
@@ -83,7 +89,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
+	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+		IP_VS_SH_TAB_MASK;
 }
 
 
@@ -91,9 +98,11 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
  *      Get ip_vs_dest associated with supplied parameters.
  */
 static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
+ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr,
+	__be16 port, unsigned int offset)
 {
-	return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
+	return rcu_dereference(
+		s->buckets[ip_vs_sh_hashkey(af, addr, port, offset)].dest);
 }
 
 
@@ -232,17 +241,43 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	struct ip_vs_dest *dest;
 	struct ip_vs_sh_state *s;
 	struct ip_vs_iphdr iph;
+	unsigned int offset;
+	unsigned int found;
 
 	ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
 
 	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
 
+	/* XXX if L4 hash */
+	if (0)
+		port = ip_vs_sh_get_port(svc, skb, iph);
+	else
+		port = 0;
+
 	s = (struct ip_vs_sh_state *) svc->sched_data;
-	dest = ip_vs_sh_get(svc->af, s, &iph.saddr);
-	if (!dest
-	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
-	    || atomic_read(&dest->weight) <= 0
-	    || is_overloaded(dest)) {
+	/* XXX if fallback */
+	if (0) {
+		found = 0;
+		for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+			dest = ip_vs_sh_get(svc->af, s, &iph.saddr,
+				port, offset);
+			if (!is_available(dest)) {
+				IP_VS_DBG_BUF(6, "SH: selected unavailable"
+					"server %s:%d, retrying with offset"
+					"%d\n",
+					IP_VS_DBG_ADDR(svc->af, &dest->addr),
+					ntohs(dest->port),
+					offset);
+			} else {
+				found = 1;
+				break;
+			}
+		}
+	} else {
+		dest = ip_vs_sh_get(svc->af, s, &iph.saddr, port, 0);
+		found = 1;
+	}
+	if (!found || !is_available(dest)) {
 		ip_vs_scheduler_err(svc, "no destination available");
 		return NULL;
 	}
@@ -255,6 +290,50 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	return dest;
 }
 
+/*
+ *	Helper function to determine if server is available
+ */
+static inline int
+is_available(struct ip_vs_dest *dest)
+{
+	return (!dest ||
+		!(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
+		atomic_read(&dest->weight) <= 0 ||
+		is_overloaded(dest))
+}
+
+/*
+ *	Helper function to get port number
+ */
+static inline __be16
+ip_vs_sh_get_port(struct ip_vs_service *svc, const struct sk_buff *skb,
+	struct ip_vs_iphdr iph)
+{
+	__be16 port;
+	struct tcphdr _tcph, *th;
+	struct udphdr _udph, *uh;
+	sctp_sctphdr_t _sctph, *sh;
+
+	switch (svc->protocol) {
+	case IPPROTO_TCP:
+		th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
+		port = th->source;
+		break;
+	case IPPROTO_UDP:
+		uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
+		port = uh->source;
+		break;
+	case IPPROTO_SCTP:
+		sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph);
+		port = sh->source;
+		break;
+	default:
+		port = 0;
+	}
+
+	return port;
+}
+
 
 /*
  *      IPVS SH Scheduler structure


Alex


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-12 14:10               ` Alexander Frolkin
@ 2013-06-12 20:47                 ` Julian Anastasov
  2013-06-13  8:38                   ` Alexander Frolkin
                                     ` (2 more replies)
  0 siblings, 3 replies; 52+ messages in thread
From: Julian Anastasov @ 2013-06-12 20:47 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Wed, 12 Jun 2013, Alexander Frolkin wrote:

> Hi,
> 
> > > I just don't understand why rejecting a client connection when there are
> > > servers available is desirable behaviour.
> > 	The problem is that every move leads to problems:
> > - add/remove destination => mapping is changed for all dests
> > - set weight to 0 and allow fallback => mapping is changed for
> > 	two connections from same IP
> 
> Fair enough, although I would guess two connections from the same IP
> going to different servers wouldn't be an issue in many cases.
> 
> > - persistence implemented with SH => fallback is risky. Usually,
> > we use expire_quiescent_template for such cases when persistence
> > is used.
> 
> Can you elaborate on what you mean by "risky" here?

	Risky means: we break the rules of persistence.
One established connection can continue to work during
weight=0 while we create a new connection to another real
server. Users can set weight=0 for seconds; clients
can retry on connection reject, and when the real server
is back we can continue in the same session.
Of course, it all depends on the application. Some applications
are not tolerant of connection rejects.

	Persistence is explained here:

http://www.austintek.com/LVS/LVS-HOWTO/HOWTO/LVS-HOWTO.persistent_connection.html

	SH is a poor choice for persistence: there is no
persistence timeout, no client netmask, no control with
expire_quiescent_template. That is why I'm not sure
who uses SH and how.

> > - same mapping for many directors => fallback is desired when
> > config is same on all directors and persistence behaviour is
> > not desired.
> 
> Indeed.
> 
> > Not sure if we can apply the expire_quiescent_template flag to
> > the SH scheduler to control fallback.
> 
> But then it's controlled by a sysctl, not per virtual server, which is
> something we didn't want, I believe.

	Yes

> Are you happy for me to go ahead and add the per-service scheduler
> flags (IP_VS_SVC_F_SCHED1, etc.), like you suggested previously?

	Yes, not sure what others think, maybe changes for
ipvsadm will be needed:

git://git.kernel.org/pub/scm/utils/kernel/ipvsadm/ipvsadm.git

	And then we can start a wider discussion:

lvs-users@linuxvirtualserver.org
lvs-devel@vger.kernel.org

> At the moment, the patch looks like this (pending a decision on how to
> enable the features):
> 
> diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
> index 0df269d..abd8ed6 100644
> --- a/net/netfilter/ipvs/ip_vs_sh.c
> +++ b/net/netfilter/ipvs/ip_vs_sh.c
> @@ -48,6 +48,10 @@
>  
>  #include <net/ip_vs.h>
>  
> +#include <net/tcp.h>
> +#include <linux/udp.h>
> +#include <linux/sctp.h>
> +
>  
>  /*
>   *      IPVS SH bucket
> @@ -74,7 +78,9 @@ struct ip_vs_sh_state {
>  /*
>   *	Returns hash value for IPVS SH entry
>   */
> -static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
> +static inline unsigned int ip_vs_sh_hashkey(int af,
> +	const union nf_inet_addr *addr, __be16 port,
> +	unsigned int offset)
>  {
>  	__be32 addr_fold = addr->ip;
>  
> @@ -83,7 +89,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
>  		addr_fold = addr->ip6[0]^addr->ip6[1]^
>  			    addr->ip6[2]^addr->ip6[3];
>  #endif
> -	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
> +	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
> +		IP_VS_SH_TAB_MASK;
>  }
>  
>  
> @@ -91,9 +98,11 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
>   *      Get ip_vs_dest associated with supplied parameters.
>   */
>  static inline struct ip_vs_dest *
> -ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
> +ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr,
> +	__be16 port, unsigned int offset)
>  {
> -	return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
> +	return rcu_dereference(
> +		s->buckets[ip_vs_sh_hashkey(af, addr, port, offset)].dest);
>  }
>  
>  
> @@ -232,17 +241,43 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
>  	struct ip_vs_dest *dest;
>  	struct ip_vs_sh_state *s;
>  	struct ip_vs_iphdr iph;
> +	unsigned int offset;
> +	unsigned int found;

	bool found;

	later use false/true.

>  
>  	ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
>  
>  	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
>  
> +	/* XXX if L4 hash */
> +	if (0)
> +		port = ip_vs_sh_get_port(svc, skb, iph);
> +	else
> +		port = 0;
> +
>  	s = (struct ip_vs_sh_state *) svc->sched_data;
> -	dest = ip_vs_sh_get(svc->af, s, &iph.saddr);
> -	if (!dest
> -	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)

	We should remove all IP_VS_DEST_F_AVAILABLE checks
from the SH and DH schedulers; such checks are needed only
for LBLC and LBLCR because only they can hold removed
dests in the scheduler context.

> -	    || atomic_read(&dest->weight) <= 0
> -	    || is_overloaded(dest)) {
> +	/* XXX if fallback */
> +	if (0) {
> +		found = 0;
> +		for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
> +			dest = ip_vs_sh_get(svc->af, s, &iph.saddr,
> +				port, offset);
> +			if (!is_available(dest)) {
> +				IP_VS_DBG_BUF(6, "SH: selected unavailable"
> +					"server %s:%d, retrying with offset"
> +					"%d\n",
> +					IP_VS_DBG_ADDR(svc->af, &dest->addr),
> +					ntohs(dest->port),
> +					offset);
> +			} else {
> +				found = 1;

	Maybe a goto is more appropriate here, to avoid a second
is_available(dest) call.

> +				break;
> +			}
> +		}
> +	} else {
> +		dest = ip_vs_sh_get(svc->af, s, &iph.saddr, port, 0);
> +		found = 1;
> +	}
> +	if (!found || !is_available(dest)) {
>  		ip_vs_scheduler_err(svc, "no destination available");
>  		return NULL;
>  	}
> @@ -255,6 +290,50 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
>  	return dest;
>  }
>  
> +/*
> + *	Helper function to determine if server is available
> + */
> +static inline int
> +is_available(struct ip_vs_dest *dest)
> +{
> +	return (!dest ||
> +		!(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
> +		atomic_read(&dest->weight) <= 0 ||
> +		is_overloaded(dest))
> +}
> +
> +/*
> + *	Helper function to get port number
> + */
> +static inline __be16
> +ip_vs_sh_get_port(struct ip_vs_service *svc, const struct sk_buff *skb,
> +	struct ip_vs_iphdr iph)

	Please use *iph here; passing a large structure by
value is a bad idea.

> +{
> +	__be16 port;
> +	struct tcphdr _tcph, *th;
> +	struct udphdr _udph, *uh;
> +	sctp_sctphdr_t _sctph, *sh;
> +
> +	switch (svc->protocol) {

	Use iph->protocol instead of svc->protocol because not
all services have correct protocol.

> +	case IPPROTO_TCP:
> +		th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
> +		port = th->source;
> +		break;
> +	case IPPROTO_UDP:
> +		uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
> +		port = uh->source;
> +		break;
> +	case IPPROTO_SCTP:
> +		sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph);
> +		port = sh->source;
> +		break;
> +	default:
> +		port = 0;
> +	}
> +
> +	return port;
> +}
> +
>  
>  /*
>   *      IPVS SH Scheduler structure
> 
> 
> Alex

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-12 20:47                 ` Julian Anastasov
@ 2013-06-13  8:38                   ` Alexander Frolkin
  2013-06-13 12:56                   ` Alexander Frolkin
  2013-06-13 14:18                   ` Alexander Frolkin
  2 siblings, 0 replies; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-13  8:38 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> 	Yes, not sure what others think, maybe changes for
> ipvsadm will be needed:
> 
> git://git.kernel.org/pub/scm/utils/kernel/ipvsadm/ipvsadm.git

Do you have any preferences for the command-line syntax to set the
flags?  --sched-flag-1 --sched-flag-2, --sched-flag 1 --sched-flag 2,
--sched-flags 12, or something else?  Options 2 and 3 mean we can have
corresponding short options.  I think option 3 makes the most sense with
-E (you set all the flags explicitly, rather than implicitly clearing
flags by omitting the option, which I think would not be obvious), but
we need a nice syntax for the option argument.  What do you think?

The latest IPVS patch looks like this:

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index a245377..81af9b2 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -20,6 +20,9 @@
 #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
 #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
+#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
+#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
+#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
 
 /*
  *      Destination Server Flags
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 0df269d..847d1c7 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -48,6 +48,10 @@
 
 #include <net/ip_vs.h>
 
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
 
 /*
  *      IPVS SH bucket
@@ -74,7 +78,9 @@ struct ip_vs_sh_state {
 /*
  *	Returns hash value for IPVS SH entry
  */
-static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned int ip_vs_sh_hashkey(int af,
+	const union nf_inet_addr *addr, __be16 port,
+	unsigned int offset)
 {
 	__be32 addr_fold = addr->ip;
 
@@ -83,7 +89,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
+	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+		IP_VS_SH_TAB_MASK;
 }
 
 
@@ -91,9 +98,11 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
  *      Get ip_vs_dest associated with supplied parameters.
  */
 static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
+ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr,
+	__be16 port, unsigned int offset)
 {
-	return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
+	return rcu_dereference(
+		s->buckets[ip_vs_sh_hashkey(af, addr, port, offset)].dest);
 }
 
 
@@ -224,6 +233,50 @@ static inline int is_overloaded(struct ip_vs_dest *dest)
 
 
 /*
+ *	Helper function to determine if server is available
+ */
+static inline int
+is_available(struct ip_vs_dest *dest)
+{
+	return (!dest ||
+		atomic_read(&dest->weight) <= 0 ||
+		is_overloaded(dest));
+}
+
+
+/*
+ *	Helper function to get port number
+ */
+static inline __be16
+ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
+{
+	__be16 port;
+	struct tcphdr _tcph, *th;
+	struct udphdr _udph, *uh;
+	sctp_sctphdr_t _sctph, *sh;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+		port = th->source;
+		break;
+	case IPPROTO_UDP:
+		uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+		port = uh->source;
+		break;
+	case IPPROTO_SCTP:
+		sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+		port = sh->source;
+		break;
+	default:
+		port = 0;
+	}
+
+	return port;
+}
+
+
+/*
  *      Source Hashing scheduling
  */
 static struct ip_vs_dest *
@@ -232,21 +285,45 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	struct ip_vs_dest *dest;
 	struct ip_vs_sh_state *s;
 	struct ip_vs_iphdr iph;
+	__be16 port;
+	unsigned int offset;
+	bool found;
 
 	ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
 
 	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
 
+	if (svc->flags & IP_VS_SVC_F_SCHED1)
+		port = ip_vs_sh_get_port(skb, &iph);
+	else
+		port = 0;
+
 	s = (struct ip_vs_sh_state *) svc->sched_data;
-	dest = ip_vs_sh_get(svc->af, s, &iph.saddr);
-	if (!dest
-	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
-	    || atomic_read(&dest->weight) <= 0
-	    || is_overloaded(dest)) {
+	if (svc->flags & IP_VS_SVC_F_SCHED2) {
+		found = false;
+		for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+			dest = ip_vs_sh_get(svc->af, s, &iph.saddr,
+				port, offset);
+			if (!is_available(dest))
+				IP_VS_DBG_BUF(6, "SH: selected unavailable"
+					"server %s:%d, retrying with offset"
+					"%d\n",
+					IP_VS_DBG_ADDR(svc->af, &dest->addr),
+					ntohs(dest->port),
+					offset);
+			else
+				goto found_dest;
+		}
+	} else {
+		dest = ip_vs_sh_get(svc->af, s, &iph.saddr, port, 0);
+		found = true;
+	}
+	if (!found || !is_available(dest)) {
 		ip_vs_scheduler_err(svc, "no destination available");
 		return NULL;
 	}
 
+ found_dest:
 	IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
 		      IP_VS_DBG_ADDR(svc->af, &iph.saddr),
 		      IP_VS_DBG_ADDR(svc->af, &dest->addr),


Alex


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-12 20:47                 ` Julian Anastasov
  2013-06-13  8:38                   ` Alexander Frolkin
@ 2013-06-13 12:56                   ` Alexander Frolkin
  2013-06-13 19:50                     ` Julian Anastasov
  2013-06-13 14:18                   ` Alexander Frolkin
  2 siblings, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-13 12:56 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> > +{
> > +	__be16 port;
> > +	struct tcphdr _tcph, *th;
> > +	struct udphdr _udph, *uh;
> > +	sctp_sctphdr_t _sctph, *sh;
> > +
> > +	switch (svc->protocol) {
> 	Use iph->protocol instead of svc->protocol because not
> all services have correct protocol.

In order for this to work, I would have to change
ip_vs_fill_iph_addr_only in ip_vs_sh_schedule to ip_vs_fill_iph_skb (or
add an ip_vs_fill_iph_addr_proto_only function).  Obviously, I can do
this only when the flag is enabled.

I can also get it from skb, but that will make the code more
complicated.

What do you think?


Alex


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-12 20:47                 ` Julian Anastasov
  2013-06-13  8:38                   ` Alexander Frolkin
  2013-06-13 12:56                   ` Alexander Frolkin
@ 2013-06-13 14:18                   ` Alexander Frolkin
  2013-06-13 20:31                     ` Julian Anastasov
  2 siblings, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-13 14:18 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

I've patched ipvsadm and fixed up the kernel patch.

For the ipvsadm option, I've used (-b|--sched-flags) 123.  I don't
particularly like this style, but I wanted something working for
testing.

I'm using ip_vs_fill_iph_skb for now (if the flag is set), until I hear
back from you.

When you're happy with the patches, I can open the discussion up to the
users mailing list.

Kernel patch:

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index a245377..81af9b2 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -20,6 +20,9 @@
 #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
 #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
+#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
+#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
+#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
 
 /*
  *      Destination Server Flags
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 0df269d..f9de4d2 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -48,6 +48,10 @@
 
 #include <net/ip_vs.h>
 
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
 
 /*
  *      IPVS SH bucket
@@ -74,7 +78,9 @@ struct ip_vs_sh_state {
 /*
  *	Returns hash value for IPVS SH entry
  */
-static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned int ip_vs_sh_hashkey(int af,
+	const union nf_inet_addr *addr, __be16 port,
+	unsigned int offset)
 {
 	__be32 addr_fold = addr->ip;
 
@@ -83,7 +89,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
+	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+		IP_VS_SH_TAB_MASK;
 }
 
 
@@ -91,9 +98,11 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
  *      Get ip_vs_dest associated with supplied parameters.
  */
 static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
+ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr,
+	__be16 port, unsigned int offset)
 {
-	return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
+	return rcu_dereference(
+		s->buckets[ip_vs_sh_hashkey(af, addr, port, offset)].dest);
 }
 
 
@@ -224,6 +233,50 @@ static inline int is_overloaded(struct ip_vs_dest *dest)
 
 
 /*
+ *	Helper function to determine if server is unavailable
+ */
+static inline int
+is_unavailable(struct ip_vs_dest *dest)
+{
+	return (!dest ||
+		atomic_read(&dest->weight) <= 0 ||
+		is_overloaded(dest));
+}
+
+
+/*
+ *	Helper function to get port number
+ */
+static inline __be16
+ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
+{
+	__be16 port;
+	struct tcphdr _tcph, *th;
+	struct udphdr _udph, *uh;
+	sctp_sctphdr_t _sctph, *sh;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+		port = th->source;
+		break;
+	case IPPROTO_UDP:
+		uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+		port = uh->source;
+		break;
+	case IPPROTO_SCTP:
+		sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+		port = sh->source;
+		break;
+	default:
+		port = 0;
+	}
+
+	return port;
+}
+
+
+/*
  *      Source Hashing scheduling
  */
 static struct ip_vs_dest *
@@ -232,21 +285,45 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	struct ip_vs_dest *dest;
 	struct ip_vs_sh_state *s;
 	struct ip_vs_iphdr iph;
-
-	ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
+	__be16 port;
+	unsigned int offset;
+	bool found;
 
 	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
 
+	if (svc->flags & IP_VS_SVC_F_SCHED1) {
+		ip_vs_fill_iph_skb(svc->af, skb, &iph);
+		port = ip_vs_sh_get_port(skb, &iph);
+	} else {
+		ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
+		port = 0;
+	}
+
 	s = (struct ip_vs_sh_state *) svc->sched_data;
-	dest = ip_vs_sh_get(svc->af, s, &iph.saddr);
-	if (!dest
-	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
-	    || atomic_read(&dest->weight) <= 0
-	    || is_overloaded(dest)) {
+	if (svc->flags & IP_VS_SVC_F_SCHED2) {
+		found = false;
+		for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+			dest = ip_vs_sh_get(svc->af, s, &iph.saddr,
+				port, offset);
+			if (is_unavailable(dest))
+				IP_VS_DBG_BUF(6, "SH: selected unavailable "
+					"server %s:%d (offset %d)",
+					IP_VS_DBG_ADDR(svc->af, &dest->addr),
+					ntohs(dest->port),
+					offset);
+			else
+				goto found_dest;
+		}
+	} else {
+		dest = ip_vs_sh_get(svc->af, s, &iph.saddr, port, 0);
+		found = true;
+	}
+	if (!found || is_unavailable(dest)) {
 		ip_vs_scheduler_err(svc, "no destination available");
 		return NULL;
 	}
 
+ found_dest:
 	IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
 		      IP_VS_DBG_ADDR(svc->af, &iph.saddr),
 		      IP_VS_DBG_ADDR(svc->af, &dest->addr),
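
For reference, here is a small userspace model of the new hash key and
of the fallback walk over bucket offsets (it assumes the default
IP_VS_SH_TAB_BITS of 8, i.e. 256 buckets, and only models the
arithmetic, not the scheduler itself):

/* Userspace model of ip_vs_sh_hashkey() plus the fallback offset walk.
 * Assumes the default table size (IP_VS_SH_TAB_BITS = 8 -> 256). */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define IP_VS_SH_TAB_SIZE	256
#define IP_VS_SH_TAB_MASK	(IP_VS_SH_TAB_SIZE - 1)

static unsigned int hashkey(uint32_t addr, uint16_t port, unsigned int offset)
{
	/* addr and port are in network byte order, as in the kernel code */
	return (offset + (ntohs(port) + ntohl(addr)) * 2654435761UL) &
		IP_VS_SH_TAB_MASK;
}

int main(void)
{
	uint32_t saddr = htonl(0xc0a80001);	/* 192.168.0.1 */
	uint16_t sport = htons(40000);
	unsigned int offset;

	/* offset 0 is the normal pick; with the fallback flag the scheduler
	 * keeps incrementing the offset until it finds a usable dest */
	for (offset = 0; offset < 4; offset++)
		printf("offset %u -> bucket %u\n", offset,
		       hashkey(saddr, sport, offset));
	return 0;
}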

ipvsadm patch:

diff --git a/ipvsadm.8 b/ipvsadm.8
index 001ae74..e24f5d0 100644
--- a/ipvsadm.8
+++ b/ipvsadm.8
@@ -37,7 +37,7 @@ ipvsadm \- Linux Virtual Server administration
 .SH SYNOPSIS
 .B ipvsadm -A|E -t|u|f \fIservice-address\fP [-s \fIscheduler\fP]
 .ti 15
-.B [-p [\fItimeout\fP]] [-M \fInetmask\fP]
+.B [-p [\fItimeout\fP]] [-M \fInetmask\fP] [-b \fIsched-flags\fP]
 .br
 .B ipvsadm -D -t|u|f \fIservice-address\fP
 .br
@@ -248,6 +248,9 @@ addresses.
 .sp
 \fBsh\fR - Source Hashing: assigns jobs to servers through looking up
 a statically assigned hash table by their source IP addresses.
+Scheduler flag 1 makes the scheduler include the source port in the
+hash; flag 2 makes the scheduler find a different server when a
+client is directed to a server of weight 0.
 .sp
 \fBsed\fR - Shortest Expected Delay: assigns an incoming job to the
 server with the shortest expected delay. The expected delay that the
@@ -286,6 +289,11 @@ resolve problems with non-persistent cache clusters on the client side.
 IPv6 netmasks should be specified as a prefix length between 1 and 128.
 The default prefix length is 128.
 .TP
+.B -b, --sched-flags \fIsched-flags\fP
+Set scheduler flags for this virtual server.  The \fIsched-flags\fP is
+a string of numbers (1, 2, or 3) which specify which scheduler flags to
+set.  The function of the flags is scheduler-specific.
+.TP
 .B -r, --real-server \fIserver-address\fP
 Real server that an associated request for service may be assigned to.
 The \fIserver-address\fP is the \fIhost\fP address of a real server,
diff --git a/ipvsadm.c b/ipvsadm.c
index 0197515..878734e 100644
--- a/ipvsadm.c
+++ b/ipvsadm.c
@@ -182,7 +182,8 @@ static const char* cmdnames[] = {
 #define OPT_EXACT		0x100000
 #define OPT_ONEPACKET		0x200000
 #define OPT_PERSISTENCE_ENGINE  0x400000
-#define NUMBER_OF_OPT		23
+#define OPT_SCHED_FLAGS		0x800000
+#define NUMBER_OF_OPT		24
 
 static const char* optnames[] = {
 	"numeric",
@@ -208,6 +209,7 @@ static const char* optnames[] = {
 	"exact",
 	"ops",
 	"pe",
+	"sched-flags"
 };
 
 /*
@@ -220,21 +222,21 @@ static const char* optnames[] = {
  */
 static const char commands_v_options[NUMBER_OF_CMD][NUMBER_OF_OPT] =
 {
-	/*   -n   -c   svc  -s   -p   -M   -r   fwd  -w   -x   -y   -mc  tot  dmn  -st  -rt  thr  -pc  srt  sid  -ex  ops  -pe */
-/*ADD*/     {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' '},
-/*EDIT*/    {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' '},
-/*DEL*/     {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*FLUSH*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*LIST*/    {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x'},
-/*ADDSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*DELSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*STARTD*/  {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x'},
-/*STOPD*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x'},
-/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*SAVE*/    {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*ZERO*/    {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+	/*   -n   -c   svc  -s   -p   -M   -r   fwd  -w   -x   -y   -mc  tot  dmn  -st  -rt  thr  -pc  srt  sid  -ex  ops  -pe  scf */
+/*ADD*/     {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' ', ' '},
+/*EDIT*/    {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' ', ' '},
+/*DEL*/     {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*FLUSH*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*LIST*/    {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x', 'x'},
+/*ADDSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*DELSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*STARTD*/  {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x'},
+/*STOPD*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x'},
+/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*SAVE*/    {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*ZERO*/    {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
 };
 
 /* printing format flags */
@@ -426,6 +428,7 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 		{ "ops", 'o', POPT_ARG_NONE, NULL, 'o', NULL, NULL },
 		{ "pe", '\0', POPT_ARG_STRING, &optarg, TAG_PERSISTENCE_ENGINE,
 		  NULL, NULL },
+		{ "sched-flags", 'b', POPT_ARG_STRING, &optarg, 'b', NULL, NULL },
 		{ NULL, 0, 0, NULL, 0, NULL, NULL }
 	};
 
@@ -656,6 +659,24 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 			set_option(options, OPT_PERSISTENCE_ENGINE);
 			strncpy(ce->svc.pe_name, optarg, IP_VS_PENAME_MAXLEN);
 			break;
+		case 'b':
+			set_option(options, OPT_SCHED_FLAGS);
+			ce->svc.flags &= ~(IP_VS_SVC_F_SCHED1 | IP_VS_SVC_F_SCHED2 | IP_VS_SVC_F_SCHED3);
+			for(; *optarg != '\0'; optarg++)
+				switch (*optarg - '0') {
+				case 1:
+					ce->svc.flags |= IP_VS_SVC_F_SCHED1;
+					break;
+				case 2:
+					ce->svc.flags |= IP_VS_SVC_F_SCHED2;
+					break;
+				case 3:
+					ce->svc.flags |= IP_VS_SVC_F_SCHED3;
+					break;
+				default:
+					fail(2, "invalid scheduler flag specified");
+				}
+			break;
 		default:
 			fail(2, "invalid option `%s'",
 			     poptBadOption(context, POPT_BADOPTION_NOALIAS));
@@ -1070,7 +1091,7 @@ static void usage_exit(const char *program, const int exit_status)
 	version(stream);
 	fprintf(stream,
 		"Usage:\n"
-		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine]\n"
+		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine] [-b scheduler_flags]\n"
 		"  %s -D -t|u|f service-address\n"
 		"  %s -C\n"
 		"  %s -R\n"
@@ -1139,7 +1160,8 @@ static void usage_exit(const char *program, const int exit_status)
 		"  --nosort                            disable sorting output of service/server entries\n"
 		"  --sort                              does nothing, for backwards compatibility\n"
 		"  --ops          -o                   one-packet scheduling\n"
-		"  --numeric      -n                   numeric output of addresses and ports\n",
+		"  --numeric      -n                   numeric output of addresses and ports\n"
+		"  --sched-flags  -b flags             scheduler flags\n",
 		DEF_SCHED);
 
 	exit(exit_status);
@@ -1488,6 +1510,15 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format)
 			printf(" pe %s", se->pe_name);
 		if (se->flags & IP_VS_SVC_F_ONEPACKET)
 			printf(" -o");
+		if (se->flags & (IP_VS_SVC_F_SCHED1 | IP_VS_SVC_F_SCHED2 | IP_VS_SVC_F_SCHED3)) {
+			printf(" -b ");
+			if (se->flags & IP_VS_SVC_F_SCHED1)
+				printf("1");
+			if (se->flags & IP_VS_SVC_F_SCHED2)
+				printf("2");
+			if (se->flags & IP_VS_SVC_F_SCHED3)
+				printf("3");
+		}
 	} else if (format & FMT_STATS) {
 		printf("%-33s", svc_name);
 		print_largenum(se->stats.conns, format);
@@ -1520,6 +1551,15 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format)
 		}
 		if (se->flags & IP_VS_SVC_F_ONEPACKET)
 			printf(" ops");
+		if (se->flags & (IP_VS_SVC_F_SCHED1 | IP_VS_SVC_F_SCHED2 | IP_VS_SVC_F_SCHED3)) {
+			printf(" sfl ");
+			if (se->flags & IP_VS_SVC_F_SCHED1)
+				printf("1");
+			if (se->flags & IP_VS_SVC_F_SCHED2)
+				printf("2");
+			if (se->flags & IP_VS_SVC_F_SCHED3)
+				printf("3");
+		}
 	}
 	printf("\n");
 
diff --git a/libipvs/ip_vs.h b/libipvs/ip_vs.h
index 5e1d544..33b0115 100644
--- a/libipvs/ip_vs.h
+++ b/libipvs/ip_vs.h
@@ -29,6 +29,10 @@
 #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
 #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
+#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
+#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
+#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
+
 
 /*
  *      IPVS sync daemon states


Alex


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-13 12:56                   ` Alexander Frolkin
@ 2013-06-13 19:50                     ` Julian Anastasov
  0 siblings, 0 replies; 52+ messages in thread
From: Julian Anastasov @ 2013-06-13 19:50 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Thu, 13 Jun 2013, Alexander Frolkin wrote:

> Hi,
> 
> > > +{
> > > +	__be16 port;
> > > +	struct tcphdr _tcph, *th;
> > > +	struct udphdr _udph, *uh;
> > > +	sctp_sctphdr_t _sctph, *sh;
> > > +
> > > +	switch (svc->protocol) {
> > 	Use iph->protocol instead of svc->protocol because not
> > all services have correct protocol.
> 
> In order for this to work, I would have to change
> ip_vs_fill_iph_addr_only in ip_vs_sh_schedule to ip_vs_fill_iph_skb (or
> add an ip_vs_fill_iph_addr_proto_only function).  Obviously, I can do
> this only when the flag is enabled.
> 
> I can also get it from skb, but that will make the code more
> complicated.
> 
> What do you think?

	Maybe I have to remove this ip_vs_fill_iph_addr_only
function; we should provide iph to the schedulers. I'll post a
patch this weekend.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-13 14:18                   ` Alexander Frolkin
@ 2013-06-13 20:31                     ` Julian Anastasov
  2013-06-14 10:22                       ` Alexander Frolkin
  2013-06-14 11:47                       ` Alexander Frolkin
  0 siblings, 2 replies; 52+ messages in thread
From: Julian Anastasov @ 2013-06-13 20:31 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Thu, 13 Jun 2013, Alexander Frolkin wrote:

> Hi,
> 
> I've patched ipvsadm and fixed up the kernel patch.
> 
> For the ipvsadm option, I've used (-b|--sched-flags) 123.  I don't

	Not sure if we need "-b".

> particularly like this style, but I wanted something working for
> testing.

	I guess it is difficult to maintain many options,
maybe one option --sched-flags should be enough, for example:

--sched-flags sh-fallback,sh-port

	In all cases we should not use any of the
--sched-flag-1 variants; it is better to have scheduler-specific
tokens that will set some IP_VS_SVC_F_SCHED* flags.
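
	In ipvsadm the parsing could look roughly like this (a standalone
sketch only; the token names and which IP_VS_SVC_F_SCHED* bit each one
maps to are still open):

/* Sketch: map "sh-fallback,sh-port" style tokens to scheduler flag
 * bits.  Token names and bit assignment are placeholders, not final. */
#include <stdio.h>
#include <string.h>

#define IP_VS_SVC_F_SCHED1	0x0008
#define IP_VS_SVC_F_SCHED2	0x0010

static int parse_sched_flags(char *arg, unsigned int *flags)
{
	char *tok;

	for (tok = strtok(arg, ","); tok; tok = strtok(NULL, ",")) {
		if (!strcmp(tok, "sh-fallback"))
			*flags |= IP_VS_SVC_F_SCHED1;
		else if (!strcmp(tok, "sh-port"))
			*flags |= IP_VS_SVC_F_SCHED2;
		else
			return -1;		/* unknown token */
	}
	return 0;
}

int main(void)
{
	char arg[] = "sh-fallback,sh-port";
	unsigned int flags = 0;

	if (parse_sched_flags(arg, &flags) == 0)
		printf("flags = 0x%04x\n", flags);	/* 0x0018 */
	return 0;
}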

> I'm using ip_vs_fill_iph_skb for now (if the flag is set), until I hear
> back from you.

	OK, we will rely on the provided iph later...

> When you're happy with the patches, I can open the discussion up to the
> users mailing list.
> 
> Kernel patch:
> 
> diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
> index a245377..81af9b2 100644
> --- a/include/uapi/linux/ip_vs.h
> +++ b/include/uapi/linux/ip_vs.h
> @@ -20,6 +20,9 @@
>  #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
>  #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
>  #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
> +#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
> +#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
> +#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */

	We have to make the mapping of scheduler flags
public, for example, add:

#define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1
#define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2

	also in libipvs/ip_vs.h, as usual.

>  /*
>   *      Destination Server Flags
> diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
> index 0df269d..f9de4d2 100644
> --- a/net/netfilter/ipvs/ip_vs_sh.c
> +++ b/net/netfilter/ipvs/ip_vs_sh.c
> @@ -48,6 +48,10 @@
>  
>  #include <net/ip_vs.h>
>  
> +#include <net/tcp.h>
> +#include <linux/udp.h>
> +#include <linux/sctp.h>
> +
>  
>  /*
>   *      IPVS SH bucket
> @@ -74,7 +78,9 @@ struct ip_vs_sh_state {
>  /*
>   *	Returns hash value for IPVS SH entry
>   */
> -static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
> +static inline unsigned int ip_vs_sh_hashkey(int af,
> +	const union nf_inet_addr *addr, __be16 port,
> +	unsigned int offset)

	Arguments should be properly aligned; you can reorder
them if needed.

> +	if (svc->flags & IP_VS_SVC_F_SCHED2) {
> +		found = false;
> +		for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
> +			dest = ip_vs_sh_get(svc->af, s, &iph.saddr,
> +				port, offset);
> +			if (is_unavailable(dest))
> +				IP_VS_DBG_BUF(6, "SH: selected unavailable "
> +					"server %s:%d (offset %d)",
> +					IP_VS_DBG_ADDR(svc->af, &dest->addr),

	dest can be NULL => crash

> +					ntohs(dest->port),
> +					offset);

	Maybe we have to put this for loop in a new func, so that
the IP_VS_DBG_BUF args are properly aligned? Another option is
to move IP_VS_DBG_BUF into is_unavailable(svc, dest, offset)
and to use it only when dest != NULL.
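
	For example, a rough (untested) sketch of the second option,
only to show the shape; names and exact placement are not final:

static inline bool is_unavailable(struct ip_vs_service *svc,
				  struct ip_vs_dest *dest,
				  unsigned int offset)
{
	if (!dest)
		return true;
	if (atomic_read(&dest->weight) <= 0 ||
	    (dest->flags & IP_VS_DEST_F_OVERLOAD)) {
		/* debug output only when a real but unusable
		 * dest was selected
		 */
		IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d "
			      "(offset %d)",
			      IP_VS_DBG_ADDR(svc->af, &dest->addr),
			      ntohs(dest->port), offset);
		return true;
	}
	return false;
}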

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-13 20:31                     ` Julian Anastasov
@ 2013-06-14 10:22                       ` Alexander Frolkin
  2013-06-16  6:52                         ` Julian Anastasov
  2013-06-14 11:47                       ` Alexander Frolkin
  1 sibling, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-14 10:22 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> > For the ipvsadm option, I've used (-b|--sched-flags) 123.  I don't
> 	Not sure if we need "-b".

Well, okay.  I just have a personal preference for short options. :-)
It makes ipvsadm -Sn output look nicer, too.

> 	I guess it is difficult to maintain many options, so
> maybe one option --sched-flags should be enough, for example:
> --sched-flags sh-fallback,sh-port

Okay, that looks good.

I've done some refactoring to simplify ip_vs_sh_schedule.  Let me know
what you think.

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index a245377..2945822 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -20,6 +20,12 @@
 #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
 #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
+#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
+#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
+#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
+
+#define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
+#define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
 
 /*
  *      Destination Server Flags
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 0df269d..4bb9636 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -48,6 +48,10 @@
 
 #include <net/ip_vs.h>
 
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
 
 /*
  *      IPVS SH bucket
@@ -71,10 +75,37 @@ struct ip_vs_sh_state {
 	struct rcu_head			rcu_head;
 };
 
+
+/*
+ *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
+ *      consider that the server is overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ *	Helper function to determine if server is unavailable
+ */
+static inline int
+is_unavailable(struct ip_vs_dest *dest)
+{
+	return (!dest ||
+		atomic_read(&dest->weight) <= 0 ||
+		is_overloaded(dest));
+}
+
+
 /*
  *	Returns hash value for IPVS SH entry
  */
-static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned int
+ip_vs_sh_hashkey(int af,
+		 const union nf_inet_addr *addr,
+		 __be16 port,
+		 unsigned int offset)
 {
 	__be32 addr_fold = addr->ip;
 
@@ -83,7 +114,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
+	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+		IP_VS_SH_TAB_MASK;
 }
 
 
@@ -91,13 +123,55 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
  *      Get ip_vs_dest associated with supplied parameters.
  */
 static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
+ip_vs_sh_get(struct ip_vs_service *svc,
+	     struct ip_vs_sh_state *s,
+	     const union nf_inet_addr *addr,
+	     __be16 port)
 {
-	return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
+	struct ip_vs_dest *dest;
+
+	dest = rcu_dereference(
+		s->buckets[ip_vs_sh_hashkey(svc->af, addr, port, 0)].dest);
+
+	return is_unavailable(dest) ? NULL : dest;
 }
 
 
 /*
+ *	As ip_vs_sh_get, but with fallback if selected server is unavailable
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get_fallback(struct ip_vs_service *svc,
+		      struct ip_vs_sh_state *s,
+		      const union nf_inet_addr *addr,
+		      __be16 port)
+{
+	unsigned int offset;
+	struct ip_vs_dest *dest;
+
+	for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+		dest = rcu_dereference(s->buckets[
+			ip_vs_sh_hashkey(svc->af, addr, port, offset)].dest);
+		if (!is_unavailable(dest))
+			return dest;
+#ifdef CONFIG_IP_VS_DEBUG
+		else if (dest)
+			IP_VS_DBG_BUF(6, "SH: selected unavailable server "
+				"%s:%d (offset %d)",
+				IP_VS_DBG_ADDR(svc->af, &dest->addr),
+				ntohs(dest->port),
+				offset);
+		else
+			IP_VS_DBG(6, "SH: selected null server "
+				"(offset %d)",
+				offset);
+#endif
+	}
+
+	return NULL;
+}
+
+/*
  *      Assign all the hash buckets of the specified table with the service.
  */
 static int
@@ -214,12 +288,34 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
 
 
 /*
- *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
- *      consider that the server is overloaded here.
+ *	Helper function to get port number
  */
-static inline int is_overloaded(struct ip_vs_dest *dest)
+static inline __be16
+ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
 {
-	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+	__be16 port;
+	struct tcphdr _tcph, *th;
+	struct udphdr _udph, *uh;
+	sctp_sctphdr_t _sctph, *sh;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+		port = th->source;
+		break;
+	case IPPROTO_UDP:
+		uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+		port = uh->source;
+		break;
+	case IPPROTO_SCTP:
+		sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+		port = sh->source;
+		break;
+	default:
+		port = 0;
+	}
+
+	return port;
 }
 
 
@@ -232,17 +328,23 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	struct ip_vs_dest *dest;
 	struct ip_vs_sh_state *s;
 	struct ip_vs_iphdr iph;
-
-	ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
+	__be16 port = 0;
 
 	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
 
+	ip_vs_fill_iph_skb(svc->af, skb, &iph);
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+		port = ip_vs_sh_get_port(skb, &iph);
+
 	s = (struct ip_vs_sh_state *) svc->sched_data;
-	dest = ip_vs_sh_get(svc->af, s, &iph.saddr);
-	if (!dest
-	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
-	    || atomic_read(&dest->weight) <= 0
-	    || is_overloaded(dest)) {
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+		dest = ip_vs_sh_get_fallback(svc, s, &iph.saddr, port);
+	else
+		dest = ip_vs_sh_get(svc, s, &iph.saddr, port);
+
+	if (!dest) {
 		ip_vs_scheduler_err(svc, "no destination available");
 		return NULL;
 	}


Alex


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-13 20:31                     ` Julian Anastasov
  2013-06-14 10:22                       ` Alexander Frolkin
@ 2013-06-14 11:47                       ` Alexander Frolkin
  2013-06-16  8:30                         ` Julian Anastasov
  1 sibling, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-14 11:47 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> --sched-flags sh-fallback,sh-port

Updated ipvsadm patch below.  I've kept -b for now: there are no long
options in ipvsadm -S output, and I think it would be nice to keep it
that way.
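
For illustration, a service using both new flags would be created with
something like "ipvsadm -A -t 10.1.2.3:80 -s sh -b sh-fallback,sh-port"
(the address and port are made up for the example).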

diff --git a/ipvsadm.8 b/ipvsadm.8
index 001ae74..9a9e9b3 100644
--- a/ipvsadm.8
+++ b/ipvsadm.8
@@ -37,7 +37,7 @@ ipvsadm \- Linux Virtual Server administration
 .SH SYNOPSIS
 .B ipvsadm -A|E -t|u|f \fIservice-address\fP [-s \fIscheduler\fP]
 .ti 15
-.B [-p [\fItimeout\fP]] [-M \fInetmask\fP]
+.B [-p [\fItimeout\fP]] [-M \fInetmask\fP] [-b \fIsched-flags\fP]
 .br
 .B ipvsadm -D -t|u|f \fIservice-address\fP
 .br
@@ -248,6 +248,9 @@ addresses.
 .sp
 \fBsh\fR - Source Hashing: assigns jobs to servers through looking up
 a statically assigned hash table by their source IP addresses.
+This scheduler has two flags: sh-fallback, which enables fallback to a
+different server if the selected server was unavailable, and sh-port,
+which adds the source port number to the hash computation.
 .sp
 \fBsed\fR - Shortest Expected Delay: assigns an incoming job to the
 server with the shortest expected delay. The expected delay that the
@@ -286,6 +289,11 @@ resolve problems with non-persistent cache clusters on the client side.
 IPv6 netmasks should be specified as a prefix length between 1 and 128.
 The default prefix length is 128.
 .TP
+.B -b, --sched-flags \fIsched-flags\fP
+Set scheduler flags for this virtual server.  \fIsched-flags\fP is a
+comma-separated list of flags.  See the scheduler descriptions for
+valid scheduler flags.
+.TP
 .B -r, --real-server \fIserver-address\fP
 Real server that an associated request for service may be assigned to.
 The \fIserver-address\fP is the \fIhost\fP address of a real server,
diff --git a/ipvsadm.c b/ipvsadm.c
index 0197515..9679cf7 100644
--- a/ipvsadm.c
+++ b/ipvsadm.c
@@ -182,7 +182,8 @@ static const char* cmdnames[] = {
 #define OPT_EXACT		0x100000
 #define OPT_ONEPACKET		0x200000
 #define OPT_PERSISTENCE_ENGINE  0x400000
-#define NUMBER_OF_OPT		23
+#define OPT_SCHED_FLAGS		0x800000
+#define NUMBER_OF_OPT		24
 
 static const char* optnames[] = {
 	"numeric",
@@ -207,7 +208,7 @@ static const char* optnames[] = {
 	"syncid",
 	"exact",
 	"ops",
-	"pe",
+	"pe"
 };
 
 /*
@@ -220,21 +221,21 @@ static const char* optnames[] = {
  */
 static const char commands_v_options[NUMBER_OF_CMD][NUMBER_OF_OPT] =
 {
-	/*   -n   -c   svc  -s   -p   -M   -r   fwd  -w   -x   -y   -mc  tot  dmn  -st  -rt  thr  -pc  srt  sid  -ex  ops  -pe */
-/*ADD*/     {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' '},
-/*EDIT*/    {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' '},
-/*DEL*/     {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*FLUSH*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*LIST*/    {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x'},
-/*ADDSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*DELSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*STARTD*/  {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x'},
-/*STOPD*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x'},
-/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*SAVE*/    {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*ZERO*/    {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+	/*   -n   -c   svc  -s   -p   -M   -r   fwd  -w   -x   -y   -mc  tot  dmn  -st  -rt  thr  -pc  srt  sid  -ex  ops  -pe  -b */
+/*ADD*/     {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' ', ' '},
+/*EDIT*/    {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' ', ' '},
+/*DEL*/     {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*FLUSH*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*LIST*/    {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x', 'x'},
+/*ADDSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*DELSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*STARTD*/  {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x'},
+/*STOPD*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x'},
+/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*SAVE*/    {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*ZERO*/    {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
 };
 
 /* printing format flags */
@@ -285,6 +286,7 @@ enum {
 	TAG_SORT,
 	TAG_NO_SORT,
 	TAG_PERSISTENCE_ENGINE,
+	TAG_SCHED_FLAGS
 };
 
 /* various parsing helpers & parsing functions */
@@ -363,7 +365,7 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 {
 	int c, parse;
 	poptContext context;
-	char *optarg=NULL;
+	char *optarg=NULL, *flag;
 	struct poptOption options_table[] = {
 		{ "add-service", 'A', POPT_ARG_NONE, NULL, 'A', NULL, NULL },
 		{ "edit-service", 'E', POPT_ARG_NONE, NULL, 'E', NULL, NULL },
@@ -426,6 +428,7 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 		{ "ops", 'o', POPT_ARG_NONE, NULL, 'o', NULL, NULL },
 		{ "pe", '\0', POPT_ARG_STRING, &optarg, TAG_PERSISTENCE_ENGINE,
 		  NULL, NULL },
+		{ "sched-flags", 'b', POPT_ARG_STRING, &optarg, 'b', NULL, NULL },
 		{ NULL, 0, 0, NULL, 0, NULL, NULL }
 	};
 
@@ -656,6 +659,25 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 			set_option(options, OPT_PERSISTENCE_ENGINE);
 			strncpy(ce->svc.pe_name, optarg, IP_VS_PENAME_MAXLEN);
 			break;
+		case 'b':
+			set_option(options, OPT_SCHED_FLAGS);
+			ce->svc.flags &= ~(IP_VS_SVC_F_SCHED1 | IP_VS_SVC_F_SCHED2 | IP_VS_SVC_F_SCHED3);
+			flag = strtok(optarg, ",");
+			do {
+				if (!strcmp(flag, "flag-1"))
+					ce->svc.flags |= IP_VS_SVC_F_SCHED1;
+				else if (!strcmp(flag, "flag-2"))
+					ce->svc.flags |= IP_VS_SVC_F_SCHED2;
+				else if (!strcmp(flag, "flag-3"))
+					ce->svc.flags |= IP_VS_SVC_F_SCHED3;
+				else if (!strcmp(flag, "sh-fallback"))
+					ce->svc.flags |= IP_VS_SVC_F_SCHED_SH_FALLBACK;
+				else if (!strcmp(flag, "sh-port"))
+					ce->svc.flags |= IP_VS_SVC_F_SCHED_SH_PORT;
+				else
+					fail(2, "invalid scheduler flag `%s'", flag);
+			} while ((flag = strtok(NULL, ",")) != NULL);
+			break;
 		default:
 			fail(2, "invalid option `%s'",
 			     poptBadOption(context, POPT_BADOPTION_NOALIAS));
@@ -1070,7 +1092,7 @@ static void usage_exit(const char *program, const int exit_status)
 	version(stream);
 	fprintf(stream,
 		"Usage:\n"
-		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine]\n"
+		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine] [-b sched-flags]\n"
 		"  %s -D -t|u|f service-address\n"
 		"  %s -C\n"
 		"  %s -R\n"
@@ -1139,7 +1161,8 @@ static void usage_exit(const char *program, const int exit_status)
 		"  --nosort                            disable sorting output of service/server entries\n"
 		"  --sort                              does nothing, for backwards compatibility\n"
 		"  --ops          -o                   one-packet scheduling\n"
-		"  --numeric      -n                   numeric output of addresses and ports\n",
+		"  --numeric      -n                   numeric output of addresses and ports\n"
+		"  --sched-flags  -b flags             scheduler flags (comma-separated)\n",
 		DEF_SCHED);
 
 	exit(exit_status);
@@ -1396,6 +1419,27 @@ static void print_largenum(unsigned long long i, unsigned int format)
 		printf("%8lluT", i / 1000000000000ULL);
 }
 
+static void print_sched_flags(ipvs_service_entry_t *se) {
+	char flags[64] = "";
+
+	if (!strcmp(se->sched_name, "sh")) {
+		if (se->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+			strcat(flags, "sh-fallback,");
+		if (se->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+			strcat(flags, "sh-port,");
+	} else {
+		if (se->flags & IP_VS_SVC_F_SCHED1)
+			strcat(flags, "flag-1,");
+		if (se->flags & IP_VS_SVC_F_SCHED2)
+			strcat(flags, "flag-2,");
+		if (se->flags & IP_VS_SVC_F_SCHED3)
+			strcat(flags, "flag-3,");
+	}
+
+	flags[strlen(flags)-1] = '\0';
+
+	printf("%s", flags);
+}
 
 static void print_title(unsigned int format)
 {
@@ -1488,6 +1532,10 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format)
 			printf(" pe %s", se->pe_name);
 		if (se->flags & IP_VS_SVC_F_ONEPACKET)
 			printf(" -o");
+		if (se->flags & (IP_VS_SVC_F_SCHED1 | IP_VS_SVC_F_SCHED2 | IP_VS_SVC_F_SCHED3)) {
+			printf(" -b ");
+			print_sched_flags(se);
+		}
 	} else if (format & FMT_STATS) {
 		printf("%-33s", svc_name);
 		print_largenum(se->stats.conns, format);
@@ -1504,6 +1552,11 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format)
 		print_largenum(se->stats.outbps, format);
 	} else {
 		printf("%s %s", svc_name, se->sched_name);
+		if (se->flags & (IP_VS_SVC_F_SCHED1 | IP_VS_SVC_F_SCHED2 | IP_VS_SVC_F_SCHED3)) {
+			printf(" (");
+			print_sched_flags(se);
+			printf(")");
+		}
 		if (se->flags & IP_VS_SVC_F_PERSISTENT) {
 			printf(" persistent %u", se->timeout);
 			if (se->af == AF_INET)
diff --git a/libipvs/ip_vs.h b/libipvs/ip_vs.h
index 5e1d544..4db14ff 100644
--- a/libipvs/ip_vs.h
+++ b/libipvs/ip_vs.h
@@ -29,6 +29,13 @@
 #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
 #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
+#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
+#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
+#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
+
+#define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
+#define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
+
 
 /*
  *      IPVS sync daemon states


Alex


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-14 10:22                       ` Alexander Frolkin
@ 2013-06-16  6:52                         ` Julian Anastasov
  2013-06-17  8:32                           ` Alexander Frolkin
  0 siblings, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-06-16  6:52 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Fri, 14 Jun 2013, Alexander Frolkin wrote:

> I've done some refactoring to simplify ip_vs_sh_schedule.  Let me know
> what you think.

> diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
> index 0df269d..4bb9636 100644
> --- a/net/netfilter/ipvs/ip_vs_sh.c
> +++ b/net/netfilter/ipvs/ip_vs_sh.c
> @@ -48,6 +48,10 @@
>  
>  #include <net/ip_vs.h>
>  
> +#include <net/tcp.h>
> +#include <linux/udp.h>
> +#include <linux/sctp.h>
> +
>  
>  /*
>   *      IPVS SH bucket
> @@ -71,10 +75,37 @@ struct ip_vs_sh_state {
>  	struct rcu_head			rcu_head;
>  };
>  
> +
> +/*
> + *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
> + *      consider that the server is overloaded here.
> + */

	There is already a requirement that multiline
comments in net/ be in this format:

/* First line
 * ...
 * last line
 */

	Do it for all comments that you add in this patch.

> +static inline int is_overloaded(struct ip_vs_dest *dest)

	'bool' should work even here.

> +{
> +	return dest->flags & IP_VS_DEST_F_OVERLOAD;
> +}
> +

	Only one empty line between functions.

> +
> +/*
> + *	Helper function to determine if server is unavailable
> + */
> +static inline int

	bool

> +is_unavailable(struct ip_vs_dest *dest)

	This is preferred (args on same line):

static inline bool is_unavailable(struct ip_vs_dest *dest)

> +{
> +	return (!dest ||
> +		atomic_read(&dest->weight) <= 0 ||
> +		is_overloaded(dest));

	and without outer ().

> +}
> +
> +
>  /*
>   *	Returns hash value for IPVS SH entry
>   */
> -static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
> +static inline unsigned int
> +ip_vs_sh_hashkey(int af,
> +		 const union nf_inet_addr *addr,
> +		 __be16 port,
> +		 unsigned int offset)

	Maybe you can put more args on the same line.

>  {
>  	__be32 addr_fold = addr->ip;
>  
> @@ -83,7 +114,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
>  		addr_fold = addr->ip6[0]^addr->ip6[1]^
>  			    addr->ip6[2]^addr->ip6[3];
>  #endif
> -	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
> +	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
> +		IP_VS_SH_TAB_MASK;
>  }
>  
>  
> @@ -91,13 +123,55 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
>   *      Get ip_vs_dest associated with supplied parameters.
>   */
>  static inline struct ip_vs_dest *
> -ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
> +ip_vs_sh_get(struct ip_vs_service *svc,
> +	     struct ip_vs_sh_state *s,
> +	     const union nf_inet_addr *addr,

	More args on same line.

> +	     __be16 port)
>  {
> -	return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
> +	struct ip_vs_dest *dest;

	unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
	struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);

> +
> +	dest = rcu_dereference(
> +		s->buckets[ip_vs_sh_hashkey(svc->af, addr, port, 0)].dest);
> +
> +	return is_unavailable(dest) ? NULL : dest;
>  }
>  
>  
>  /*
> + *	As ip_vs_sh_get, but with fallback if selected server is unavailable
> + */
> +static inline struct ip_vs_dest *
> +ip_vs_sh_get_fallback(struct ip_vs_service *svc,

	More args on same line.

> +		      struct ip_vs_sh_state *s,
> +		      const union nf_inet_addr *addr,
> +		      __be16 port)
> +{
> +	unsigned int offset;
> +	struct ip_vs_dest *dest;
> +
> +	for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {

		Can we use:

		unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port,
						     offset);

		dest = rcu_dereference(s->buckets[hash].dest);

> +		dest = rcu_dereference(s->buckets[
> +			ip_vs_sh_hashkey(svc->af, addr, port, offset)].dest);
> +		if (!is_unavailable(dest))
> +			return dest;
> +#ifdef CONFIG_IP_VS_DEBUG

	Remove the CONFIG_IP_VS_DEBUG because we will add 'break'.

> +		else if (dest)
> +			IP_VS_DBG_BUF(6, "SH: selected unavailable server "
> +				"%s:%d (offset %d)",
> +				IP_VS_DBG_ADDR(svc->af, &dest->addr),
> +				ntohs(dest->port),
> +				offset);

	All args to IP_VS_DBG_BUF should be aligned to the same column.

> +		else
> +			IP_VS_DBG(6, "SH: selected null server "
> +				"(offset %d)",
> +				offset);

	Here too, also: "(offset %d)", offset);

	Add 'break' for this case when dest is NULL (no
dests). Then we have to add {} for all branches as per
CodingStyle:

	if (!is_unavailable(dest))
		return dest;
	if (dest) {
		IP_VS_DBG_BUF
	} else {
		IP_VS_DBG
		break;
	}

> +#endif
> +	}
> +
> +	return NULL;
> +}
> +
> +/*
>   *      Assign all the hash buckets of the specified table with the service.
>   */
>  static int
> @@ -214,12 +288,34 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
>  
>  
>  /*
> - *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
> - *      consider that the server is overloaded here.
> + *	Helper function to get port number
>   */
> -static inline int is_overloaded(struct ip_vs_dest *dest)
> +static inline __be16
> +ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
>  {
> -	return dest->flags & IP_VS_DEST_F_OVERLOAD;
> +	__be16 port;
> +	struct tcphdr _tcph, *th;
> +	struct udphdr _udph, *uh;
> +	sctp_sctphdr_t _sctph, *sh;
> +
> +	switch (iph->protocol) {
> +	case IPPROTO_TCP:
> +		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
> +		port = th->source;
> +		break;
> +	case IPPROTO_UDP:
> +		uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
> +		port = uh->source;
> +		break;
> +	case IPPROTO_SCTP:
> +		sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
> +		port = sh->source;
> +		break;
> +	default:
> +		port = 0;
> +	}
> +
> +	return port;
>  }
>  
>  
> @@ -232,17 +328,23 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
>  	struct ip_vs_dest *dest;
>  	struct ip_vs_sh_state *s;
>  	struct ip_vs_iphdr iph;

	I just posted the patch that provides iph as an argument.

> -
> -	ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
> +	__be16 port = 0;
>  
>  	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
>  
> +	ip_vs_fill_iph_skb(svc->af, skb, &iph);
> +
> +	if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
> +		port = ip_vs_sh_get_port(skb, &iph);
> +
>  	s = (struct ip_vs_sh_state *) svc->sched_data;
> -	dest = ip_vs_sh_get(svc->af, s, &iph.saddr);
> -	if (!dest
> -	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
> -	    || atomic_read(&dest->weight) <= 0
> -	    || is_overloaded(dest)) {
> +
> +	if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
> +		dest = ip_vs_sh_get_fallback(svc, s, &iph.saddr, port);
> +	else
> +		dest = ip_vs_sh_get(svc, s, &iph.saddr, port);
> +
> +	if (!dest) {
>  		ip_vs_scheduler_err(svc, "no destination available");
>  		return NULL;
>  	}
> 
> 
> Alex

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-14 11:47                       ` Alexander Frolkin
@ 2013-06-16  8:30                         ` Julian Anastasov
  2013-06-17 10:35                           ` Alexander Frolkin
  0 siblings, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-06-16  8:30 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Fri, 14 Jun 2013, Alexander Frolkin wrote:

> Hi,
> 
> > --sched-flags sh-fallback,sh-port
> 
> Updated ipvsadm patch below.  I've kept -b for now: there are no long
> options in ipvsadm -S output, and I think it would be nice to keep it
> that way.

	OK, if others do not come up with another idea.

> diff --git a/ipvsadm.c b/ipvsadm.c
> index 0197515..9679cf7 100644
> --- a/ipvsadm.c
> +++ b/ipvsadm.c
> @@ -182,7 +182,8 @@ static const char* cmdnames[] = {
>  #define OPT_EXACT		0x100000
>  #define OPT_ONEPACKET		0x200000
>  #define OPT_PERSISTENCE_ENGINE  0x400000
> -#define NUMBER_OF_OPT		23
> +#define OPT_SCHED_FLAGS		0x800000
> +#define NUMBER_OF_OPT		24
>  
>  static const char* optnames[] = {
>  	"numeric",
> @@ -207,7 +208,7 @@ static const char* optnames[] = {
>  	"syncid",
>  	"exact",
>  	"ops",
> -	"pe",
> +	"pe"

	sched-flags here?

>  };
>  
>  /*
> @@ -220,21 +221,21 @@ static const char* optnames[] = {
>   */
>  static const char commands_v_options[NUMBER_OF_CMD][NUMBER_OF_OPT] =
>  {
> -	/*   -n   -c   svc  -s   -p   -M   -r   fwd  -w   -x   -y   -mc  tot  dmn  -st  -rt  thr  -pc  srt  sid  -ex  ops  -pe */
> -/*ADD*/     {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' '},
> -/*EDIT*/    {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' '},
> -/*DEL*/     {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*FLUSH*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*LIST*/    {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x'},
> -/*ADDSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*DELSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*STARTD*/  {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x'},
> -/*STOPD*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x'},
> -/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*SAVE*/    {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*ZERO*/    {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +	/*   -n   -c   svc  -s   -p   -M   -r   fwd  -w   -x   -y   -mc  tot  dmn  -st  -rt  thr  -pc  srt  sid  -ex  ops  -pe  -b */
> +/*ADD*/     {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' ', ' '},
> +/*EDIT*/    {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' ', ' '},
> +/*DEL*/     {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*FLUSH*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*LIST*/    {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x', 'x'},
> +/*ADDSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*DELSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*STARTD*/  {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x'},
> +/*STOPD*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x'},
> +/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*SAVE*/    {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*ZERO*/    {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
>  };
>  
>  /* printing format flags */
> @@ -285,6 +286,7 @@ enum {
>  	TAG_SORT,
>  	TAG_NO_SORT,
>  	TAG_PERSISTENCE_ENGINE,
> +	TAG_SCHED_FLAGS

	TAG_SCHED_FLAGS is not needed with 'b'

>  };
>  
>  /* various parsing helpers & parsing functions */
> @@ -363,7 +365,7 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
>  {
>  	int c, parse;
>  	poptContext context;
> -	char *optarg=NULL;
> +	char *optarg=NULL, *flag;
>  	struct poptOption options_table[] = {
>  		{ "add-service", 'A', POPT_ARG_NONE, NULL, 'A', NULL, NULL },
>  		{ "edit-service", 'E', POPT_ARG_NONE, NULL, 'E', NULL, NULL },
> @@ -426,6 +428,7 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
>  		{ "ops", 'o', POPT_ARG_NONE, NULL, 'o', NULL, NULL },
>  		{ "pe", '\0', POPT_ARG_STRING, &optarg, TAG_PERSISTENCE_ENGINE,
>  		  NULL, NULL },
> +		{ "sched-flags", 'b', POPT_ARG_STRING, &optarg, 'b', NULL, NULL },
>  		{ NULL, 0, 0, NULL, 0, NULL, NULL }
>  	};
>  
> @@ -656,6 +659,25 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
>  			set_option(options, OPT_PERSISTENCE_ENGINE);
>  			strncpy(ce->svc.pe_name, optarg, IP_VS_PENAME_MAXLEN);
>  			break;
> +		case 'b':
> +			set_option(options, OPT_SCHED_FLAGS);
> +			ce->svc.flags &= ~(IP_VS_SVC_F_SCHED1 | IP_VS_SVC_F_SCHED2 | IP_VS_SVC_F_SCHED3);
> +			flag = strtok(optarg, ",");

	Moving the following parsing into a separate
function parse_sched_flags(sched, optarg) will look better:

	ce->svc.flags |= parse_sched_flags(ce->svc.sched_name, optarg);

	This way we can restrict sh-fallback and sh-port
to the "sh" scheduler. It seems ce->svc.sched_name is
empty initially (DEF_SCHED).

	For example:

> +			do {

	char *need = NULL;

> +				if (!strcmp(flag, "flag-1"))
> +					ce->svc.flags |= IP_VS_SVC_F_SCHED1;
> +				else if (!strcmp(flag, "flag-2"))
> +					ce->svc.flags |= IP_VS_SVC_F_SCHED2;
> +				else if (!strcmp(flag, "flag-3"))
> +					ce->svc.flags |= IP_VS_SVC_F_SCHED3;
> +				else if (!strcmp(flag, "sh-fallback"))
> +					ce->svc.flags |= IP_VS_SVC_F_SCHED_SH_FALLBACK;

	need = "sh";


> +				else if (!strcmp(flag, "sh-port"))
> +					ce->svc.flags |= IP_VS_SVC_F_SCHED_SH_PORT;

	need = "sh";

> +				else
> +					fail(2, "invalid scheduler flag `%s'", flag);

	if (need && strcmp((sched && *sched) ? sched : DEF_SCHED, need))
		fail(2, "Incompatible scheduler flag `%s', flag);

> +			} while ((flag = strtok(NULL, ",")) != NULL);
> +			break;
>  		default:
>  			fail(2, "invalid option `%s'",
>  			     poptBadOption(context, POPT_BADOPTION_NOALIAS));
> @@ -1070,7 +1092,7 @@ static void usage_exit(const char *program, const int exit_status)
>  	version(stream);
>  	fprintf(stream,
>  		"Usage:\n"
> -		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine]\n"
> +		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine] [-b sched-flags]\n"
>  		"  %s -D -t|u|f service-address\n"
>  		"  %s -C\n"
>  		"  %s -R\n"
> @@ -1139,7 +1161,8 @@ static void usage_exit(const char *program, const int exit_status)
>  		"  --nosort                            disable sorting output of service/server entries\n"
>  		"  --sort                              does nothing, for backwards compatibility\n"
>  		"  --ops          -o                   one-packet scheduling\n"
> -		"  --numeric      -n                   numeric output of addresses and ports\n",
> +		"  --numeric      -n                   numeric output of addresses and ports\n"
> +		"  --sched-flags  -b flags             scheduler flags (comma-separated)\n",
>  		DEF_SCHED);
>  
>  	exit(exit_status);
> @@ -1396,6 +1419,27 @@ static void print_largenum(unsigned long long i, unsigned int format)
>  		printf("%8lluT", i / 1000000000000ULL);
>  }
>  
> +static void print_sched_flags(ipvs_service_entry_t *se) {
> +	char flags[64] = "";
> +

	char flags[64];

	flags[0] = 0;

> +	if (!strcmp(se->sched_name, "sh")) {
> +		if (se->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
> +			strcat(flags, "sh-fallback,");
> +		if (se->flags & IP_VS_SVC_F_SCHED_SH_PORT)
> +			strcat(flags, "sh-port,");
> +	} else {
> +		if (se->flags & IP_VS_SVC_F_SCHED1)
> +			strcat(flags, "flag-1,");
> +		if (se->flags & IP_VS_SVC_F_SCHED2)
> +			strcat(flags, "flag-2,");
> +		if (se->flags & IP_VS_SVC_F_SCHED3)
> +			strcat(flags, "flag-3,");
> +	}
> +

	This should be safer:

	if (flags[0]) {

> +	flags[strlen(flags)-1] = '\0';
> +
> +	printf("%s", flags);

	}

	but now print_sched_flags() is guarded,
so it is not a big deal.

> +}
>  
>  static void print_title(unsigned int format)
>  {
> @@ -1488,6 +1532,10 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format)
>  			printf(" pe %s", se->pe_name);
>  		if (se->flags & IP_VS_SVC_F_ONEPACKET)
>  			printf(" -o");
> +		if (se->flags & (IP_VS_SVC_F_SCHED1 | IP_VS_SVC_F_SCHED2 | IP_VS_SVC_F_SCHED3)) {
> +			printf(" -b ");
> +			print_sched_flags(se);
> +		}
>  	} else if (format & FMT_STATS) {
>  		printf("%-33s", svc_name);
>  		print_largenum(se->stats.conns, format);
> @@ -1504,6 +1552,11 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format)
>  		print_largenum(se->stats.outbps, format);
>  	} else {
>  		printf("%s %s", svc_name, se->sched_name);
> +		if (se->flags & (IP_VS_SVC_F_SCHED1 | IP_VS_SVC_F_SCHED2 | IP_VS_SVC_F_SCHED3)) {
> +			printf(" (");
> +			print_sched_flags(se);
> +			printf(")");
> +		}
>  		if (se->flags & IP_VS_SVC_F_PERSISTENT) {
>  			printf(" persistent %u", se->timeout);
>  			if (se->af == AF_INET)
> diff --git a/libipvs/ip_vs.h b/libipvs/ip_vs.h
> index 5e1d544..4db14ff 100644
> --- a/libipvs/ip_vs.h
> +++ b/libipvs/ip_vs.h
> @@ -29,6 +29,13 @@
>  #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
>  #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
>  #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
> +#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
> +#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
> +#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
> +
> +#define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
> +#define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
> +
>  
>  /*
>   *      IPVS sync daemon states
> 
> 
> Alex

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-16  6:52                         ` Julian Anastasov
@ 2013-06-17  8:32                           ` Alexander Frolkin
  2013-06-17  9:00                             ` Julian Anastasov
  2013-06-17  9:04                             ` Julian Anastasov
  0 siblings, 2 replies; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-17  8:32 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> 	Add 'break' for this case when dest is NULL (no
> dests). Then we have to add {} for all branches as per
> CodingStyle:
> 
> 	if (!is_unavailable(dest))
> 		return dest;
> 	if (dest) {
> 		IP_VS_DBG_BUF
> 	} else {
> 		IP_VS_DBG
> 		break;
> 	}

So if dest is NULL, there is no point trying to choose a different
server?


Alex


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-17  8:32                           ` Alexander Frolkin
@ 2013-06-17  9:00                             ` Julian Anastasov
  2013-06-17  9:04                             ` Julian Anastasov
  1 sibling, 0 replies; 52+ messages in thread
From: Julian Anastasov @ 2013-06-17  9:00 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Mon, 17 Jun 2013, Alexander Frolkin wrote:

> > 	Add 'break' for this case when dest is NULL (no
> > dests). Then we have to add {} for all branches as per
> > CodingStyle:
> > 
> > 	if (!is_unavailable(dest))
> > 		return dest;
> > 	if (dest) {
> > 		IP_VS_DBG_BUF
> > 	} else {
> > 		IP_VS_DBG
> > 		break;
> > 	}
> 
> So if dest is NULL, there is no point trying to choose a different
> server?

	Yes, you will find 256 NULLs there: the buckets are all
assigned together, so if one bucket is NULL the service has no
dests and all IP_VS_SH_TAB_SIZE entries are NULL.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-17  8:32                           ` Alexander Frolkin
  2013-06-17  9:00                             ` Julian Anastasov
@ 2013-06-17  9:04                             ` Julian Anastasov
  2013-06-17 11:11                               ` Alexander Frolkin
  1 sibling, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-06-17  9:04 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Mon, 17 Jun 2013, Alexander Frolkin wrote:

> Hi,
> 
> > 	Add 'break' for this case when dest is NULL (no
> > dests). Then we have to add {} for all branches as per
> > CodingStyle:
> > 
> > 	if (!is_unavailable(dest))
> > 		return dest;
> > 	if (dest) {
> > 		IP_VS_DBG_BUF
> > 	} else {
> > 		IP_VS_DBG
> > 		break;
> > 	}
> 
> So if dest is NULL, there is no point trying to choose a different
> server?

	Wait, it seems is_unavailable() can succeed for other
reasons, so the 'if (!dest) ... break;' check should be first,
maybe out of is_unavailable().
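
	I.e., roughly (untested sketch, only to illustrate the
ordering, reusing the existing helpers):

	for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
		hash = ip_vs_sh_hashkey(svc->af, addr, port, offset);
		dest = rcu_dereference(s->buckets[hash].dest);
		if (!dest)
			break;
		if (!is_unavailable(dest))
			return dest;
		IP_VS_DBG_BUF(6, "SH: selected unavailable server "
			      "%s:%d (offset %d)",
			      IP_VS_DBG_ADDR(svc->af, &dest->addr),
			      ntohs(dest->port), offset);
	}
	return NULL;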

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-16  8:30                         ` Julian Anastasov
@ 2013-06-17 10:35                           ` Alexander Frolkin
  2013-06-17 19:48                             ` Julian Anastasov
  0 siblings, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-17 10:35 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> 	Moving the following parsing into a separate
> function parse_sched_flags(sched, optarg) will look better:
> 
> 	ce->svc.flags |= parse_sched_flags(ce->svc.sched_name, optarg);
> 
> 	This way we can restrict sh-fallback and sh-port
> to the "sh" scheduler. It seems ce->svc.sched_name is
> empty initially (DEF_SCHED).

I've moved the code to parse the scheduler flags to the end of the
function, when the scheduler is known:

diff --git a/ipvsadm.8 b/ipvsadm.8
index 001ae74..9a9e9b3 100644
--- a/ipvsadm.8
+++ b/ipvsadm.8
@@ -37,7 +37,7 @@ ipvsadm \- Linux Virtual Server administration
 .SH SYNOPSIS
 .B ipvsadm -A|E -t|u|f \fIservice-address\fP [-s \fIscheduler\fP]
 .ti 15
-.B [-p [\fItimeout\fP]] [-M \fInetmask\fP]
+.B [-p [\fItimeout\fP]] [-M \fInetmask\fP] [-b \fIsched-flags\fP]
 .br
 .B ipvsadm -D -t|u|f \fIservice-address\fP
 .br
@@ -248,6 +248,9 @@ addresses.
 .sp
 \fBsh\fR - Source Hashing: assigns jobs to servers through looking up
 a statically assigned hash table by their source IP addresses.
+This scheduler has two flags: sh-fallback, which enables fallback to a
+different server if the selected server was unavailable, and sh-port,
+which adds the source port number to the hash computation.
 .sp
 \fBsed\fR - Shortest Expected Delay: assigns an incoming job to the
 server with the shortest expected delay. The expected delay that the
@@ -286,6 +289,11 @@ resolve problems with non-persistent cache clusters on the client side.
 IPv6 netmasks should be specified as a prefix length between 1 and 128.
 The default prefix length is 128.
 .TP
+.B -b, --sched-flags \fIsched-flags\fP
+Set scheduler flags for this virtual server.  \fIsched-flags\fP is a
+comma-separated list of flags.  See the scheduler descriptions for
+valid scheduler flags.
+.TP
 .B -r, --real-server \fIserver-address\fP
 Real server that an associated request for service may be assigned to.
 The \fIserver-address\fP is the \fIhost\fP address of a real server,
diff --git a/ipvsadm.c b/ipvsadm.c
index 0197515..968c3b7 100644
--- a/ipvsadm.c
+++ b/ipvsadm.c
@@ -182,7 +182,8 @@ static const char* cmdnames[] = {
 #define OPT_EXACT		0x100000
 #define OPT_ONEPACKET		0x200000
 #define OPT_PERSISTENCE_ENGINE  0x400000
-#define NUMBER_OF_OPT		23
+#define OPT_SCHED_FLAGS		0x800000
+#define NUMBER_OF_OPT		24
 
 static const char* optnames[] = {
 	"numeric",
@@ -208,6 +209,7 @@ static const char* optnames[] = {
 	"exact",
 	"ops",
 	"pe",
+	"sched-flags",
 };
 
 /*
@@ -220,21 +222,21 @@ static const char* optnames[] = {
  */
 static const char commands_v_options[NUMBER_OF_CMD][NUMBER_OF_OPT] =
 {
-	/*   -n   -c   svc  -s   -p   -M   -r   fwd  -w   -x   -y   -mc  tot  dmn  -st  -rt  thr  -pc  srt  sid  -ex  ops  -pe */
-/*ADD*/     {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' '},
-/*EDIT*/    {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' '},
-/*DEL*/     {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*FLUSH*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*LIST*/    {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x'},
-/*ADDSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*DELSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*STARTD*/  {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x'},
-/*STOPD*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x'},
-/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*SAVE*/    {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*ZERO*/    {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+	/*   -n   -c   svc  -s   -p   -M   -r   fwd  -w   -x   -y   -mc  tot  dmn  -st  -rt  thr  -pc  srt  sid  -ex  ops  -pe  -b */
+/*ADD*/     {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' ', ' '},
+/*EDIT*/    {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' ', ' '},
+/*DEL*/     {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*FLUSH*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*LIST*/    {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x', 'x'},
+/*ADDSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*DELSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*STARTD*/  {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x'},
+/*STOPD*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x'},
+/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*SAVE*/    {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*ZERO*/    {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
 };
 
 /* printing format flags */
@@ -302,6 +304,7 @@ static int parse_service(char *buf, ipvs_service_t *svc);
 static int parse_netmask(char *buf, u_int32_t *addr);
 static int parse_timeout(char *buf, int min, int max);
 static unsigned int parse_fwmark(char *buf);
+static unsigned int parse_sched_flags(const char *sched, char *optarg);
 
 /* check the options based on the commands_v_options table */
 static void generic_opt_check(int command, int options);
@@ -363,7 +366,7 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 {
 	int c, parse;
 	poptContext context;
-	char *optarg=NULL;
+	char *optarg=NULL, sched_flags_arg[128];
 	struct poptOption options_table[] = {
 		{ "add-service", 'A', POPT_ARG_NONE, NULL, 'A', NULL, NULL },
 		{ "edit-service", 'E', POPT_ARG_NONE, NULL, 'E', NULL, NULL },
@@ -426,9 +429,12 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 		{ "ops", 'o', POPT_ARG_NONE, NULL, 'o', NULL, NULL },
 		{ "pe", '\0', POPT_ARG_STRING, &optarg, TAG_PERSISTENCE_ENGINE,
 		  NULL, NULL },
+		{ "sched-flags", 'b', POPT_ARG_STRING, &optarg, 'b', NULL, NULL },
 		{ NULL, 0, 0, NULL, 0, NULL, NULL }
 	};
 
+	sched_flags_arg[0] = '\0';
+
 	context = poptGetContext("ipvsadm", argc, (const char **)argv,
 				 options_table, 0);
 
@@ -656,6 +662,10 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 			set_option(options, OPT_PERSISTENCE_ENGINE);
 			strncpy(ce->svc.pe_name, optarg, IP_VS_PENAME_MAXLEN);
 			break;
+		case 'b':
+			set_option(options, OPT_SCHED_FLAGS);
+			strncpy(sched_flags_arg, optarg, sizeof(sched_flags_arg));
+			break;
 		default:
 			fail(2, "invalid option `%s'",
 			     poptBadOption(context, POPT_BADOPTION_NOALIAS));
@@ -690,6 +700,14 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 	if ((optarg=(char *)poptGetArg(context)))
 		fail(2, "unexpected argument %s", optarg);
 
+	if (sched_flags_arg[0]) {
+		ce->svc.flags &= ~(IP_VS_SVC_F_SCHED1 |
+				   IP_VS_SVC_F_SCHED2 |
+				   IP_VS_SVC_F_SCHED3);
+		ce->svc.flags |= parse_sched_flags(ce->svc.sched_name,
+						   sched_flags_arg);
+	}
+
 	poptFreeContext(context);
 
 	return 0;
@@ -989,6 +1007,37 @@ parse_service(char *buf, ipvs_service_t *svc)
 	return result;
 }
 
+static unsigned int parse_sched_flags(const char *sched, char *optarg)
+{
+	unsigned int flags = 0;
+	char *flag;
+	const char *scheduler = (sched && *sched) ? sched : DEF_SCHED;
+
+	flag = strtok(optarg, ",");
+	do {
+		if (!strcmp(flag, "flag-1")) {
+			flags |= IP_VS_SVC_F_SCHED1;
+		} else if (!strcmp(flag, "flag-2")) {
+			flags |= IP_VS_SVC_F_SCHED2;
+		} else if (!strcmp(flag, "flag-3")) {
+			flags |= IP_VS_SVC_F_SCHED3;
+		} else if (!strcmp(flag, "sh-fallback")) {
+			flags |= IP_VS_SVC_F_SCHED_SH_FALLBACK;
+			if (strcmp(scheduler, "sh"))
+				fail(2, "incompatible scheduler flag `%s'",
+				     flag);
+		} else if (!strcmp(flag, "sh-port")) {
+			flags |= IP_VS_SVC_F_SCHED_SH_PORT;
+			if (strcmp(scheduler, "sh"))
+				fail(2, "incompatible scheduler flag `%s'",
+				     flag);
+		} else {
+			fail(2, "invalid scheduler flag `%s'", flag);
+		}
+	} while ((flag = strtok(NULL, ",")) != NULL);
+
+	return flags;
+}
 
 static void
 generic_opt_check(int command, int options)
@@ -1070,7 +1119,7 @@ static void usage_exit(const char *program, const int exit_status)
 	version(stream);
 	fprintf(stream,
 		"Usage:\n"
-		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine]\n"
+		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine] [-b sched-flags]\n"
 		"  %s -D -t|u|f service-address\n"
 		"  %s -C\n"
 		"  %s -R\n"
@@ -1139,7 +1188,8 @@ static void usage_exit(const char *program, const int exit_status)
 		"  --nosort                            disable sorting output of service/server entries\n"
 		"  --sort                              does nothing, for backwards compatibility\n"
 		"  --ops          -o                   one-packet scheduling\n"
-		"  --numeric      -n                   numeric output of addresses and ports\n",
+		"  --numeric      -n                   numeric output of addresses and ports\n"
+		"  --sched-flags  -b flags             scheduler flags (comma-separated)\n",
 		DEF_SCHED);
 
 	exit(exit_status);
@@ -1396,6 +1446,32 @@ static void print_largenum(unsigned long long i, unsigned int format)
 		printf("%8lluT", i / 1000000000000ULL);
 }
 
+static void print_sched_flags(ipvs_service_entry_t *se) {
+	char flags[64];
+
+	flags[0] = '\0';
+
+	if (!strcmp(se->sched_name, "sh")) {
+		if (se->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+			strcat(flags, "sh-fallback,");
+		if (se->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+			strcat(flags, "sh-port,");
+		if (se->flags & IP_VS_SVC_F_SCHED3)
+			strcat(flags, "flag-3,");
+	} else {
+		if (se->flags & IP_VS_SVC_F_SCHED1)
+			strcat(flags, "flag-1,");
+		if (se->flags & IP_VS_SVC_F_SCHED2)
+			strcat(flags, "flag-2,");
+		if (se->flags & IP_VS_SVC_F_SCHED3)
+			strcat(flags, "flag-3,");
+	}
+
+	if (flags[0]) {
+		flags[strlen(flags)-1] = '\0';
+		printf("%s", flags);
+	}
+}
 
 static void print_title(unsigned int format)
 {
@@ -1488,6 +1564,12 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format)
 			printf(" pe %s", se->pe_name);
 		if (se->flags & IP_VS_SVC_F_ONEPACKET)
 			printf(" -o");
+		if (se->flags & (IP_VS_SVC_F_SCHED1 |
+				 IP_VS_SVC_F_SCHED2 |
+				 IP_VS_SVC_F_SCHED3)) {
+			printf(" -b ");
+			print_sched_flags(se);
+		}
 	} else if (format & FMT_STATS) {
 		printf("%-33s", svc_name);
 		print_largenum(se->stats.conns, format);
@@ -1504,6 +1586,13 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format)
 		print_largenum(se->stats.outbps, format);
 	} else {
 		printf("%s %s", svc_name, se->sched_name);
+		if (se->flags & (IP_VS_SVC_F_SCHED1 |
+				 IP_VS_SVC_F_SCHED2 |
+				 IP_VS_SVC_F_SCHED3)) {
+			printf(" (");
+			print_sched_flags(se);
+			printf(")");
+		}
 		if (se->flags & IP_VS_SVC_F_PERSISTENT) {
 			printf(" persistent %u", se->timeout);
 			if (se->af == AF_INET)
diff --git a/libipvs/ip_vs.h b/libipvs/ip_vs.h
index 5e1d544..4db14ff 100644
--- a/libipvs/ip_vs.h
+++ b/libipvs/ip_vs.h
@@ -29,6 +29,13 @@
 #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
 #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
+#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
+#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
+#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
+
+#define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
+#define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
+
 
 /*
  *      IPVS sync daemon states


Alex


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-17  9:04                             ` Julian Anastasov
@ 2013-06-17 11:11                               ` Alexander Frolkin
  2013-06-17 20:05                                 ` Julian Anastasov
  0 siblings, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-17 11:11 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

> 	Wait, it seems is_unavailable() can succeed for other
> reasons, so the 'if (!dest) ... break;' check should be first,
> maybe out of is_unavailable().

I've moved things around a bit:

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index a245377..2945822 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -20,6 +20,12 @@
 #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
 #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
+#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
+#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
+#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
+
+#define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
+#define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
 
 /*
  *      Destination Server Flags
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index e0130f8..2f52129 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -48,6 +48,10 @@
 
 #include <net/ip_vs.h>
 
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
 
 /*
  *      IPVS SH bucket
@@ -71,10 +75,20 @@ struct ip_vs_sh_state {
 	struct rcu_head			rcu_head;
 };
 
+/* Helper function to determine if server is unavailable
+ */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+	return atomic_read(&dest->weight) <= 0 ||
+	       dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
 /*
  *	Returns hash value for IPVS SH entry
  */
-static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned int
+ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
+		 __be16 port, unsigned int offset)
 {
 	__be32 addr_fold = addr->ip;
 
@@ -83,7 +97,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
+	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+		IP_VS_SH_TAB_MASK;
 }
 
 
@@ -91,12 +106,43 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
  *      Get ip_vs_dest associated with supplied parameters.
  */
 static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
+ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+	     const union nf_inet_addr *addr, __be16 port)
 {
-	return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
+	unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+	struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
+
+	return (!dest || is_unavailable(dest)) ? NULL : dest;
 }
 
 
+/* As ip_vs_sh_get, but with fallback if selected server is unavailable
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+		      const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int offset;
+	unsigned int hash;
+	struct ip_vs_dest *dest;
+
+	for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+		hash = ip_vs_sh_hashkey(svc->af, addr, port, offset);
+		dest = rcu_dereference(s->buckets[hash].dest);
+		if (!dest)
+			break;
+		else if (is_unavailable(dest))
+			IP_VS_DBG_BUF(6, "SH: selected unavailable server "
+				      "%s:%d (offset %d)",
+				      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+				      ntohs(dest->port), offset);
+		else
+			return dest;
+	}
+
+	return NULL;
+}
+
 /*
  *      Assign all the hash buckets of the specified table with the service.
  */
@@ -213,13 +259,34 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
 }
 
 
-/*
- *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
- *      consider that the server is overloaded here.
+/* Helper function to get port number
  */
-static inline int is_overloaded(struct ip_vs_dest *dest)
+static inline __be16
+ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
 {
-	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+	__be16 port;
+	struct tcphdr _tcph, *th;
+	struct udphdr _udph, *uh;
+	sctp_sctphdr_t _sctph, *sh;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+		port = th->source;
+		break;
+	case IPPROTO_UDP:
+		uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+		port = uh->source;
+		break;
+	case IPPROTO_SCTP:
+		sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+		port = sh->source;
+		break;
+	default:
+		port = 0;
+	}
+
+	return port;
 }
 
 
@@ -232,15 +299,21 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
 {
 	struct ip_vs_dest *dest;
 	struct ip_vs_sh_state *s;
+	__be16 port = 0;
 
 	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
 
+	if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+		port = ip_vs_sh_get_port(skb, &iph);
+
 	s = (struct ip_vs_sh_state *) svc->sched_data;
-	dest = ip_vs_sh_get(svc->af, s, &iph->saddr);
-	if (!dest
-	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
-	    || atomic_read(&dest->weight) <= 0
-	    || is_overloaded(dest)) {
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+		dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
+	else
+		dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
+
+	if (!dest) {
 		ip_vs_scheduler_err(svc, "no destination available");
 		return NULL;
 	}


Alex


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-17 10:35                           ` Alexander Frolkin
@ 2013-06-17 19:48                             ` Julian Anastasov
  2013-06-18  9:08                               ` Alexander Frolkin
  0 siblings, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-06-17 19:48 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Mon, 17 Jun 2013, Alexander Frolkin wrote:

>  /* check the options based on the commands_v_options table */
>  static void generic_opt_check(int command, int options);
> @@ -363,7 +366,7 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
>  {
>  	int c, parse;
>  	poptContext context;
> -	char *optarg=NULL;
> +	char *optarg=NULL, sched_flags_arg[128];
>  	struct poptOption options_table[] = {
>  		{ "add-service", 'A', POPT_ARG_NONE, NULL, 'A', NULL, NULL },
>  		{ "edit-service", 'E', POPT_ARG_NONE, NULL, 'E', NULL, NULL },
> @@ -426,9 +429,12 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
>  		{ "ops", 'o', POPT_ARG_NONE, NULL, 'o', NULL, NULL },
>  		{ "pe", '\0', POPT_ARG_STRING, &optarg, TAG_PERSISTENCE_ENGINE,
>  		  NULL, NULL },
> +		{ "sched-flags", 'b', POPT_ARG_STRING, &optarg, 'b', NULL, NULL },
>  		{ NULL, 0, 0, NULL, 0, NULL, NULL }
>  	};
>  
> +	sched_flags_arg[0] = '\0';
> +
>  	context = poptGetContext("ipvsadm", argc, (const char **)argv,
>  				 options_table, 0);
>  
> @@ -656,6 +662,10 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
>  			set_option(options, OPT_PERSISTENCE_ENGINE);
>  			strncpy(ce->svc.pe_name, optarg, IP_VS_PENAME_MAXLEN);
>  			break;
> +		case 'b':
> +			set_option(options, OPT_SCHED_FLAGS);
> +			strncpy(sched_flags_arg, optarg, sizeof(sched_flags_arg));

	Such strncpy calls are not very safe; they should be
strncpy(dest, src, space - 1); dest[space - 1] = 0;

	Better to use
	snprintf(sched_flags_arg, sizeof(sched_flags_arg), "%s", optarg);
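
	A small userspace sketch of the difference (not part of the patch),
with an 8-byte buffer and a longer source string:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char dst[8];

		strncpy(dst, "0123456789", sizeof(dst));
		/* dst now holds 8 bytes and no terminating NUL */
		snprintf(dst, sizeof(dst), "%s", "0123456789");
		/* dst is "0123456" and is always NUL-terminated */
		printf("%s\n", dst);
		return 0;
	}

Plain strncpy leaves the destination unterminated on truncation, while
snprintf truncates and terminates.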

> +			break;

> +static unsigned int parse_sched_flags(const char *sched, char *optarg)
> +{
> +	unsigned int flags = 0;
> +	char *flag;
> +	const char *scheduler = (sched && *sched) ? sched : DEF_SCHED;

	We can reuse 'sched' here, no need for 'scheduler':

	sched = (sched && *sched) ? sched : DEF_SCHED;

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-17 11:11                               ` Alexander Frolkin
@ 2013-06-17 20:05                                 ` Julian Anastasov
  2013-06-18  9:30                                   ` Alexander Frolkin
  0 siblings, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-06-17 20:05 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Mon, 17 Jun 2013, Alexander Frolkin wrote:

> I've moved things around a bit:

> diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
> index e0130f8..2f52129 100644
> --- a/net/netfilter/ipvs/ip_vs_sh.c
> +++ b/net/netfilter/ipvs/ip_vs_sh.c

> +/* Helper function to determine if server is unavailable
> + */

	If not a multi-line, the comment can be:

/* Helper function to determine if server is unavailable */

	But some function names are too obvious :)

> +static inline bool is_unavailable(struct ip_vs_dest *dest)

> +static inline struct ip_vs_dest *
> +ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
> +		      const union nf_inet_addr *addr, __be16 port)
> +{
> +	unsigned int offset;
> +	unsigned int hash;
> +	struct ip_vs_dest *dest;
> +
> +	for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
> +		hash = ip_vs_sh_hashkey(svc->af, addr, port, offset);
> +		dest = rcu_dereference(s->buckets[hash].dest);
> +		if (!dest)
> +			break;
> +		else if (is_unavailable(dest))

	No need for extra 'else'

> @@ -232,15 +299,21 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
>  {
>  	struct ip_vs_dest *dest;
>  	struct ip_vs_sh_state *s;
> +	__be16 port = 0;
>  
>  	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
>  
> +	if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
> +		port = ip_vs_sh_get_port(skb, &iph);

	Maybe 'iph', not '&iph'?

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-17 19:48                             ` Julian Anastasov
@ 2013-06-18  9:08                               ` Alexander Frolkin
  2013-06-18 20:41                                 ` Julian Anastasov
  0 siblings, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-18  9:08 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

Latest version of ipvsadm patch:

diff --git a/ipvsadm.8 b/ipvsadm.8
index 001ae74..9a9e9b3 100644
--- a/ipvsadm.8
+++ b/ipvsadm.8
@@ -37,7 +37,7 @@ ipvsadm \- Linux Virtual Server administration
 .SH SYNOPSIS
 .B ipvsadm -A|E -t|u|f \fIservice-address\fP [-s \fIscheduler\fP]
 .ti 15
-.B [-p [\fItimeout\fP]] [-M \fInetmask\fP]
+.B [-p [\fItimeout\fP]] [-M \fInetmask\fP] [-b \fIsched-flags\fP]
 .br
 .B ipvsadm -D -t|u|f \fIservice-address\fP
 .br
@@ -248,6 +248,9 @@ addresses.
 .sp
 \fBsh\fR - Source Hashing: assigns jobs to servers through looking up
 a statically assigned hash table by their source IP addresses.
+This scheduler has two flags: sh-fallback, which enables fallback to a
+different server if the selected server was unavailable, and sh-port,
+which adds the source port number to the hash computation.
 .sp
 \fBsed\fR - Shortest Expected Delay: assigns an incoming job to the
 server with the shortest expected delay. The expected delay that the
@@ -286,6 +289,11 @@ resolve problems with non-persistent cache clusters on the client side.
 IPv6 netmasks should be specified as a prefix length between 1 and 128.
 The default prefix length is 128.
 .TP
+.B -b, --sched-flags \fIsched-flags\fP
+Set scheduler flags for this virtual server.  \fIsched-flags\fP is a
+comma-separated list of flags.  See the scheduler descriptions for
+valid scheduler flags.
+.TP
 .B -r, --real-server \fIserver-address\fP
 Real server that an associated request for service may be assigned to.
 The \fIserver-address\fP is the \fIhost\fP address of a real server,
diff --git a/ipvsadm.c b/ipvsadm.c
index 0197515..5b8c036 100644
--- a/ipvsadm.c
+++ b/ipvsadm.c
@@ -182,7 +182,8 @@ static const char* cmdnames[] = {
 #define OPT_EXACT		0x100000
 #define OPT_ONEPACKET		0x200000
 #define OPT_PERSISTENCE_ENGINE  0x400000
-#define NUMBER_OF_OPT		23
+#define OPT_SCHED_FLAGS		0x800000
+#define NUMBER_OF_OPT		24
 
 static const char* optnames[] = {
 	"numeric",
@@ -208,6 +209,7 @@ static const char* optnames[] = {
 	"exact",
 	"ops",
 	"pe",
+	"sched-flags",
 };
 
 /*
@@ -220,21 +222,21 @@ static const char* optnames[] = {
  */
 static const char commands_v_options[NUMBER_OF_CMD][NUMBER_OF_OPT] =
 {
-	/*   -n   -c   svc  -s   -p   -M   -r   fwd  -w   -x   -y   -mc  tot  dmn  -st  -rt  thr  -pc  srt  sid  -ex  ops  -pe */
-/*ADD*/     {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' '},
-/*EDIT*/    {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' '},
-/*DEL*/     {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*FLUSH*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*LIST*/    {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x'},
-/*ADDSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*DELSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*STARTD*/  {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x'},
-/*STOPD*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x'},
-/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*SAVE*/    {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
-/*ZERO*/    {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+	/*   -n   -c   svc  -s   -p   -M   -r   fwd  -w   -x   -y   -mc  tot  dmn  -st  -rt  thr  -pc  srt  sid  -ex  ops  -pe  -b */
+/*ADD*/     {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' ', ' '},
+/*EDIT*/    {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' ', ' '},
+/*DEL*/     {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*FLUSH*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*LIST*/    {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x', 'x'},
+/*ADDSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*DELSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*STARTD*/  {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x'},
+/*STOPD*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x'},
+/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*SAVE*/    {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
+/*ZERO*/    {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
 };
 
 /* printing format flags */
@@ -302,6 +304,7 @@ static int parse_service(char *buf, ipvs_service_t *svc);
 static int parse_netmask(char *buf, u_int32_t *addr);
 static int parse_timeout(char *buf, int min, int max);
 static unsigned int parse_fwmark(char *buf);
+static unsigned int parse_sched_flags(const char *sched, char *optarg);
 
 /* check the options based on the commands_v_options table */
 static void generic_opt_check(int command, int options);
@@ -363,7 +366,7 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 {
 	int c, parse;
 	poptContext context;
-	char *optarg=NULL;
+	char *optarg=NULL, sched_flags_arg[128];
 	struct poptOption options_table[] = {
 		{ "add-service", 'A', POPT_ARG_NONE, NULL, 'A', NULL, NULL },
 		{ "edit-service", 'E', POPT_ARG_NONE, NULL, 'E', NULL, NULL },
@@ -426,9 +429,12 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 		{ "ops", 'o', POPT_ARG_NONE, NULL, 'o', NULL, NULL },
 		{ "pe", '\0', POPT_ARG_STRING, &optarg, TAG_PERSISTENCE_ENGINE,
 		  NULL, NULL },
+		{ "sched-flags", 'b', POPT_ARG_STRING, &optarg, 'b', NULL, NULL },
 		{ NULL, 0, 0, NULL, 0, NULL, NULL }
 	};
 
+	sched_flags_arg[0] = '\0';
+
 	context = poptGetContext("ipvsadm", argc, (const char **)argv,
 				 options_table, 0);
 
@@ -656,6 +662,10 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 			set_option(options, OPT_PERSISTENCE_ENGINE);
 			strncpy(ce->svc.pe_name, optarg, IP_VS_PENAME_MAXLEN);
 			break;
+		case 'b':
+			set_option(options, OPT_SCHED_FLAGS);
+			snprintf(sched_flags_arg, sizeof(sched_flags_arg), "%s", optarg);
+			break;
 		default:
 			fail(2, "invalid option `%s'",
 			     poptBadOption(context, POPT_BADOPTION_NOALIAS));
@@ -690,6 +700,14 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
 	if ((optarg=(char *)poptGetArg(context)))
 		fail(2, "unexpected argument %s", optarg);
 
+	if (sched_flags_arg[0]) {
+		ce->svc.flags &= ~(IP_VS_SVC_F_SCHED1 |
+				   IP_VS_SVC_F_SCHED2 |
+				   IP_VS_SVC_F_SCHED3);
+		ce->svc.flags |= parse_sched_flags(ce->svc.sched_name,
+						   sched_flags_arg);
+	}
+
 	poptFreeContext(context);
 
 	return 0;
@@ -989,6 +1007,38 @@ parse_service(char *buf, ipvs_service_t *svc)
 	return result;
 }
 
+static unsigned int parse_sched_flags(const char *sched, char *optarg)
+{
+	unsigned int flags = 0;
+	char *flag;
+
+	sched = (sched && *sched) ? sched : DEF_SCHED;
+
+	flag = strtok(optarg, ",");
+	do {
+		if (!strcmp(flag, "flag-1")) {
+			flags |= IP_VS_SVC_F_SCHED1;
+		} else if (!strcmp(flag, "flag-2")) {
+			flags |= IP_VS_SVC_F_SCHED2;
+		} else if (!strcmp(flag, "flag-3")) {
+			flags |= IP_VS_SVC_F_SCHED3;
+		} else if (!strcmp(flag, "sh-fallback")) {
+			flags |= IP_VS_SVC_F_SCHED_SH_FALLBACK;
+			if (strcmp(sched, "sh"))
+				fail(2, "incompatible scheduler flag `%s'",
+				     flag);
+		} else if (!strcmp(flag, "sh-port")) {
+			flags |= IP_VS_SVC_F_SCHED_SH_PORT;
+			if (strcmp(sched, "sh"))
+				fail(2, "incompatible scheduler flag `%s'",
+				     flag);
+		} else {
+			fail(2, "invalid scheduler flag `%s'", flag);
+		}
+	} while ((flag = strtok(NULL, ",")) != NULL);
+
+	return flags;
+}
 
 static void
 generic_opt_check(int command, int options)
@@ -1070,7 +1120,7 @@ static void usage_exit(const char *program, const int exit_status)
 	version(stream);
 	fprintf(stream,
 		"Usage:\n"
-		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine]\n"
+		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine] [-b sched-flags]\n"
 		"  %s -D -t|u|f service-address\n"
 		"  %s -C\n"
 		"  %s -R\n"
@@ -1139,7 +1189,8 @@ static void usage_exit(const char *program, const int exit_status)
 		"  --nosort                            disable sorting output of service/server entries\n"
 		"  --sort                              does nothing, for backwards compatibility\n"
 		"  --ops          -o                   one-packet scheduling\n"
-		"  --numeric      -n                   numeric output of addresses and ports\n",
+		"  --numeric      -n                   numeric output of addresses and ports\n"
+		"  --sched-flags  -b flags             scheduler flags (comma-separated)\n",
 		DEF_SCHED);
 
 	exit(exit_status);
@@ -1396,6 +1447,32 @@ static void print_largenum(unsigned long long i, unsigned int format)
 		printf("%8lluT", i / 1000000000000ULL);
 }
 
+static void print_sched_flags(ipvs_service_entry_t *se) {
+	char flags[64];
+
+	flags[0] = '\0';
+
+	if (!strcmp(se->sched_name, "sh")) {
+		if (se->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+			strcat(flags, "sh-fallback,");
+		if (se->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+			strcat(flags, "sh-port,");
+		if (se->flags & IP_VS_SVC_F_SCHED3)
+			strcat(flags, "flag-3,");
+	} else {
+		if (se->flags & IP_VS_SVC_F_SCHED1)
+			strcat(flags, "flag-1,");
+		if (se->flags & IP_VS_SVC_F_SCHED2)
+			strcat(flags, "flag-2,");
+		if (se->flags & IP_VS_SVC_F_SCHED3)
+			strcat(flags, "flag-3,");
+	}
+
+	if (flags[0]) {
+		flags[strlen(flags)-1] = '\0';
+		printf("%s", flags);
+	}
+}
 
 static void print_title(unsigned int format)
 {
@@ -1488,6 +1565,12 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format)
 			printf(" pe %s", se->pe_name);
 		if (se->flags & IP_VS_SVC_F_ONEPACKET)
 			printf(" -o");
+		if (se->flags & (IP_VS_SVC_F_SCHED1 |
+				 IP_VS_SVC_F_SCHED2 |
+				 IP_VS_SVC_F_SCHED3)) {
+			printf(" -b ");
+			print_sched_flags(se);
+		}
 	} else if (format & FMT_STATS) {
 		printf("%-33s", svc_name);
 		print_largenum(se->stats.conns, format);
@@ -1504,6 +1587,13 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format)
 		print_largenum(se->stats.outbps, format);
 	} else {
 		printf("%s %s", svc_name, se->sched_name);
+		if (se->flags & (IP_VS_SVC_F_SCHED1 |
+				 IP_VS_SVC_F_SCHED2 |
+				 IP_VS_SVC_F_SCHED3)) {
+			printf(" (");
+			print_sched_flags(se);
+			printf(")");
+		}
 		if (se->flags & IP_VS_SVC_F_PERSISTENT) {
 			printf(" persistent %u", se->timeout);
 			if (se->af == AF_INET)
diff --git a/libipvs/ip_vs.h b/libipvs/ip_vs.h
index 5e1d544..4db14ff 100644
--- a/libipvs/ip_vs.h
+++ b/libipvs/ip_vs.h
@@ -29,6 +29,13 @@
 #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
 #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
+#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
+#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
+#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
+
+#define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
+#define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
+
 
 /*
  *      IPVS sync daemon states


Alex


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-17 20:05                                 ` Julian Anastasov
@ 2013-06-18  9:30                                   ` Alexander Frolkin
  2013-06-18 20:52                                     ` Julian Anastasov
  0 siblings, 1 reply; 52+ messages in thread
From: Alexander Frolkin @ 2013-06-18  9:30 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

Latest version of SH patch:

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index a245377..2945822 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -20,6 +20,12 @@
 #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
 #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
+#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
+#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
+#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
+
+#define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
+#define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
 
 /*
  *      Destination Server Flags
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index e0130f8..caa3eee 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -48,6 +48,10 @@
 
 #include <net/ip_vs.h>
 
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
 
 /*
  *      IPVS SH bucket
@@ -71,10 +75,19 @@ struct ip_vs_sh_state {
 	struct rcu_head			rcu_head;
 };
 
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+	return atomic_read(&dest->weight) <= 0 ||
+	       dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
 /*
  *	Returns hash value for IPVS SH entry
  */
-static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned int
+ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
+		 __be16 port, unsigned int offset)
 {
 	__be32 addr_fold = addr->ip;
 
@@ -83,7 +96,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
+	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+		IP_VS_SH_TAB_MASK;
 }
 
 
@@ -91,12 +105,43 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
  *      Get ip_vs_dest associated with supplied parameters.
  */
 static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
+ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+	     const union nf_inet_addr *addr, __be16 port)
 {
-	return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
+	unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+	struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
+
+	return (!dest || is_unavailable(dest)) ? NULL : dest;
 }
 
 
+/* As ip_vs_sh_get, but with fallback if selected server is unavailable
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+		      const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int offset;
+	unsigned int hash;
+	struct ip_vs_dest *dest;
+
+	for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+		hash = ip_vs_sh_hashkey(svc->af, addr, port, offset);
+		dest = rcu_dereference(s->buckets[hash].dest);
+		if (!dest)
+			break;
+		if (is_unavailable(dest))
+			IP_VS_DBG_BUF(6, "SH: selected unavailable server "
+				      "%s:%d (offset %d)",
+				      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+				      ntohs(dest->port), offset);
+		else
+			return dest;
+	}
+
+	return NULL;
+}
+
 /*
  *      Assign all the hash buckets of the specified table with the service.
  */
@@ -213,13 +258,33 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
 }
 
 
-/*
- *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
- *      consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
+/* Helper function to get port number */
+static inline __be16
+ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
 {
-	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+	__be16 port;
+	struct tcphdr _tcph, *th;
+	struct udphdr _udph, *uh;
+	sctp_sctphdr_t _sctph, *sh;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+		port = th->source;
+		break;
+	case IPPROTO_UDP:
+		uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+		port = uh->source;
+		break;
+	case IPPROTO_SCTP:
+		sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+		port = sh->source;
+		break;
+	default:
+		port = 0;
+	}
+
+	return port;
 }
 
 
@@ -232,15 +297,21 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
 {
 	struct ip_vs_dest *dest;
 	struct ip_vs_sh_state *s;
+	__be16 port = 0;
 
 	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
 
+	if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+		port = ip_vs_sh_get_port(skb, iph);
+
 	s = (struct ip_vs_sh_state *) svc->sched_data;
-	dest = ip_vs_sh_get(svc->af, s, &iph->saddr);
-	if (!dest
-	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
-	    || atomic_read(&dest->weight) <= 0
-	    || is_overloaded(dest)) {
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+		dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
+	else
+		dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
+
+	if (!dest) {
 		ip_vs_scheduler_err(svc, "no destination available");
 		return NULL;
 	}
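
For reference, a quick userspace sketch of what the new hash does with
the fallback offset (this assumes the default 256-bucket table,
IP_VS_SH_TAB_BITS = 8; the client address and port are made up):

	#include <stdio.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	#define TAB_MASK 255u	/* IP_VS_SH_TAB_SIZE - 1 for the default table */

	/* same arithmetic as the patched ip_vs_sh_hashkey() for IPv4 */
	static unsigned int sh_hashkey(uint32_t saddr, uint16_t sport,
				       unsigned int offset)
	{
		return (offset + (ntohs(sport) + ntohl(saddr)) * 2654435761UL)
			& TAB_MASK;
	}

	int main(void)
	{
		uint32_t saddr = inet_addr("192.0.2.1");	/* example client */
		uint16_t sport = htons(40000);
		unsigned int off;

		for (off = 0; off < 4; off++)
			printf("offset %u -> bucket %u\n",
			       off, sh_hashkey(saddr, sport, off));
		return 0;
	}

With sh-fallback set, ip_vs_sh_get_fallback() effectively walks these
consecutive buckets, skipping destinations that is_unavailable() rejects.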


Alex


^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-18  9:08                               ` Alexander Frolkin
@ 2013-06-18 20:41                                 ` Julian Anastasov
  0 siblings, 0 replies; 52+ messages in thread
From: Julian Anastasov @ 2013-06-18 20:41 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Tue, 18 Jun 2013, Alexander Frolkin wrote:

> Hi,
> 
> Latest version of ipvsadm patch:

	I don't see problems with this version. If you post
an official patch I'll ack it. Others still have time for
comments.
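
	For anyone who wants to test it, a minimal usage sketch (the
addresses below are just placeholders):

	ipvsadm -A -t 192.0.2.10:80 -s sh -b sh-fallback,sh-port
	ipvsadm -a -t 192.0.2.10:80 -r 10.0.0.1:80 -g -w 1
	ipvsadm -a -t 192.0.2.10:80 -r 10.0.0.2:80 -g -w 1
	ipvsadm -L -n

	The listing should show the service as "sh (sh-fallback,sh-port)",
and setting one real server's weight to 0 should make new connections
from clients hashed to it fall back to the other server.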

> diff --git a/ipvsadm.8 b/ipvsadm.8
> index 001ae74..9a9e9b3 100644
> --- a/ipvsadm.8
> +++ b/ipvsadm.8
> @@ -37,7 +37,7 @@ ipvsadm \- Linux Virtual Server administration
>  .SH SYNOPSIS
>  .B ipvsadm -A|E -t|u|f \fIservice-address\fP [-s \fIscheduler\fP]
>  .ti 15
> -.B [-p [\fItimeout\fP]] [-M \fInetmask\fP]
> +.B [-p [\fItimeout\fP]] [-M \fInetmask\fP] [-b \fIsched-flags\fP]
>  .br
>  .B ipvsadm -D -t|u|f \fIservice-address\fP
>  .br
> @@ -248,6 +248,9 @@ addresses.
>  .sp
>  \fBsh\fR - Source Hashing: assigns jobs to servers through looking up
>  a statically assigned hash table by their source IP addresses.
> +This scheduler has two flags: sh-fallback, which enables fallback to a
> +different server if the selected server was unavailable, and sh-port,
> +which adds the source port number to the hash computation.
>  .sp
>  \fBsed\fR - Shortest Expected Delay: assigns an incoming job to the
>  server with the shortest expected delay. The expected delay that the
> @@ -286,6 +289,11 @@ resolve problems with non-persistent cache clusters on the client side.
>  IPv6 netmasks should be specified as a prefix length between 1 and 128.
>  The default prefix length is 128.
>  .TP
> +.B -b, --sched-flags \fIsched-flags\fP
> +Set scheduler flags for this virtual server.  \fIsched-flags\fP is a
> +comma-separated list of flags.  See the scheduler descriptions for
> +valid scheduler flags.
> +.TP
>  .B -r, --real-server \fIserver-address\fP
>  Real server that an associated request for service may be assigned to.
>  The \fIserver-address\fP is the \fIhost\fP address of a real server,
> diff --git a/ipvsadm.c b/ipvsadm.c
> index 0197515..5b8c036 100644
> --- a/ipvsadm.c
> +++ b/ipvsadm.c
> @@ -182,7 +182,8 @@ static const char* cmdnames[] = {
>  #define OPT_EXACT		0x100000
>  #define OPT_ONEPACKET		0x200000
>  #define OPT_PERSISTENCE_ENGINE  0x400000
> -#define NUMBER_OF_OPT		23
> +#define OPT_SCHED_FLAGS		0x800000
> +#define NUMBER_OF_OPT		24
>  
>  static const char* optnames[] = {
>  	"numeric",
> @@ -208,6 +209,7 @@ static const char* optnames[] = {
>  	"exact",
>  	"ops",
>  	"pe",
> +	"sched-flags",
>  };
>  
>  /*
> @@ -220,21 +222,21 @@ static const char* optnames[] = {
>   */
>  static const char commands_v_options[NUMBER_OF_CMD][NUMBER_OF_OPT] =
>  {
> -	/*   -n   -c   svc  -s   -p   -M   -r   fwd  -w   -x   -y   -mc  tot  dmn  -st  -rt  thr  -pc  srt  sid  -ex  ops  -pe */
> -/*ADD*/     {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' '},
> -/*EDIT*/    {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' '},
> -/*DEL*/     {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*FLUSH*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*LIST*/    {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x'},
> -/*ADDSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*DELSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*STARTD*/  {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x'},
> -/*STOPD*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x'},
> -/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*SAVE*/    {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> -/*ZERO*/    {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +	/*   -n   -c   svc  -s   -p   -M   -r   fwd  -w   -x   -y   -mc  tot  dmn  -st  -rt  thr  -pc  srt  sid  -ex  ops  -pe  -b */
> +/*ADD*/     {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' ', ' '},
> +/*EDIT*/    {'x', 'x', '+', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', ' ', ' '},
> +/*DEL*/     {'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*FLUSH*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*LIST*/    {' ', '1', '1', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', '1', '1', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'x', 'x', 'x'},
> +/*ADDSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*DELSRV*/  {'x', 'x', '+', 'x', 'x', 'x', '+', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*EDITSRV*/ {'x', 'x', '+', 'x', 'x', 'x', '+', ' ', ' ', ' ', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*TIMEOUT*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*STARTD*/  {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x'},
> +/*STOPD*/   {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', ' ', 'x', 'x', 'x', 'x'},
> +/*RESTORE*/ {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*SAVE*/    {' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
> +/*ZERO*/    {'x', 'x', ' ', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'},
>  };
>  
>  /* printing format flags */
> @@ -302,6 +304,7 @@ static int parse_service(char *buf, ipvs_service_t *svc);
>  static int parse_netmask(char *buf, u_int32_t *addr);
>  static int parse_timeout(char *buf, int min, int max);
>  static unsigned int parse_fwmark(char *buf);
> +static unsigned int parse_sched_flags(const char *sched, char *optarg);
>  
>  /* check the options based on the commands_v_options table */
>  static void generic_opt_check(int command, int options);
> @@ -363,7 +366,7 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
>  {
>  	int c, parse;
>  	poptContext context;
> -	char *optarg=NULL;
> +	char *optarg=NULL, sched_flags_arg[128];
>  	struct poptOption options_table[] = {
>  		{ "add-service", 'A', POPT_ARG_NONE, NULL, 'A', NULL, NULL },
>  		{ "edit-service", 'E', POPT_ARG_NONE, NULL, 'E', NULL, NULL },
> @@ -426,9 +429,12 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
>  		{ "ops", 'o', POPT_ARG_NONE, NULL, 'o', NULL, NULL },
>  		{ "pe", '\0', POPT_ARG_STRING, &optarg, TAG_PERSISTENCE_ENGINE,
>  		  NULL, NULL },
> +		{ "sched-flags", 'b', POPT_ARG_STRING, &optarg, 'b', NULL, NULL },
>  		{ NULL, 0, 0, NULL, 0, NULL, NULL }
>  	};
>  
> +	sched_flags_arg[0] = '\0';
> +
>  	context = poptGetContext("ipvsadm", argc, (const char **)argv,
>  				 options_table, 0);
>  
> @@ -656,6 +662,10 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
>  			set_option(options, OPT_PERSISTENCE_ENGINE);
>  			strncpy(ce->svc.pe_name, optarg, IP_VS_PENAME_MAXLEN);
>  			break;
> +		case 'b':
> +			set_option(options, OPT_SCHED_FLAGS);
> +			snprintf(sched_flags_arg, sizeof(sched_flags_arg), "%s", optarg);
> +			break;
>  		default:
>  			fail(2, "invalid option `%s'",
>  			     poptBadOption(context, POPT_BADOPTION_NOALIAS));
> @@ -690,6 +700,14 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce,
>  	if ((optarg=(char *)poptGetArg(context)))
>  		fail(2, "unexpected argument %s", optarg);
>  
> +	if (sched_flags_arg[0]) {
> +		ce->svc.flags &= ~(IP_VS_SVC_F_SCHED1 |
> +				   IP_VS_SVC_F_SCHED2 |
> +				   IP_VS_SVC_F_SCHED3);
> +		ce->svc.flags |= parse_sched_flags(ce->svc.sched_name,
> +						   sched_flags_arg);
> +	}
> +
>  	poptFreeContext(context);
>  
>  	return 0;
> @@ -989,6 +1007,38 @@ parse_service(char *buf, ipvs_service_t *svc)
>  	return result;
>  }
>  
> +static unsigned int parse_sched_flags(const char *sched, char *optarg)
> +{
> +	unsigned int flags = 0;
> +	char *flag;
> +
> +	sched = (sched && *sched) ? sched : DEF_SCHED;
> +
> +	flag = strtok(optarg, ",");
> +	do {
> +		if (!strcmp(flag, "flag-1")) {
> +			flags |= IP_VS_SVC_F_SCHED1;
> +		} else if (!strcmp(flag, "flag-2")) {
> +			flags |= IP_VS_SVC_F_SCHED2;
> +		} else if (!strcmp(flag, "flag-3")) {
> +			flags |= IP_VS_SVC_F_SCHED3;
> +		} else if (!strcmp(flag, "sh-fallback")) {
> +			flags |= IP_VS_SVC_F_SCHED_SH_FALLBACK;
> +			if (strcmp(sched, "sh"))
> +				fail(2, "incompatible scheduler flag `%s'",
> +				     flag);
> +		} else if (!strcmp(flag, "sh-port")) {
> +			flags |= IP_VS_SVC_F_SCHED_SH_PORT;
> +			if (strcmp(sched, "sh"))
> +				fail(2, "incompatible scheduler flag `%s'",
> +				     flag);
> +		} else {
> +			fail(2, "invalid scheduler flag `%s'", flag);
> +		}
> +	} while ((flag = strtok(NULL, ",")) != NULL);
> +
> +	return flags;
> +}
>  
>  static void
>  generic_opt_check(int command, int options)
> @@ -1070,7 +1120,7 @@ static void usage_exit(const char *program, const int exit_status)
>  	version(stream);
>  	fprintf(stream,
>  		"Usage:\n"
> -		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine]\n"
> +		"  %s -A|E -t|u|f service-address [-s scheduler] [-p [timeout]] [-M netmask] [--pe persistence_engine] [-b sched-flags]\n"
>  		"  %s -D -t|u|f service-address\n"
>  		"  %s -C\n"
>  		"  %s -R\n"
> @@ -1139,7 +1189,8 @@ static void usage_exit(const char *program, const int exit_status)
>  		"  --nosort                            disable sorting output of service/server entries\n"
>  		"  --sort                              does nothing, for backwards compatibility\n"
>  		"  --ops          -o                   one-packet scheduling\n"
> -		"  --numeric      -n                   numeric output of addresses and ports\n",
> +		"  --numeric      -n                   numeric output of addresses and ports\n"
> +		"  --sched-flags  -b flags             scheduler flags (comma-separated)\n",
>  		DEF_SCHED);
>  
>  	exit(exit_status);
> @@ -1396,6 +1447,32 @@ static void print_largenum(unsigned long long i, unsigned int format)
>  		printf("%8lluT", i / 1000000000000ULL);
>  }
>  
> +static void print_sched_flags(ipvs_service_entry_t *se) {
> +	char flags[64];
> +
> +	flags[0] = '\0';
> +
> +	if (!strcmp(se->sched_name, "sh")) {
> +		if (se->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
> +			strcat(flags, "sh-fallback,");
> +		if (se->flags & IP_VS_SVC_F_SCHED_SH_PORT)
> +			strcat(flags, "sh-port,");
> +		if (se->flags & IP_VS_SVC_F_SCHED3)
> +			strcat(flags, "flag-3,");
> +	} else {
> +		if (se->flags & IP_VS_SVC_F_SCHED1)
> +			strcat(flags, "flag-1,");
> +		if (se->flags & IP_VS_SVC_F_SCHED2)
> +			strcat(flags, "flag-2,");
> +		if (se->flags & IP_VS_SVC_F_SCHED3)
> +			strcat(flags, "flag-3,");
> +	}
> +
> +	if (flags[0]) {
> +		flags[strlen(flags)-1] = '\0';
> +		printf("%s", flags);
> +	}
> +}
>  
>  static void print_title(unsigned int format)
>  {
> @@ -1488,6 +1565,12 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format)
>  			printf(" pe %s", se->pe_name);
>  		if (se->flags & IP_VS_SVC_F_ONEPACKET)
>  			printf(" -o");
> +		if (se->flags & (IP_VS_SVC_F_SCHED1 |
> +				 IP_VS_SVC_F_SCHED2 |
> +				 IP_VS_SVC_F_SCHED3)) {
> +			printf(" -b ");
> +			print_sched_flags(se);
> +		}
>  	} else if (format & FMT_STATS) {
>  		printf("%-33s", svc_name);
>  		print_largenum(se->stats.conns, format);
> @@ -1504,6 +1587,13 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format)
>  		print_largenum(se->stats.outbps, format);
>  	} else {
>  		printf("%s %s", svc_name, se->sched_name);
> +		if (se->flags & (IP_VS_SVC_F_SCHED1 |
> +				 IP_VS_SVC_F_SCHED2 |
> +				 IP_VS_SVC_F_SCHED3)) {
> +			printf(" (");
> +			print_sched_flags(se);
> +			printf(")");
> +		}
>  		if (se->flags & IP_VS_SVC_F_PERSISTENT) {
>  			printf(" persistent %u", se->timeout);
>  			if (se->af == AF_INET)
> diff --git a/libipvs/ip_vs.h b/libipvs/ip_vs.h
> index 5e1d544..4db14ff 100644
> --- a/libipvs/ip_vs.h
> +++ b/libipvs/ip_vs.h
> @@ -29,6 +29,13 @@
>  #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
>  #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
>  #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
> +#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
> +#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
> +#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
> +
> +#define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
> +#define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
> +
>  
>  /*
>   *      IPVS sync daemon states
> 
> 
> Alex

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-18  9:30                                   ` Alexander Frolkin
@ 2013-06-18 20:52                                     ` Julian Anastasov
  0 siblings, 0 replies; 52+ messages in thread
From: Julian Anastasov @ 2013-06-18 20:52 UTC (permalink / raw)
  To: Alexander Frolkin; +Cc: lvs-devel


	Hello,

On Tue, 18 Jun 2013, Alexander Frolkin wrote:

> Hi,
> 
> Latest version of SH patch:
> 
> diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
> index a245377..2945822 100644
> --- a/include/uapi/linux/ip_vs.h
> +++ b/include/uapi/linux/ip_vs.h
> @@ -20,6 +20,12 @@
>  #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
>  #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
>  #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
> +#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
> +#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
> +#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
> +
> +#define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
> +#define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
>  
>  /*
>   *      Destination Server Flags
> diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
> index e0130f8..caa3eee 100644
> --- a/net/netfilter/ipvs/ip_vs_sh.c
> +++ b/net/netfilter/ipvs/ip_vs_sh.c
> @@ -48,6 +48,10 @@
>  
>  #include <net/ip_vs.h>
>  
> +#include <net/tcp.h>
> +#include <linux/udp.h>
> +#include <linux/sctp.h>
> +
>  
>  /*
>   *      IPVS SH bucket
> @@ -71,10 +75,19 @@ struct ip_vs_sh_state {
>  	struct rcu_head			rcu_head;
>  };
>  
> +/* Helper function to determine if server is unavailable */
> +static inline bool is_unavailable(struct ip_vs_dest *dest)
> +{
> +	return atomic_read(&dest->weight) <= 0 ||
> +	       dest->flags & IP_VS_DEST_F_OVERLOAD;
> +}
> +
>  /*
>   *	Returns hash value for IPVS SH entry
>   */
> -static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
> +static inline unsigned int
> +ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
> +		 __be16 port, unsigned int offset)
>  {
>  	__be32 addr_fold = addr->ip;
>  
> @@ -83,7 +96,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
>  		addr_fold = addr->ip6[0]^addr->ip6[1]^
>  			    addr->ip6[2]^addr->ip6[3];
>  #endif
> -	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
> +	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
> +		IP_VS_SH_TAB_MASK;
>  }
>  
>  
> @@ -91,12 +105,43 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
>   *      Get ip_vs_dest associated with supplied parameters.
>   */
>  static inline struct ip_vs_dest *
> -ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
> +ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
> +	     const union nf_inet_addr *addr, __be16 port)
>  {
> -	return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
> +	unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
> +	struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
> +
> +	return (!dest || is_unavailable(dest)) ? NULL : dest;
>  }
>  
>  
> +/* As ip_vs_sh_get, but with fallback if selected server is unavailable

	You can also fix this comment.

> + */
> +static inline struct ip_vs_dest *
> +ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
> +		      const union nf_inet_addr *addr, __be16 port)
> +{
> +	unsigned int offset;
> +	unsigned int hash;
> +	struct ip_vs_dest *dest;
> +
> +	for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
> +		hash = ip_vs_sh_hashkey(svc->af, addr, port, offset);
> +		dest = rcu_dereference(s->buckets[hash].dest);
> +		if (!dest)
> +			break;
> +		if (is_unavailable(dest))
> +			IP_VS_DBG_BUF(6, "SH: selected unavailable server "
> +				      "%s:%d (offset %d)",
> +				      IP_VS_DBG_ADDR(svc->af, &dest->addr),
> +				      ntohs(dest->port), offset);
> +		else
> +			return dest;
> +	}
> +
> +	return NULL;
> +}
> +
>  /*
>   *      Assign all the hash buckets of the specified table with the service.
>   */
> @@ -213,13 +258,33 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
>  }
>  
>  
> -/*
> - *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
> - *      consider that the server is overloaded here.
> - */
> -static inline int is_overloaded(struct ip_vs_dest *dest)
> +/* Helper function to get port number */
> +static inline __be16
> +ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
>  {
> -	return dest->flags & IP_VS_DEST_F_OVERLOAD;
> +	__be16 port;
> +	struct tcphdr _tcph, *th;
> +	struct udphdr _udph, *uh;
> +	sctp_sctphdr_t _sctph, *sh;
> +
> +	switch (iph->protocol) {
> +	case IPPROTO_TCP:
> +		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
> +		port = th->source;
> +		break;
> +	case IPPROTO_UDP:
> +		uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
> +		port = uh->source;
> +		break;
> +	case IPPROTO_SCTP:
> +		sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
> +		port = sh->source;
> +		break;
> +	default:
> +		port = 0;
> +	}
> +
> +	return port;
>  }
>  
>  
> @@ -232,15 +297,21 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
>  {
>  	struct ip_vs_dest *dest;
>  	struct ip_vs_sh_state *s;
> +	__be16 port = 0;
>  
>  	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
>  
> +	if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
> +		port = ip_vs_sh_get_port(skb, iph);
> +
>  	s = (struct ip_vs_sh_state *) svc->sched_data;
> -	dest = ip_vs_sh_get(svc->af, s, &iph->saddr);
> -	if (!dest
> -	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
> -	    || atomic_read(&dest->weight) <= 0
> -	    || is_overloaded(dest)) {
> +
> +	if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
> +		dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
> +	else
> +		dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
> +
> +	if (!dest) {
>  		ip_vs_scheduler_err(svc, "no destination available");
>  		return NULL;
>  	}
> 
> 
> Alex

	After fixing the above comment, and if tests look OK,
you can post an official patch.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-05-28 13:41         ` Aleksey Chudov
  2013-05-30  6:37           ` Julian Anastasov
@ 2013-06-19  9:03           ` Julian Anastasov
  2013-06-19 19:25             ` Julian Anastasov
                               ` (2 more replies)
  1 sibling, 3 replies; 52+ messages in thread
From: Julian Anastasov @ 2013-06-19  9:03 UTC (permalink / raw)
  To: Aleksey Chudov; +Cc: lvs-devel


	Hello,

On Tue, 28 May 2013, Aleksey Chudov wrote:

> Currently we are using multiple active / standby server pairs and synchronize
> them with each other. So half of the servers are constantly doing nothing. We
> are looking for a way to use all the servers in active / active mode while
> maintaining high availability and session persistence in case of failure of
> one of the load balancers. Unfortunately the proposed stateless scheme with SH
> scheduler and Sloppy TCP is not suitable for us since we are using WLC and WRR
> schedulers. As you mentioned SH scheduler has several drawbacks because of
> which we can not use it. Also, we can not synchronize all connections between
> all servers, since it would require a lot of memory and the search for such a
> huge connection table is likely to be slower.
> 
> But we can solve the sync problem in the same way as it is done in conntrackd,
> which allows filtering by flow state. The easiest option is to make the filter
> only for the IP_VS_CONN_F_TEMPLATE state. Thus, if all the load balancers sync
> persistent templates with each other, then even if one of the load balancers
> fails, most users will remain on the same real servers. Of course, without the
> full sync, clients must re-establish TCP connections, but in this case we can
> use Sloppy TCP to create a TCP connection state on any TCP packet.
> 
> What do you think of this idea?

	Here is something that is compile-tested. You
will need the "ipvs: sloppy TCP and SCTP" patch by
Alexander Frolkin posted on 13 Jun. Let me know if
you need more help in applying and testing such patches,
so that we can be more confident when releasing such an
optimization officially.
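
	To try it, something like this should do (the sysctl names come
from the patch below and from the sloppy TCP/SCTP patch; adjust to the
protocols you actually persist):

	# on all directors
	sysctl -w net.ipv4.vs.sync_persist_mode=1
	# on the backup directors, so connections without a synced state
	# can be picked up mid-stream after a failover
	sysctl -w net.ipv4.vs.sloppy_tcp=1
	sysctl -w net.ipv4.vs.sloppy_sctp=1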

From: Julian Anastasov <ja@ssi.bg>
[PATCH] ipvs: add sync_persist_mode flag

Add sync_persist_mode flag to reduce sync traffic
by syncing only persistent templates.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 Documentation/networking/ipvs-sysctl.txt |   13 +++++++++++++
 include/net/ip_vs.h                      |   11 +++++++++++
 net/netfilter/ipvs/ip_vs_ctl.c           |    7 +++++++
 net/netfilter/ipvs/ip_vs_sync.c          |   12 ++++++++++++
 4 files changed, 43 insertions(+), 0 deletions(-)

diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt
index 9573d0c..7a3c047 100644
--- a/Documentation/networking/ipvs-sysctl.txt
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -181,6 +181,19 @@ snat_reroute - BOOLEAN
 	always be the same as the original route so it is an optimisation
 	to disable snat_reroute and avoid the recalculation.
 
+sync_persist_mode - INTEGER
+	default 0
+
+	Controls the synchronisation of connections when using persistence
+
+	0: All types of connections are synchronised
+	1: Attempt to reduce the synchronisation traffic depending on
+	the connection type. For persistent services avoid synchronisation
+	the connection type. For persistent services, avoid synchronising
+	normal connections and sync only the persistence templates.
+	In that case, for TCP and SCTP it may be necessary to enable the
+	sloppy_tcp and sloppy_sctp flags on backup servers. For non-persistent
+	services this optimisation is not applied; mode 0 is assumed.
 sync_version - INTEGER
 	default 1
 
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index e667df1..f0d70f0 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -975,6 +975,7 @@ struct netns_ipvs {
 	int			sysctl_snat_reroute;
 	int			sysctl_sync_ver;
 	int			sysctl_sync_ports;
+	int			sysctl_sync_persist_mode;
 	unsigned long		sysctl_sync_qlen_max;
 	int			sysctl_sync_sock_size;
 	int			sysctl_cache_bypass;
@@ -1076,6 +1077,11 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
 	return ACCESS_ONCE(ipvs->sysctl_sync_ports);
 }
 
+static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs)
+{
+	return ipvs->sysctl_sync_persist_mode;
+}
+
 static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs)
 {
 	return ipvs->sysctl_sync_qlen_max;
@@ -1139,6 +1145,11 @@ static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
 	return 1;
 }
 
+static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs)
+{
+	return 0;
+}
+
 static inline unsigned long sysctl_sync_qlen_max(struct netns_ipvs *ipvs)
 {
 	return IPVS_SYNC_QLEN_MAX;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 1b14abb..0c129cc 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1715,6 +1715,12 @@ static struct ctl_table vs_vars[] = {
 		.proc_handler	= &proc_do_sync_ports,
 	},
 	{
+		.procname	= "sync_persist_mode",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "sync_qlen_max",
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
@@ -3728,6 +3734,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
 	tbl[idx++].data = &ipvs->sysctl_sync_ver;
 	ipvs->sysctl_sync_ports = 1;
 	tbl[idx++].data = &ipvs->sysctl_sync_ports;
+	tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
 	ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
 	tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
 	ipvs->sysctl_sync_sock_size = 0;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 2fc6639..03c43c0 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -425,6 +425,16 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
 	return sb;
 }
 
+/* Check if connection is controlled by persistence */
+static inline bool in_peristence(struct ip_vs_conn *cp)
+{
+	for (cp = cp->control; cp; cp = cp->control) {
+		if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+			return true;
+	}
+	return false;
+}
+
 /* Check if conn should be synced.
  * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
  * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
@@ -447,6 +457,8 @@ static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
 	/* Check if we sync in current state */
 	if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
 		force = 0;
+	else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_peristence(cp)))
+		return 0;
 	else if (likely(cp->protocol == IPPROTO_TCP)) {
 		if (!((1 << cp->state) &
 		      ((1 << IP_VS_TCP_S_ESTABLISHED) |
-- 
1.7.3.4

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-19  9:03           ` Julian Anastasov
@ 2013-06-19 19:25             ` Julian Anastasov
  2013-06-20 17:02               ` Aleksey Chudov
  2013-06-19 20:44             ` Aleksey Chudov
  2013-06-22 11:20             ` [PATCH] ipvs: add sync_persist_mode flag Aleksey Chudov
  2 siblings, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-06-19 19:25 UTC (permalink / raw)
  To: Aleksey Chudov; +Cc: lvs-devel


	Hello,

On Wed, 19 Jun 2013, Julian Anastasov wrote:

> +static inline bool in_peristence(struct ip_vs_conn *cp)

	Ops :)

s/in_peristence/in_persistence/

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-19  9:03           ` Julian Anastasov
  2013-06-19 19:25             ` Julian Anastasov
@ 2013-06-19 20:44             ` Aleksey Chudov
  2013-06-22 11:20             ` [PATCH] ipvs: add sync_persist_mode flag Aleksey Chudov
  2 siblings, 0 replies; 52+ messages in thread
From: Aleksey Chudov @ 2013-06-19 20:44 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hello,

On 19.06.2013 12:03, Julian Anastasov wrote:
> 	Here is something that is compile-tested. You
> will need the "ipvs: sloppy TCP and SCTP" patch by
> Alexander Frolkin posted on 13 Jun. Let me know if
> you need more help in applying and testing such patches,
> so that we can be more confident when releasing such
> an optimization officially.
>
> From: Julian Anastasov <ja@ssi.bg>
> [PATCH] ipvs: add sync_persist_mode flag
>
> Add sync_persist_mode flag to reduce sync traffic
> by syncing only persistent templates.
>
>

On 19.06.2013 22:25, Julian Anastasov wrote:
> On Wed, 19 Jun 2013, Julian Anastasov wrote:
>
>> >+static inline bool in_peristence(struct ip_vs_conn *cp)
> 	Ops
>
> s/in_peristence/in_persistence/


Thanks for the implementation. I manually applied the sloppy_tcp and
persist_mode patches to our 2.6.39 kernel, with your "ops" fix included :)
I plan to start testing tomorrow.

Aleksey

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-19 19:25             ` Julian Anastasov
@ 2013-06-20 17:02               ` Aleksey Chudov
  2013-06-20 20:09                 ` Julian Anastasov
  0 siblings, 1 reply; 52+ messages in thread
From: Aleksey Chudov @ 2013-06-20 17:02 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hi,

I'm trying to calculate how much memory will be needed to synchronize 
the persistent templates across the entire cluster. How much memory 
does one persistent template consume?

Aleksey

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling
  2013-06-20 17:02               ` Aleksey Chudov
@ 2013-06-20 20:09                 ` Julian Anastasov
  0 siblings, 0 replies; 52+ messages in thread
From: Julian Anastasov @ 2013-06-20 20:09 UTC (permalink / raw)
  To: Aleksey Chudov; +Cc: lvs-devel


	Hello,

On Thu, 20 Jun 2013, Aleksey Chudov wrote:

> Hi,
> 
> I'm trying to calculate how much memory will be needed to synchronize the
> persistent templates across the entire cluster. How much memory does one
> persistent template consume?

	The same as for every connection; you may be able to find
this message in dmesg/logs when IPVS starts:

	IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
		sizeof(struct ip_vs_conn));

	You can also check 'slabtop' (or cat /proc/slabinfo) under
load and look for ip_vs_conn. On my x86-32 platform it shows 240 bytes;
on x86-64 it should be more.
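
	As a rough way to turn that into a total, a small shell sketch,
assuming the usual /proc/slabinfo column layout (4th field is the object
size) and a purely hypothetical count of one million templates:

# object size in bytes of one ip_vs_conn entry (templates use the same cache)
awk '/^ip_vs_conn / {print $4}' /proc/slabinfo
# 1,000,000 templates at 240 bytes each is about 228 MB
echo $((1000000 * 240 / 1024 / 1024))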

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] ipvs: add sync_persist_mode flag
  2013-06-19  9:03           ` Julian Anastasov
  2013-06-19 19:25             ` Julian Anastasov
  2013-06-19 20:44             ` Aleksey Chudov
@ 2013-06-22 11:20             ` Aleksey Chudov
  2013-06-22 12:43               ` Julian Anastasov
  2 siblings, 1 reply; 52+ messages in thread
From: Aleksey Chudov @ 2013-06-22 11:20 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

On 19.06.2013 12:03, Julian Anastasov wrote:
> On Tue, 28 May 2013, Aleksey Chudov wrote:
>
>> Currently we are using multiple active / standby server pairs and synchronize
>> them with each other, so half of the servers are constantly doing nothing. We
>> are looking into how to use all the servers in active / active mode while
>> maintaining high availability and session persistence in case of failure of
>> one of the load balancers. Unfortunately, the proposed stateless scheme with
>> the SH scheduler and Sloppy TCP is not suitable for us, since we are using the
>> WLC and WRR schedulers. As you mentioned, the SH scheduler has several
>> drawbacks because of which we cannot use it. Also, we cannot synchronize all
>> connections between all servers, since it would require a lot of memory and
>> the lookup in such a huge connection table is likely to be slower.
>>
>> But we can solve the sync problem in the same way as conntrackd does, which
>> allows filtering by flow state. The easiest option is to filter only on the
>> IP_VS_CONN_F_TEMPLATE state. Thus, if all the load balancers sync persistent
>> templates with each other, then even if one of the load balancers fails, most
>> users will remain on the same real servers. Of course, without the full sync
>> clients must reestablish TCP connections, but for this case we can
>> use Sloppy TCP to create a TCP connection state on any TCP packet.
>>
>> What do you think of this idea?
> 	Here is something that is compile-tested. You
> will need the "ipvs: sloppy TCP and SCTP" patch by
> Alexander Frolkin posted on 13 Jun. Let me know if
> you need more help in applying and testing such patches,
> so that we can be more confident when releasing such
> an optimization officially.
>
> From: Julian Anastasov <ja@ssi.bg>
> [PATCH] ipvs: add sync_persist_mode flag
>
> Add sync_persist_mode flag to reduce sync traffic
> by syncing only persistent templates.
>
>

I tested the changes on one pair of LAN servers. After turning on 
sync_persist_mode, synchronization traffic decreased by a factor of four! 
Also, on the LVS Backup I can see only persistent connections.

First of all, the kernel on both servers was updated. Then the LVS 
Backup was rebooted to drop all connections. After the reboot, sync 
was disabled and all connection counters were zero. When I enabled sync 
again, almost all persistent connections from the LVS Master were synced 
to the LVS Backup. But I can also see 0.04% of the connections in 
ESTABLISHED state, although they should not be there! After disabling 
sync, all connections on the LVS Backup completely disappeared after 
about 5 minutes.

After turning on sloppy_tcp, the IP address was moved from the LVS Master 
to the LVS Backup. According to our statistics, most clients remained on 
the same real servers. I don't see any problems with sloppy_tcp.

Aleksey

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] ipvs: add sync_persist_mode flag
  2013-06-22 11:20             ` [PATCH] ipvs: add sync_persist_mode flag Aleksey Chudov
@ 2013-06-22 12:43               ` Julian Anastasov
  2013-06-22 21:11                 ` Aleksey Chudov
  0 siblings, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-06-22 12:43 UTC (permalink / raw)
  To: Aleksey Chudov; +Cc: lvs-devel


	Hello,

On Sat, 22 Jun 2013, Aleksey Chudov wrote:

> > From: Julian Anastasov <ja@ssi.bg>
> > [PATCH] ipvs: add sync_persist_mode flag
> > 
> > Add sync_persist_mode flag to reduce sync traffic
> > by syncing only persistent templates.
> > 
> > 
> 
> I tested the changes on one pair of LAN servers. After turning on
> sync_persist_mode, synchronization traffic decreased by a factor of four!
> Also, on the LVS Backup I can see only persistent connections.

	Thanks! Can we assume that you have a small number
of connections from every client, say 1-2 per IP? Or is a small
sync_refresh_period value used? Can you show all the
sync* sysctl parameters that were used?

> First of all, the kernel on both servers was updated. Then the LVS Backup
> was rebooted to drop all connections. After the reboot, sync was disabled and
> all connection counters were zero. When I enabled sync again, almost all
> persistent connections from the LVS Master were synced to the LVS Backup. But
> I can also see 0.04% of the connections in ESTABLISHED state, although they
> should not be there! After disabling sync, all connections on the LVS Backup
> completely disappeared after about 5 minutes.

	Is it possible that some of your services have
persistence disabled? The sync_persist_mode=1 setting
will not affect non-persistent services; their synchronisation
should work as before.

	You can also try to grep for such established conns:

# Find such ESTABLISHED conns
grep ESTAB ip_vs_conn

# and see what kind of conns we have from such client IPs, do
# we have persistence templates, etc.
grep client_ip ip_vs_conn
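
	(Spelled out with full paths, assuming the usual /proc/net
location; 192.0.2.10 stands in for a real client IP:)

grep ESTAB /proc/net/ip_vs_conn
grep 192.0.2.10 /proc/net/ip_vs_conn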

> After turning on sloppy_tcp, the IP address was moved from the LVS Master to
> the LVS Backup. According to our statistics, most clients remained on the same
> real servers. I don't see any problems with sloppy_tcp.

	So, at least sloppy_tcp appears to work :)

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] ipvs: add sync_persist_mode flag
  2013-06-22 12:43               ` Julian Anastasov
@ 2013-06-22 21:11                 ` Aleksey Chudov
  2013-06-23  8:34                   ` Julian Anastasov
  0 siblings, 1 reply; 52+ messages in thread
From: Aleksey Chudov @ 2013-06-22 21:11 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hello,

On 22.06.2013 15:43, Julian Anastasov wrote:
>
>> I tested the changes on one pair of LAN servers. After turning on
>> sync_persist_mode, synchronization traffic decreased by a factor of four!
>> Also, on the LVS Backup I can see only persistent connections.
> 	Thanks! Can we assume that you have a small number
> of connections from every client, say 1-2 per IP? Or is a small
> sync_refresh_period value used? Can you show all the
> sync* sysctl parameters that were used?

Each client opens about 3 connections.

Below are all the ipvs sysctl parameters:

# sysctl -a | grep net\.ipv4\.vs\.
net.ipv4.vs.amemthresh = 1048576
net.ipv4.vs.am_droprate = 10
net.ipv4.vs.drop_entry = 1
net.ipv4.vs.drop_packet = 0
net.ipv4.vs.conntrack = 0
net.ipv4.vs.secure_tcp = 1
net.ipv4.vs.snat_reroute = 1
net.ipv4.vs.sync_version = 1
net.ipv4.vs.sync_ports = 16
net.ipv4.vs.sync_persist_mode = 1
net.ipv4.vs.cache_bypass = 0
net.ipv4.vs.expire_nodest_conn = 1
net.ipv4.vs.sloppy_tcp = 1
net.ipv4.vs.sloppy_sctp = 0
net.ipv4.vs.expire_quiescent_template = 1
net.ipv4.vs.sync_threshold = 0  0
net.ipv4.vs.sync_refresh_period = 200
net.ipv4.vs.sync_retries = 0
net.ipv4.vs.nat_icmp_send = 0
net.ipv4.vs.lblc_expiration = 86400
net.ipv4.vs.lblcr_expiration = 86400


>> First of all, the kernel on both servers was updated. Then the LVS Backup
>> was rebooted to drop all connections. After the reboot, sync was disabled and
>> all connection counters were zero. When I enabled sync again, almost all
>> persistent connections from the LVS Master were synced to the LVS Backup. But
>> I can also see 0.04% of the connections in ESTABLISHED state, although they
>> should not be there! After disabling sync, all connections on the LVS Backup
>> completely disappeared after about 5 minutes.
> 	Is it possible that some of your services have
> persistence disabled? The sync_persist_mode=1 setting
> will not affect non-persistent services; their synchronisation
> should work as before.

There is only one service on this pair of LVS servers:
ipvsadm -A -f 1 -s wlc -p 300
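
This is an fwmark service, so traffic would reach it via something like
the usual iptables mangle rule (192.0.2.1 only stands in for the real VIP):

# mark packets for the VIP with fwmark 1 so IPVS matches the -f 1 service
iptables -t mangle -A PREROUTING -d 192.0.2.1 -j MARK --set-mark 1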

Maybe some connection on the LVS Master is erroneously considered non-persistent?

> 	You can also try to grep for such established conns:
>
> # Find such ESTABLISHED conns
> grep ESTAB ip_vs_conn
>
> # and see what kind of conns we have from such client IPs, do
> # we have persistence templates, etc.
> grep client_ip ip_vs_conn

For most ESTABLISHED conns on the LVS Backup there are also persistence 
templates, on both the LVS Backup and the LVS Master.


On 22.06.2013 15:53, Julian Anastasov wrote:
>
> 	I checked this file, there are 50 EST conns to
> X.X.X.X, is it the same VIP that is part of the persistent
> services? All these conns are at their end of life, so
> maybe in the last 15 mins there were more like them?
>

Yes. There is only one service. And there are always about 50 - 70 
ESTABLISHED conns on the LVS Backup. This should not create any problems; 
I just do not understand where they came from.

They look like "end of life" conns because of the reduced default tcp timeout.

# ipvsadm -l --timeout
Timeout (tcp tcpfin udp): 150 60 300
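
For reference, non-default timeouts like these are normally set with the
standard ipvsadm syntax, something along the lines of:

# set the tcp, tcpfin and udp timeouts (in seconds)
ipvsadm --set 150 60 300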


Aleksey

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] ipvs: add sync_persist_mode flag
  2013-06-22 21:11                 ` Aleksey Chudov
@ 2013-06-23  8:34                   ` Julian Anastasov
  2013-06-24 14:37                     ` Aleksey Chudov
  0 siblings, 1 reply; 52+ messages in thread
From: Julian Anastasov @ 2013-06-23  8:34 UTC (permalink / raw)
  To: Aleksey Chudov; +Cc: lvs-devel


	Hello,

On Sun, 23 Jun 2013, Aleksey Chudov wrote:

> Maybe some connection on the LVS Master is erroneously considered non-persistent?

	Maybe we can learn more if we
grep ESTAB ip_vs_conn_sync

	It will show SYNC or LOCAL. As for the persistent
conns, I assume they are all SYNC.

	I see this:

- for some IPs from these 50 ESTABLISHED conns there are
persistent templates, some pointing to the same real server, others
to a different real server

- there are only ESTABLISHED TCP conns, no closing states
such as TIME_WAIT, etc. One possibility is that the traffic is
some outdated retransmissions with the ACK bit set. But this
can happen only if these ESTABLISHED conns are LOCAL on the
Backup; if they were SYNC, the Master would have assigned them a
real server by using the persistence scheduling. And
if they had a persistent template assigned via cp->control,
they would not be synced when sync_persist_mode is 1.

	So, maybe only the SYNC/LOCAL flag can
give us more information.
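
	A small sketch of what I mean, assuming the usual
/proc/net/ip_vs_conn_sync path and its stock Origin column:

# count the ESTABLISHED conns per origin flag
grep ESTAB /proc/net/ip_vs_conn_sync | grep -c LOCAL
grep ESTAB /proc/net/ip_vs_conn_sync | grep -c SYNC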

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] ipvs: add sync_persist_mode flag
  2013-06-23  8:34                   ` Julian Anastasov
@ 2013-06-24 14:37                     ` Aleksey Chudov
  2013-06-24 19:57                       ` Julian Anastasov
  0 siblings, 1 reply; 52+ messages in thread
From: Aleksey Chudov @ 2013-06-24 14:37 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: lvs-devel

Hello,

The pair of LVS servers with the modified kernel worked under load for 
several days without any problems.

I completed several additional tests. I analyzed the traffic to the LVS 
servers using tcpdump. There is no spike of SYN or RST packets at the time 
the IP address is migrated, so most of the connections continued to 
operate normally. I also once again made sure that most clients remain on 
the same real servers.
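
A sketch of the kind of capture filter used, with the interface name only
as an example:

# watch for a SYN/RST spike around the failover
tcpdump -ni eth0 'tcp[tcpflags] & (tcp-syn|tcp-rst) != 0'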

It seems that sync_persist_mode combined with sloppy_tcp works fine.


Regards,
Aleksey

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH] ipvs: add sync_persist_mode flag
  2013-06-24 14:37                     ` Aleksey Chudov
@ 2013-06-24 19:57                       ` Julian Anastasov
  0 siblings, 0 replies; 52+ messages in thread
From: Julian Anastasov @ 2013-06-24 19:57 UTC (permalink / raw)
  To: Aleksey Chudov; +Cc: lvs-devel


	Hello,

On Mon, 24 Jun 2013, Aleksey Chudov wrote:

> Hello,
> 
> The pair of LVS servers with the modified kernel worked under load for
> several days without any problems.
> 
> I completed several additional tests. I analyzed the traffic to the LVS
> servers using tcpdump. There is no spike of SYN or RST packets at the time
> the IP address is migrated, so most of the connections continued to operate
> normally. I also once again made sure that most clients remain on the same
> real servers.
> 
> It seems that sync_persist_mode combined with sloppy_tcp works fine.

	Thanks! I just posted the patch.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 52+ messages in thread

end of thread, other threads:[~2013-06-24 19:57 UTC | newest]

Thread overview: 52+ messages
2013-05-24 12:09 [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling Alexander Frolkin
2013-05-24 15:05 ` Julian Anastasov
2013-05-24 15:14   ` Alexander Frolkin
2013-05-24 16:18     ` Aleksey Chudov
2013-05-27 21:31       ` Julian Anastasov
2013-05-28 13:41         ` Aleksey Chudov
2013-05-30  6:37           ` Julian Anastasov
2013-06-07  7:53             ` Alexander Frolkin
2013-06-19  9:03           ` Julian Anastasov
2013-06-19 19:25             ` Julian Anastasov
2013-06-20 17:02               ` Aleksey Chudov
2013-06-20 20:09                 ` Julian Anastasov
2013-06-19 20:44             ` Aleksey Chudov
2013-06-22 11:20             ` [PATCH] ipvs: add sync_persist_mode flag Aleksey Chudov
2013-06-22 12:43               ` Julian Anastasov
2013-06-22 21:11                 ` Aleksey Chudov
2013-06-23  8:34                   ` Julian Anastasov
2013-06-24 14:37                     ` Aleksey Chudov
2013-06-24 19:57                       ` Julian Anastasov
2013-05-27 21:11     ` [PATCH] Sloppy TCP, SH rebalancing, SHP scheduling Julian Anastasov
2013-06-07  8:12       ` Alexander Frolkin
2013-06-10 19:31         ` Julian Anastasov
2013-06-11  8:38           ` Alexander Frolkin
2013-06-11 19:57             ` Julian Anastasov
2013-06-12 14:10               ` Alexander Frolkin
2013-06-12 20:47                 ` Julian Anastasov
2013-06-13  8:38                   ` Alexander Frolkin
2013-06-13 12:56                   ` Alexander Frolkin
2013-06-13 19:50                     ` Julian Anastasov
2013-06-13 14:18                   ` Alexander Frolkin
2013-06-13 20:31                     ` Julian Anastasov
2013-06-14 10:22                       ` Alexander Frolkin
2013-06-16  6:52                         ` Julian Anastasov
2013-06-17  8:32                           ` Alexander Frolkin
2013-06-17  9:00                             ` Julian Anastasov
2013-06-17  9:04                             ` Julian Anastasov
2013-06-17 11:11                               ` Alexander Frolkin
2013-06-17 20:05                                 ` Julian Anastasov
2013-06-18  9:30                                   ` Alexander Frolkin
2013-06-18 20:52                                     ` Julian Anastasov
2013-06-14 11:47                       ` Alexander Frolkin
2013-06-16  8:30                         ` Julian Anastasov
2013-06-17 10:35                           ` Alexander Frolkin
2013-06-17 19:48                             ` Julian Anastasov
2013-06-18  9:08                               ` Alexander Frolkin
2013-06-18 20:41                                 ` Julian Anastasov
2013-06-10 15:12       ` Alexander Frolkin
2013-06-10 16:03         ` Alexander Frolkin
2013-06-10 20:52         ` Julian Anastasov
2013-06-11 12:38           ` Alexander Frolkin
2013-06-11 20:13             ` Julian Anastasov
2013-06-12 10:49               ` Alexander Frolkin
