All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] ipvs: allow rescheduling of new connections when port reuse is detected
@ 2015-02-23 18:02 Marcelo Ricardo Leitner
  2015-02-23 20:07 ` Julian Anastasov
  0 siblings, 1 reply; 3+ messages in thread
From: Marcelo Ricardo Leitner @ 2015-02-23 18:02 UTC (permalink / raw)
  To: lvs-devel; +Cc: Julian Anastasov

Currently, when TCP/SCTP port reusing happens, IPVS will find the old
entry and use it for the new one, behaving like a forced persistence.
But if you consider a cluster with a heavy load of small connections,
such reuse will happen often and may lead to suboptimal load
balancing and might prevent a new node from getting its fair share.

This patch introduces a new sysctl, conn_reuse_mode, that allows
controlling how to proceed when port reuse is detected. The default
value will allow rescheduling of new connections only if the old entry
was in TIME_WAIT state for TCP or CLOSED for SCTP.

Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
---

Notes:
    v1->v2:
      updated to add kfree(param->pe_data) in ip_vs_proc_conn() chunk

 Documentation/networking/ipvs-sysctl.txt | 21 ++++++++++++++++++++
 include/net/ip_vs.h                      | 11 +++++++++++
 net/netfilter/ipvs/ip_vs_core.c          | 33 ++++++++++++++++++++++++++++----
 net/netfilter/ipvs/ip_vs_ctl.c           |  8 ++++++++
 net/netfilter/ipvs/ip_vs_sync.c          | 21 ++++++++++++++++++--
 5 files changed, 88 insertions(+), 6 deletions(-)

diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt
index 7a3c047295914cbc8c4273506a9b6d35246a1750..3ba709531adba970595251fa73d6d471ed14c5c1 100644
--- a/Documentation/networking/ipvs-sysctl.txt
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -22,6 +22,27 @@ backup_only - BOOLEAN
 	If set, disable the director function while the server is
 	in backup mode to avoid packet loops for DR/TUN methods.
 
+conn_reuse_mode - INTEGER
+	1 - default
+
+	Controls how ipvs will deal with connections for which port
+	reuse is detected. It is a bitmap, with the values being:
+
+	0: disable any special handling on port reuse. The new
+	connection will be delivered to the same real server that was
+	servicing the previous connection. This will effectively
+	disable expire_nodest_conn.
+
+	bit 1: enable rescheduling of new connections when it is safe.
+	That is, whenever expire_nodest_conn applies and, for TCP
+	sockets, when the connection is in TIME_WAIT state (which is
+	only possible if you use NAT mode).
+
+	bit 2: it is bit 1 plus, for TCP connections, when connections
+	are in FIN_WAIT state, as this is the last state seen by the
+	load balancer in Direct Routing mode. This bit helps when adding
+	new real servers to a very busy cluster.
+
 conntrack - BOOLEAN
 	0 - disabled (default)
 	not 0 - enabled
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 615b20b585452111a25085890d8fa875657dbe76..6c7ee0ae7ef1694671e4b6af0906b2fa077f5c7c 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -924,6 +924,7 @@ struct netns_ipvs {
 	int			sysctl_nat_icmp_send;
 	int			sysctl_pmtu_disc;
 	int			sysctl_backup_only;
+	int			sysctl_conn_reuse_mode;
 
 	/* ip_vs_lblc */
 	int			sysctl_lblc_expiration;
@@ -1042,6 +1043,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
 	       ipvs->sysctl_backup_only;
 }
 
+static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
+{
+	return ipvs->sysctl_conn_reuse_mode;
+}
+
 #else
 
 static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
@@ -1109,6 +1115,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
 	return 0;
 }
 
+static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
+{
+	return 1;
+}
+
 #endif
 
 /* IPVS core functions
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b87ca32efa0b4e6edc7f251c2c32c4ba3b55659c..3ec9b1a54024fa421f330cf1d0eeb67da9683127 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1046,6 +1046,26 @@ static inline bool is_new_conn(const struct sk_buff *skb,
 	}
 }
 
+static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
+					int conn_reuse_mode)
+{
+	/* Controlled (FTP DATA or persistence)? */
+	if (cp->control)
+		return false;
+
+	switch (cp->protocol) {
+	case IPPROTO_TCP:
+		return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
+			((conn_reuse_mode & 2) &&
+			 (cp->state == IP_VS_TCP_S_FIN_WAIT) &&
+			 (cp->flags & IP_VS_CONN_F_NOOUTPUT));
+	case IPPROTO_SCTP:
+		return cp->state == IP_VS_SCTP_S_CLOSED;
+	default:
+		return false;
+	}
+}
+
 /* Handle response packets: rewrite addresses and send away...
  */
 static unsigned int
@@ -1585,6 +1605,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	struct ip_vs_conn *cp;
 	int ret, pkts;
 	struct netns_ipvs *ipvs;
+	int conn_reuse_mode;
 
 	/* Already marked as IPVS request or reply? */
 	if (skb->ipvs_property)
@@ -1653,10 +1674,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	 */
 	cp = pp->conn_in_get(af, skb, &iph, 0);
 
-	if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
-	    unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
-	    is_new_conn(skb, &iph)) {
-		ip_vs_conn_expire_now(cp);
+	conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
+	if (conn_reuse_mode && !iph.fragoffs &&
+	    is_new_conn(skb, &iph) && cp &&
+	    ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
+	      unlikely(!atomic_read(&cp->dest->weight))) ||
+	     unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
+		if (!atomic_read(&cp->n_control))
+			ip_vs_conn_expire_now(cp);
 		__ip_vs_conn_put(cp);
 		cp = NULL;
 	}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index e55759056361c47ed1fcfa5c656541ba39bfd260..ec7f6f1e07cee1d15a6f839defc86aec8abd821e 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1808,6 +1808,12 @@ static struct ctl_table vs_vars[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "conn_reuse_mode",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #ifdef CONFIG_IP_VS_DEBUG
 	{
 		.procname	= "debug_level",
@@ -3732,6 +3738,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
 	ipvs->sysctl_pmtu_disc = 1;
 	tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
 	tbl[idx++].data = &ipvs->sysctl_backup_only;
+	ipvs->sysctl_conn_reuse_mode = 1;
+	tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
 
 
 	ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index c47ffd7a0a709cb73834c84652f251960f25db79..f96229cdb6e184543b6b958575c08c5a3c1b4b72 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -845,10 +845,27 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 	struct ip_vs_conn *cp;
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	if (!(flags & IP_VS_CONN_F_TEMPLATE))
+	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
 		cp = ip_vs_conn_in_get(param);
-	else
+		if (cp && ((cp->dport != dport) ||
+			   !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
+			if (!(flags & IP_VS_CONN_F_INACTIVE)) {
+				ip_vs_conn_expire_now(cp);
+				__ip_vs_conn_put(cp);
+				cp = NULL;
+			} else {
+				/* This is the expiration message for the
+				 * connection that was already replaced, so we
+				 * just ignore it.
+				 */
+				__ip_vs_conn_put(cp);
+				kfree(param->pe_data);
+				return;
+			}
+		}
+	} else {
 		cp = ip_vs_ct_in_get(param);
+	}
 
 	if (cp) {
 		/* Free pe_data */
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] ipvs: allow rescheduling of new connections when port reuse is detected
  2015-02-23 18:02 [PATCH v2] ipvs: allow rescheduling of new connections when port reuse is detected Marcelo Ricardo Leitner
@ 2015-02-23 20:07 ` Julian Anastasov
  2015-02-25  4:46   ` Simon Horman
  0 siblings, 1 reply; 3+ messages in thread
From: Julian Anastasov @ 2015-02-23 20:07 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner; +Cc: lvs-devel, Simon Horman


	Hello,

On Mon, 23 Feb 2015, Marcelo Ricardo Leitner wrote:

> Currently, when TCP/SCTP port reusing happens, IPVS will find the old
> entry and use it for the new one, behaving like a forced persistence.
> But if you consider a cluster with a heavy load of small connections,
> such reuse will happen often and may lead to a not optimal load
> balancing and might prevent a new node from getting a fair load.
> 
> This patch introduces a new sysctl, conn_reuse_mode, that allows
> controlling how to proceed when port reuse is detected. The default
> value will allow rescheduling of new connections only if the old entry
> was in TIME_WAIT state for TCP or CLOSED for SCTP.
> 
> Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>

	Thanks, looks good. Simon, please apply to ipvs-next.

Signed-off-by: Julian Anastasov <ja@ssi.bg>

> ---
> 
> Notes:
>     v1->v2:
>       updated to add kfree(param->pe_data) in ip_vs_proc_conn() chunk
> 
>  Documentation/networking/ipvs-sysctl.txt | 21 ++++++++++++++++++++
>  include/net/ip_vs.h                      | 11 +++++++++++
>  net/netfilter/ipvs/ip_vs_core.c          | 33 ++++++++++++++++++++++++++++----
>  net/netfilter/ipvs/ip_vs_ctl.c           |  8 ++++++++
>  net/netfilter/ipvs/ip_vs_sync.c          | 21 ++++++++++++++++++--
>  5 files changed, 88 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt
> index 7a3c047295914cbc8c4273506a9b6d35246a1750..3ba709531adba970595251fa73d6d471ed14c5c1 100644
> --- a/Documentation/networking/ipvs-sysctl.txt
> +++ b/Documentation/networking/ipvs-sysctl.txt
> @@ -22,6 +22,27 @@ backup_only - BOOLEAN
>  	If set, disable the director function while the server is
>  	in backup mode to avoid packet loops for DR/TUN methods.
>  
> +conn_reuse_mode - INTEGER
> +	1 - default
> +
> +	Controls how ipvs will deal with connections that are detected
> +	port reuse. It is a bitmap, with the values being:
> +
> +	0: disable any special handling on port reuse. The new
> +	connection will be delivered to the same real server that was
> +	servicing the previous connection. This will effectively
> +	disable expire_nodest_conn.
> +
> +	bit 1: enable rescheduling of new connections when it is safe.
> +	That is, whenever expire_nodest_conn and for TCP sockets, when
> +	the connection is in TIME_WAIT state (which is only possible if
> +	you use NAT mode).
> +
> +	bit 2: it is bit 1 plus, for TCP connections, when connections
> +	are in FIN_WAIT state, as this is the last state seen by load
> +	balancer in Direct Routing mode. This bit helps on adding new
> +	real servers to a very busy cluster.
> +
>  conntrack - BOOLEAN
>  	0 - disabled (default)
>  	not 0 - enabled
> diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> index 615b20b585452111a25085890d8fa875657dbe76..6c7ee0ae7ef1694671e4b6af0906b2fa077f5c7c 100644
> --- a/include/net/ip_vs.h
> +++ b/include/net/ip_vs.h
> @@ -924,6 +924,7 @@ struct netns_ipvs {
>  	int			sysctl_nat_icmp_send;
>  	int			sysctl_pmtu_disc;
>  	int			sysctl_backup_only;
> +	int			sysctl_conn_reuse_mode;
>  
>  	/* ip_vs_lblc */
>  	int			sysctl_lblc_expiration;
> @@ -1042,6 +1043,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
>  	       ipvs->sysctl_backup_only;
>  }
>  
> +static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
> +{
> +	return ipvs->sysctl_conn_reuse_mode;
> +}
> +
>  #else
>  
>  static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
> @@ -1109,6 +1115,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
>  	return 0;
>  }
>  
> +static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
> +{
> +	return 1;
> +}
> +
>  #endif
>  
>  /* IPVS core functions
> diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
> index b87ca32efa0b4e6edc7f251c2c32c4ba3b55659c..3ec9b1a54024fa421f330cf1d0eeb67da9683127 100644
> --- a/net/netfilter/ipvs/ip_vs_core.c
> +++ b/net/netfilter/ipvs/ip_vs_core.c
> @@ -1046,6 +1046,26 @@ static inline bool is_new_conn(const struct sk_buff *skb,
>  	}
>  }
>  
> +static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
> +					int conn_reuse_mode)
> +{
> +	/* Controlled (FTP DATA or persistence)? */
> +	if (cp->control)
> +		return false;
> +
> +	switch (cp->protocol) {
> +	case IPPROTO_TCP:
> +		return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
> +			((conn_reuse_mode & 2) &&
> +			 (cp->state == IP_VS_TCP_S_FIN_WAIT) &&
> +			 (cp->flags & IP_VS_CONN_F_NOOUTPUT));
> +	case IPPROTO_SCTP:
> +		return cp->state == IP_VS_SCTP_S_CLOSED;
> +	default:
> +		return false;
> +	}
> +}
> +
>  /* Handle response packets: rewrite addresses and send away...
>   */
>  static unsigned int
> @@ -1585,6 +1605,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
>  	struct ip_vs_conn *cp;
>  	int ret, pkts;
>  	struct netns_ipvs *ipvs;
> +	int conn_reuse_mode;
>  
>  	/* Already marked as IPVS request or reply? */
>  	if (skb->ipvs_property)
> @@ -1653,10 +1674,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
>  	 */
>  	cp = pp->conn_in_get(af, skb, &iph, 0);
>  
> -	if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
> -	    unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
> -	    is_new_conn(skb, &iph)) {
> -		ip_vs_conn_expire_now(cp);
> +	conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
> +	if (conn_reuse_mode && !iph.fragoffs &&
> +	    is_new_conn(skb, &iph) && cp &&
> +	    ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
> +	      unlikely(!atomic_read(&cp->dest->weight))) ||
> +	     unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
> +		if (!atomic_read(&cp->n_control))
> +			ip_vs_conn_expire_now(cp);
>  		__ip_vs_conn_put(cp);
>  		cp = NULL;
>  	}
> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> index e55759056361c47ed1fcfa5c656541ba39bfd260..ec7f6f1e07cee1d15a6f839defc86aec8abd821e 100644
> --- a/net/netfilter/ipvs/ip_vs_ctl.c
> +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> @@ -1808,6 +1808,12 @@ static struct ctl_table vs_vars[] = {
>  		.mode		= 0644,
>  		.proc_handler	= proc_dointvec,
>  	},
> +	{
> +		.procname	= "conn_reuse_mode",
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
>  #ifdef CONFIG_IP_VS_DEBUG
>  	{
>  		.procname	= "debug_level",
> @@ -3732,6 +3738,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
>  	ipvs->sysctl_pmtu_disc = 1;
>  	tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
>  	tbl[idx++].data = &ipvs->sysctl_backup_only;
> +	ipvs->sysctl_conn_reuse_mode = 1;
> +	tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
>  
>  
>  	ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
> diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
> index c47ffd7a0a709cb73834c84652f251960f25db79..f96229cdb6e184543b6b958575c08c5a3c1b4b72 100644
> --- a/net/netfilter/ipvs/ip_vs_sync.c
> +++ b/net/netfilter/ipvs/ip_vs_sync.c
> @@ -845,10 +845,27 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
>  	struct ip_vs_conn *cp;
>  	struct netns_ipvs *ipvs = net_ipvs(net);
>  
> -	if (!(flags & IP_VS_CONN_F_TEMPLATE))
> +	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
>  		cp = ip_vs_conn_in_get(param);
> -	else
> +		if (cp && ((cp->dport != dport) ||
> +			   !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
> +			if (!(flags & IP_VS_CONN_F_INACTIVE)) {
> +				ip_vs_conn_expire_now(cp);
> +				__ip_vs_conn_put(cp);
> +				cp = NULL;
> +			} else {
> +				/* This is the expiration message for the
> +				 * connection that was already replaced, so we
> +				 * just ignore it.
> +				 */
> +				__ip_vs_conn_put(cp);
> +				kfree(param->pe_data);
> +				return;
> +			}
> +		}
> +	} else {
>  		cp = ip_vs_ct_in_get(param);
> +	}
>  
>  	if (cp) {
>  		/* Free pe_data */
> -- 
> 1.9.3

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] ipvs: allow rescheduling of new connections when port reuse is detected
  2015-02-23 20:07 ` Julian Anastasov
@ 2015-02-25  4:46   ` Simon Horman
  0 siblings, 0 replies; 3+ messages in thread
From: Simon Horman @ 2015-02-25  4:46 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: Marcelo Ricardo Leitner, lvs-devel

On Mon, Feb 23, 2015 at 10:07:21PM +0200, Julian Anastasov wrote:
> 
> 	Hello,
> 
> On Mon, 23 Feb 2015, Marcelo Ricardo Leitner wrote:
> 
> > Currently, when TCP/SCTP port reusing happens, IPVS will find the old
> > entry and use it for the new one, behaving like a forced persistence.
> > But if you consider a cluster with a heavy load of small connections,
> > such reuse will happen often and may lead to a not optimal load
> > balancing and might prevent a new node from getting a fair load.
> > 
> > This patch introduces a new sysctl, conn_reuse_mode, that allows
> > controlling how to proceed when port reuse is detected. The default
> > value will allow rescheduling of new connections only if the old entry
> > was in TIME_WAIT state for TCP or CLOSED for SCTP.
> > 
> > Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
> 
> 	Thanks, looks good. Simon, please apply to ipvs-next.
> 
> Signed-off-by: Julian Anastasov <ja@ssi.bg>

Thanks, applied to ipvs-next.

> > ---
> > 
> > Notes:
> >     v1->v2:
> >       updated to add kfree(param->pe_data) in ip_vs_proc_conn() chunk
> > 
> >  Documentation/networking/ipvs-sysctl.txt | 21 ++++++++++++++++++++
> >  include/net/ip_vs.h                      | 11 +++++++++++
> >  net/netfilter/ipvs/ip_vs_core.c          | 33 ++++++++++++++++++++++++++++----
> >  net/netfilter/ipvs/ip_vs_ctl.c           |  8 ++++++++
> >  net/netfilter/ipvs/ip_vs_sync.c          | 21 ++++++++++++++++++--
> >  5 files changed, 88 insertions(+), 6 deletions(-)
> > 
> > diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt
> > index 7a3c047295914cbc8c4273506a9b6d35246a1750..3ba709531adba970595251fa73d6d471ed14c5c1 100644
> > --- a/Documentation/networking/ipvs-sysctl.txt
> > +++ b/Documentation/networking/ipvs-sysctl.txt
> > @@ -22,6 +22,27 @@ backup_only - BOOLEAN
> >  	If set, disable the director function while the server is
> >  	in backup mode to avoid packet loops for DR/TUN methods.
> >  
> > +conn_reuse_mode - INTEGER
> > +	1 - default
> > +
> > +	Controls how ipvs will deal with connections that are detected
> > +	port reuse. It is a bitmap, with the values being:
> > +
> > +	0: disable any special handling on port reuse. The new
> > +	connection will be delivered to the same real server that was
> > +	servicing the previous connection. This will effectively
> > +	disable expire_nodest_conn.
> > +
> > +	bit 1: enable rescheduling of new connections when it is safe.
> > +	That is, whenever expire_nodest_conn and for TCP sockets, when
> > +	the connection is in TIME_WAIT state (which is only possible if
> > +	you use NAT mode).
> > +
> > +	bit 2: it is bit 1 plus, for TCP connections, when connections
> > +	are in FIN_WAIT state, as this is the last state seen by load
> > +	balancer in Direct Routing mode. This bit helps on adding new
> > +	real servers to a very busy cluster.
> > +
> >  conntrack - BOOLEAN
> >  	0 - disabled (default)
> >  	not 0 - enabled
> > diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> > index 615b20b585452111a25085890d8fa875657dbe76..6c7ee0ae7ef1694671e4b6af0906b2fa077f5c7c 100644
> > --- a/include/net/ip_vs.h
> > +++ b/include/net/ip_vs.h
> > @@ -924,6 +924,7 @@ struct netns_ipvs {
> >  	int			sysctl_nat_icmp_send;
> >  	int			sysctl_pmtu_disc;
> >  	int			sysctl_backup_only;
> > +	int			sysctl_conn_reuse_mode;
> >  
> >  	/* ip_vs_lblc */
> >  	int			sysctl_lblc_expiration;
> > @@ -1042,6 +1043,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
> >  	       ipvs->sysctl_backup_only;
> >  }
> >  
> > +static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
> > +{
> > +	return ipvs->sysctl_conn_reuse_mode;
> > +}
> > +
> >  #else
> >  
> >  static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
> > @@ -1109,6 +1115,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
> >  	return 0;
> >  }
> >  
> > +static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
> > +{
> > +	return 1;
> > +}
> > +
> >  #endif
> >  
> >  /* IPVS core functions
> > diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
> > index b87ca32efa0b4e6edc7f251c2c32c4ba3b55659c..3ec9b1a54024fa421f330cf1d0eeb67da9683127 100644
> > --- a/net/netfilter/ipvs/ip_vs_core.c
> > +++ b/net/netfilter/ipvs/ip_vs_core.c
> > @@ -1046,6 +1046,26 @@ static inline bool is_new_conn(const struct sk_buff *skb,
> >  	}
> >  }
> >  
> > +static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
> > +					int conn_reuse_mode)
> > +{
> > +	/* Controlled (FTP DATA or persistence)? */
> > +	if (cp->control)
> > +		return false;
> > +
> > +	switch (cp->protocol) {
> > +	case IPPROTO_TCP:
> > +		return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
> > +			((conn_reuse_mode & 2) &&
> > +			 (cp->state == IP_VS_TCP_S_FIN_WAIT) &&
> > +			 (cp->flags & IP_VS_CONN_F_NOOUTPUT));
> > +	case IPPROTO_SCTP:
> > +		return cp->state == IP_VS_SCTP_S_CLOSED;
> > +	default:
> > +		return false;
> > +	}
> > +}
> > +
> >  /* Handle response packets: rewrite addresses and send away...
> >   */
> >  static unsigned int
> > @@ -1585,6 +1605,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
> >  	struct ip_vs_conn *cp;
> >  	int ret, pkts;
> >  	struct netns_ipvs *ipvs;
> > +	int conn_reuse_mode;
> >  
> >  	/* Already marked as IPVS request or reply? */
> >  	if (skb->ipvs_property)
> > @@ -1653,10 +1674,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
> >  	 */
> >  	cp = pp->conn_in_get(af, skb, &iph, 0);
> >  
> > -	if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
> > -	    unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
> > -	    is_new_conn(skb, &iph)) {
> > -		ip_vs_conn_expire_now(cp);
> > +	conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
> > +	if (conn_reuse_mode && !iph.fragoffs &&
> > +	    is_new_conn(skb, &iph) && cp &&
> > +	    ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
> > +	      unlikely(!atomic_read(&cp->dest->weight))) ||
> > +	     unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
> > +		if (!atomic_read(&cp->n_control))
> > +			ip_vs_conn_expire_now(cp);
> >  		__ip_vs_conn_put(cp);
> >  		cp = NULL;
> >  	}
> > diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> > index e55759056361c47ed1fcfa5c656541ba39bfd260..ec7f6f1e07cee1d15a6f839defc86aec8abd821e 100644
> > --- a/net/netfilter/ipvs/ip_vs_ctl.c
> > +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> > @@ -1808,6 +1808,12 @@ static struct ctl_table vs_vars[] = {
> >  		.mode		= 0644,
> >  		.proc_handler	= proc_dointvec,
> >  	},
> > +	{
> > +		.procname	= "conn_reuse_mode",
> > +		.maxlen		= sizeof(int),
> > +		.mode		= 0644,
> > +		.proc_handler	= proc_dointvec,
> > +	},
> >  #ifdef CONFIG_IP_VS_DEBUG
> >  	{
> >  		.procname	= "debug_level",
> > @@ -3732,6 +3738,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
> >  	ipvs->sysctl_pmtu_disc = 1;
> >  	tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
> >  	tbl[idx++].data = &ipvs->sysctl_backup_only;
> > +	ipvs->sysctl_conn_reuse_mode = 1;
> > +	tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
> >  
> >  
> >  	ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
> > diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
> > index c47ffd7a0a709cb73834c84652f251960f25db79..f96229cdb6e184543b6b958575c08c5a3c1b4b72 100644
> > --- a/net/netfilter/ipvs/ip_vs_sync.c
> > +++ b/net/netfilter/ipvs/ip_vs_sync.c
> > @@ -845,10 +845,27 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
> >  	struct ip_vs_conn *cp;
> >  	struct netns_ipvs *ipvs = net_ipvs(net);
> >  
> > -	if (!(flags & IP_VS_CONN_F_TEMPLATE))
> > +	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
> >  		cp = ip_vs_conn_in_get(param);
> > -	else
> > +		if (cp && ((cp->dport != dport) ||
> > +			   !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
> > +			if (!(flags & IP_VS_CONN_F_INACTIVE)) {
> > +				ip_vs_conn_expire_now(cp);
> > +				__ip_vs_conn_put(cp);
> > +				cp = NULL;
> > +			} else {
> > +				/* This is the expiration message for the
> > +				 * connection that was already replaced, so we
> > +				 * just ignore it.
> > +				 */
> > +				__ip_vs_conn_put(cp);
> > +				kfree(param->pe_data);
> > +				return;
> > +			}
> > +		}
> > +	} else {
> >  		cp = ip_vs_ct_in_get(param);
> > +	}
> >  
> >  	if (cp) {
> >  		/* Free pe_data */
> > -- 
> > 1.9.3
> 
> Regards
> 
> --
> Julian Anastasov <ja@ssi.bg>
> 

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2015-02-25  4:46 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-02-23 18:02 [PATCH v2] ipvs: allow rescheduling of new connections when port reuse is detected Marcelo Ricardo Leitner
2015-02-23 20:07 ` Julian Anastasov
2015-02-25  4:46   ` Simon Horman

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.