netdev.vger.kernel.org archive mirror
* [RFC PATCH v2 net-next 0/3] tcp: add NV congestion control
@ 2015-07-22  4:21 Lawrence Brakmo
  2015-07-22  4:21 ` [RFC PATCH v2 net-next 1/3] tcp: replace cnt & rtt with struct in pkts_acked() Lawrence Brakmo
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Lawrence Brakmo @ 2015-07-22  4:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Neal Cardwell, Eric Dumazet

This patchset adds support for NV congestion control.

The first patch replaces two arguments in the pkts_acked() function
of the congestion control modules with a struct, making it easier to
add more parameters later without modifying the existing congestion
control modules.

The second patch records in tcp_skb_cb the number of bytes in flight at
the time a packet is sent, without increasing its size.

The third patch adds NV congestion control support.
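
For reference, patch 1 changes the pkts_acked() hook roughly as follows
(simplified excerpt; patch 2 later adds an in_flight field to the struct,
see the patches below for the complete change):

	struct ack_sample {
		u32 pkts_acked;
		s32 rtt_us;
	};

	/* before */
	void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us);
	/* after */
	void (*pkts_acked)(struct sock *sk, struct ack_sample);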

[RFC PATCH v2 net-next 1/3] tcp: replace cnt & rtt with struct in pkts_acked()
[RFC PATCH v2 net-next 2/3] tcp: add in_flight to tcp_skb_cb
[RFC PATCH v2 net-next 3/3] tcp: add NV congestion control

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>

 include/net/tcp.h          |  21 ++-
 net/ipv4/Kconfig           |  16 ++
 net/ipv4/Makefile          |   1 +
 net/ipv4/sysctl_net_ipv4.c |   9 +
 net/ipv4/tcp_bic.c         |   6 +-
 net/ipv4/tcp_cdg.c         |  14 +-
 net/ipv4/tcp_cubic.c       |   6 +-
 net/ipv4/tcp_htcp.c        |  10 +-
 net/ipv4/tcp_illinois.c    |  20 +-
 net/ipv4/tcp_input.c       |  12 +-
 net/ipv4/tcp_lp.c          |   6 +-
 net/ipv4/tcp_nv.c          | 479 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_output.c      |   4 +-
 net/ipv4/tcp_vegas.c       |   6 +-
 net/ipv4/tcp_vegas.h       |   2 +-
 net/ipv4/tcp_veno.c        |   6 +-
 net/ipv4/tcp_westwood.c    |   6 +-
 net/ipv4/tcp_yeah.c        |   6 +-
 18 files changed, 579 insertions(+), 51 deletions(-)


* [RFC PATCH v2 net-next 1/3] tcp: replace cnt & rtt with struct in pkts_acked()
  2015-07-22  4:21 [RFC PATCH v2 net-next 0/3] tcp: add NV congestion control Lawrence Brakmo
@ 2015-07-22  4:21 ` Lawrence Brakmo
  2015-07-22  5:46   ` Eric Dumazet
  2015-07-22  4:21 ` [RFC PATCH v2 net-next 2/3] tcp: add in_flight to tcp_skb_cb Lawrence Brakmo
  2015-07-22  4:21 ` [RFC PATCH v2 net-next 3/3] tcp: add NV congestion control Lawrence Brakmo
  2 siblings, 1 reply; 9+ messages in thread
From: Lawrence Brakmo @ 2015-07-22  4:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Neal Cardwell, Eric Dumazet

Replace 2 arguments (cnt and rtt) in the congestion control modules'
pkts_acked() function with a struct. This will allow adding more
information without having to modify existing congestion control
modules (tcp_nv in particular needs bytes in flight when packet
was sent).

This was proposed by Neal Cardwell in his comments to the tcp_nv patch.

Signed-off-by: Lawrence Brakmo <lawrence@brakmo.org>
---
 include/net/tcp.h       |  7 ++++++-
 net/ipv4/tcp_bic.c      |  6 +++---
 net/ipv4/tcp_cdg.c      | 14 +++++++-------
 net/ipv4/tcp_cubic.c    |  6 +++---
 net/ipv4/tcp_htcp.c     | 10 +++++-----
 net/ipv4/tcp_illinois.c | 20 ++++++++++----------
 net/ipv4/tcp_input.c    |  7 +++++--
 net/ipv4/tcp_lp.c       |  6 +++---
 net/ipv4/tcp_vegas.c    |  6 +++---
 net/ipv4/tcp_vegas.h    |  2 +-
 net/ipv4/tcp_veno.c     |  6 +++---
 net/ipv4/tcp_westwood.c |  6 +++---
 net/ipv4/tcp_yeah.c     |  6 +++---
 13 files changed, 55 insertions(+), 47 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 364426a..26e7651 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -834,6 +834,11 @@ enum tcp_ca_ack_event_flags {
 
 union tcp_cc_info;
 
+struct ack_sample {
+	u32 pkts_acked;
+	s32 rtt_us;
+};
+
 struct tcp_congestion_ops {
 	struct list_head	list;
 	u32 key;
@@ -857,7 +862,7 @@ struct tcp_congestion_ops {
 	/* new value of cwnd after loss (optional) */
 	u32  (*undo_cwnd)(struct sock *sk);
 	/* hook for packet ack accounting (optional) */
-	void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us);
+	void (*pkts_acked)(struct sock *sk, struct ack_sample);
 	/* get info for inet_diag (optional) */
 	size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
 			   union tcp_cc_info *info);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index fd1405d..6a873f7 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -197,15 +197,15 @@ static void bictcp_state(struct sock *sk, u8 new_state)
 /* Track delayed acknowledgment ratio using sliding window
  * ratio = (15*ratio + sample) / 16
  */
-static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
+static void bictcp_acked(struct sock *sk, struct ack_sample sample)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (icsk->icsk_ca_state == TCP_CA_Open) {
 		struct bictcp *ca = inet_csk_ca(sk);
 
-		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
-		ca->delayed_ack += cnt;
+		ca->delayed_ack += sample.pkts_acked - 
+			(ca->delayed_ack >> ACK_RATIO_SHIFT);
 	}
 }
 
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 167b6a3..ef64106 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -294,12 +294,12 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 	ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
 }
 
-static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+static void tcp_cdg_acked(struct sock *sk, struct ack_sample sample)
 {
 	struct cdg *ca = inet_csk_ca(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (rtt_us <= 0)
+	if (sample.rtt_us <= 0)
 		return;
 
 	/* A heuristic for filtering delayed ACKs, adapted from:
@@ -307,20 +307,20 @@ static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
 	 * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
 	 */
 	if (tp->sacked_out == 0) {
-		if (num_acked == 1 && ca->delack) {
+		if (sample.pkts_acked == 1 && ca->delack) {
 			/* A delayed ACK is only used for the minimum if it is
 			 * provenly lower than an existing non-zero minimum.
 			 */
-			ca->rtt.min = min(ca->rtt.min, rtt_us);
+			ca->rtt.min = min(ca->rtt.min, sample.rtt_us);
 			ca->delack--;
 			return;
-		} else if (num_acked > 1 && ca->delack < 5) {
+		} else if (sample.pkts_acked > 1 && ca->delack < 5) {
 			ca->delack++;
 		}
 	}
 
-	ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us);
-	ca->rtt.max = max(ca->rtt.max, rtt_us);
+	ca->rtt.min = min_not_zero(ca->rtt.min, sample.rtt_us);
+	ca->rtt.max = max(ca->rtt.max, sample.rtt_us);
 }
 
 static u32 tcp_cdg_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 28011fb..070d629 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -416,21 +416,21 @@ static void hystart_update(struct sock *sk, u32 delay)
 /* Track delayed acknowledgment ratio using sliding window
  * ratio = (15*ratio + sample) / 16
  */
-static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+static void bictcp_acked(struct sock *sk, struct ack_sample sample)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
 	u32 delay;
 
 	/* Some calls are for duplicates without timetamps */
-	if (rtt_us < 0)
+	if (sample.rtt_us < 0)
 		return;
 
 	/* Discard delay samples right after fast recovery */
 	if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
 		return;
 
-	delay = (rtt_us << 3) / USEC_PER_MSEC;
+	delay = (sample.rtt_us << 3) / USEC_PER_MSEC;
 	if (delay == 0)
 		delay = 1;
 
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 82f0d9e..4f0fb11 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -99,7 +99,7 @@ static inline void measure_rtt(struct sock *sk, u32 srtt)
 }
 
 static void measure_achieved_throughput(struct sock *sk,
-					u32 pkts_acked, s32 rtt)
+					struct ack_sample sample)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -107,10 +107,10 @@ static void measure_achieved_throughput(struct sock *sk,
 	u32 now = tcp_time_stamp;
 
 	if (icsk->icsk_ca_state == TCP_CA_Open)
-		ca->pkts_acked = pkts_acked;
+		ca->pkts_acked = sample.pkts_acked;
 
-	if (rtt > 0)
-		measure_rtt(sk, usecs_to_jiffies(rtt));
+	if (sample.rtt_us > 0)
+		measure_rtt(sk, usecs_to_jiffies(sample.rtt_us));
 
 	if (!use_bandwidth_switch)
 		return;
@@ -122,7 +122,7 @@ static void measure_achieved_throughput(struct sock *sk,
 		return;
 	}
 
-	ca->packetcount += pkts_acked;
+	ca->packetcount += sample.pkts_acked;
 
 	if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
 	    now - ca->lasttime >= ca->minRTT &&
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 2ab9bbb..7d18b71 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -82,30 +82,30 @@ static void tcp_illinois_init(struct sock *sk)
 }
 
 /* Measure RTT for each ack. */
-static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt)
+static void tcp_illinois_acked(struct sock *sk, struct ack_sample sample)
 {
 	struct illinois *ca = inet_csk_ca(sk);
 
-	ca->acked = pkts_acked;
+	ca->acked = sample.pkts_acked;
 
 	/* dup ack, no rtt sample */
-	if (rtt < 0)
+	if (sample.rtt_us < 0)
 		return;
 
 	/* ignore bogus values, this prevents wraparound in alpha math */
-	if (rtt > RTT_MAX)
-		rtt = RTT_MAX;
+	if (sample.rtt_us > RTT_MAX)
+		sample.rtt_us = RTT_MAX;
 
 	/* keep track of minimum RTT seen so far */
-	if (ca->base_rtt > rtt)
-		ca->base_rtt = rtt;
+	if (ca->base_rtt > sample.rtt_us)
+		ca->base_rtt = sample.rtt_us;
 
 	/* and max */
-	if (ca->max_rtt < rtt)
-		ca->max_rtt = rtt;
+	if (ca->max_rtt < sample.rtt_us)
+		ca->max_rtt = sample.rtt_us;
 
 	++ca->cnt_rtt;
-	ca->sum_rtt += rtt;
+	ca->sum_rtt += sample.rtt_us;
 }
 
 /* Maximum queuing delay */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1578fc2..4f641f6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3195,8 +3195,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		tcp_rearm_rto(sk);
 	}
 
-	if (icsk->icsk_ca_ops->pkts_acked)
-		icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
+	if (icsk->icsk_ca_ops->pkts_acked) {
+		struct ack_sample sample = {pkts_acked, ca_rtt_us};
+
+		icsk->icsk_ca_ops->pkts_acked(sk, sample);
+	}
 
 #if FASTRETRANS_DEBUG > 0
 	WARN_ON((int)tp->sacked_out < 0);
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 1e70fa8..341c768 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -260,13 +260,13 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
  * newReno in increase case.
  * We work it out by following the idea from TCP-LP's paper directly
  */
-static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+static void tcp_lp_pkts_acked(struct sock *sk, struct ack_sample sample)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct lp *lp = inet_csk_ca(sk);
 
-	if (rtt_us > 0)
-		tcp_lp_rtt_sample(sk, rtt_us);
+	if (sample.rtt_us > 0)
+		tcp_lp_rtt_sample(sk, sample.rtt_us);
 
 	/* calc inference */
 	if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 13951c4..17d0a84 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -107,16 +107,16 @@ EXPORT_SYMBOL_GPL(tcp_vegas_init);
  *   o min-filter RTT samples from a much longer window (forever for now)
  *     to find the propagation delay (baseRTT)
  */
-void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+void tcp_vegas_pkts_acked(struct sock *sk, struct ack_sample sample)
 {
 	struct vegas *vegas = inet_csk_ca(sk);
 	u32 vrtt;
 
-	if (rtt_us < 0)
+	if (sample.rtt_us < 0)
 		return;
 
 	/* Never allow zero rtt or baseRTT */
-	vrtt = rtt_us + 1;
+	vrtt = sample.rtt_us + 1;
 
 	/* Filter to find propagation delay: */
 	if (vrtt < vegas->baseRTT)
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index ef9da53..469f9ad 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -17,7 +17,7 @@ struct vegas {
 
 void tcp_vegas_init(struct sock *sk);
 void tcp_vegas_state(struct sock *sk, u8 ca_state);
-void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
+void tcp_vegas_pkts_acked(struct sock *sk, struct ack_sample sample);
 void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
 size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
 			  union tcp_cc_info *info);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 0d094b9..563d8fd 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -69,16 +69,16 @@ static void tcp_veno_init(struct sock *sk)
 }
 
 /* Do rtt sampling needed for Veno. */
-static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+static void tcp_veno_pkts_acked(struct sock *sk, struct ack_sample sample)
 {
 	struct veno *veno = inet_csk_ca(sk);
 	u32 vrtt;
 
-	if (rtt_us < 0)
+	if (sample.rtt_us < 0)
 		return;
 
 	/* Never allow zero rtt or baseRTT */
-	vrtt = rtt_us + 1;
+	vrtt = sample.rtt_us + 1;
 
 	/* Filter to find propagation delay: */
 	if (vrtt < veno->basertt)
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index c10732e..d330c3e9 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -99,12 +99,12 @@ static void westwood_filter(struct westwood *w, u32 delta)
  * Called after processing group of packets.
  * but all westwood needs is the last sample of srtt.
  */
-static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt)
+static void tcp_westwood_pkts_acked(struct sock *sk, struct ack_sample sample)
 {
 	struct westwood *w = inet_csk_ca(sk);
 
-	if (rtt > 0)
-		w->rtt = usecs_to_jiffies(rtt);
+	if (sample.rtt_us > 0)
+		w->rtt = usecs_to_jiffies(sample.rtt_us);
 }
 
 /*
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 17d3566..66b34c0 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -56,15 +56,15 @@ static void tcp_yeah_init(struct sock *sk)
 	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
 }
 
-static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
+static void tcp_yeah_pkts_acked(struct sock *sk, struct ack_sample sample)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct yeah *yeah = inet_csk_ca(sk);
 
 	if (icsk->icsk_ca_state == TCP_CA_Open)
-		yeah->pkts_acked = pkts_acked;
+		yeah->pkts_acked = sample.pkts_acked;
 
-	tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
+	tcp_vegas_pkts_acked(sk, sample);
 }
 
 static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
-- 
1.8.1


* [RFC PATCH v2 net-next 2/3] tcp: add in_flight to tcp_skb_cb
  2015-07-22  4:21 [RFC PATCH v2 net-next 0/3] tcp: add NV congestion control Lawrence Brakmo
  2015-07-22  4:21 ` [RFC PATCH v2 net-next 1/3] tcp: replace cnt & rtt with struct in pkts_acked() Lawrence Brakmo
@ 2015-07-22  4:21 ` Lawrence Brakmo
  2015-07-22  4:21 ` [RFC PATCH v2 net-next 3/3] tcp: add NV congestion control Lawrence Brakmo
  2 siblings, 0 replies; 9+ messages in thread
From: Lawrence Brakmo @ 2015-07-22  4:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Neal Cardwell, Eric Dumazet

Based on comments by Neal Cardwell to tcp_nv patch:

  AFAICT this patch would not require an increase in the size of sk_buff
  cb[] if it were to take advantage of the fact that the tcp_skb_cb
  header.h4 and header.h6 fields are only used in the packet reception
  code path, and this in_flight field is only used on the transmit
  side. So the in_flight field could be placed in a struct that is
  itself placed in a union with the "header" union.

  That way the sender code can remember the in_flight value
  without requiring any extra space. And in the future other
  sender-side info could be stored in the "tx" struct, if needed.
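
As a self-contained sketch of the layout idea (the struct names and sizes
below are stand-ins, not the real inet_skb_parm/inet6_skb_parm; the actual
change is in the diff that follows):

	/* Illustrative only: tx-side state overlays rx-only state in one cb. */
	struct rx_parm { unsigned int opts[4]; };	/* stand-in for inet_skb_parm */

	struct cb_example {
		union {
			struct {
				unsigned int in_flight;	/* used only on the tx path */
			} tx;
			union {
				struct rx_parm h4;	/* used only on the rx path */
			} header;
		};
	};

	/* Overlaying tx.in_flight with the rx header state keeps the cb size unchanged. */
	_Static_assert(sizeof(struct cb_example) == sizeof(struct rx_parm),
		       "tx struct must not enlarge the control block");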

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/net/tcp.h     | 13 ++++++++++---
 net/ipv4/tcp_input.c  |  5 ++++-
 net/ipv4/tcp_output.c |  4 +++-
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 26e7651..2e62efe 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -755,11 +755,17 @@ struct tcp_skb_cb {
 	/* 1 byte hole */
 	__u32		ack_seq;	/* Sequence number ACK'd	*/
 	union {
-		struct inet_skb_parm	h4;
+		struct {
+			/* bytes in flight when this packet was sent */
+			__u32 in_flight;
+		} tx;   /* only used for outgoing skbs */
+		union {
+			struct inet_skb_parm	h4;
 #if IS_ENABLED(CONFIG_IPV6)
-		struct inet6_skb_parm	h6;
+			struct inet6_skb_parm	h6;
 #endif
-	} header;	/* For incoming frames		*/
+		} header;	/* For incoming skbs */
+	};
 };
 
 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
@@ -837,6 +843,7 @@ union tcp_cc_info;
 struct ack_sample {
 	u32 pkts_acked;
 	s32 rtt_us;
+	u32 in_flight;
 };
 
 struct tcp_congestion_ops {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4f641f6..aca4ae5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3068,6 +3068,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	long ca_rtt_us = -1L;
 	struct sk_buff *skb;
 	u32 pkts_acked = 0;
+	u32 last_in_flight = 0;
 	bool rtt_update;
 	int flag = 0;
 
@@ -3107,6 +3108,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			if (!first_ackt.v64)
 				first_ackt = last_ackt;
 
+			last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
 			reord = min(pkts_acked, reord);
 			if (!after(scb->end_seq, tp->high_seq))
 				flag |= FLAG_ORIG_SACK_ACKED;
@@ -3196,7 +3198,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	}
 
 	if (icsk->icsk_ca_ops->pkts_acked) {
-		struct ack_sample sample = {pkts_acked, ca_rtt_us};
+		struct ack_sample sample = {pkts_acked, ca_rtt_us,
+					    last_in_flight};
 
 		icsk->icsk_ca_ops->pkts_acked(sk, sample);
 	}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7105784..e9deab5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -920,9 +920,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	int err;
 
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
+	tp = tcp_sk(sk);
 
 	if (clone_it) {
 		skb_mstamp_get(&skb->skb_mstamp);
+		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
+			- tp->snd_una;
 
 		if (unlikely(skb_cloned(skb)))
 			skb = pskb_copy(skb, gfp_mask);
@@ -933,7 +936,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	}
 
 	inet = inet_sk(sk);
-	tp = tcp_sk(sk);
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
-- 
1.8.1


* [RFC PATCH v2 net-next 3/3] tcp: add NV congestion control
  2015-07-22  4:21 [RFC PATCH v2 net-next 0/3] tcp: add NV congestion control Lawrence Brakmo
  2015-07-22  4:21 ` [RFC PATCH v2 net-next 1/3] tcp: replace cnt & rtt with struct in pkts_acked() Lawrence Brakmo
  2015-07-22  4:21 ` [RFC PATCH v2 net-next 2/3] tcp: add in_flight to tcp_skb_cb Lawrence Brakmo
@ 2015-07-22  4:21 ` Lawrence Brakmo
  2015-07-22  6:50   ` Yuchung Cheng
  2 siblings, 1 reply; 9+ messages in thread
From: Lawrence Brakmo @ 2015-07-22  4:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Neal Cardwell, Eric Dumazet

This is a request for comments.

TCP-NV (New Vegas) is a major update to TCP-Vegas. An earlier version of
NV was presented at 2010's LPC (slides). It is a delay-based
congestion avoidance algorithm for the data center. This version has been tested
within a 10G rack where the HW RTTs are 20-50us.

A description of TCP-NV, including implementation and experimental
results, can be found at:
http://www.brakmo.org/networking/tcp-nv/TCPNV.html

The current version includes many module parameters to support
experimentation with the parameters.
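
To give a rough, self-contained feel for the core mechanism, the sketch
below mirrors the once-per-RTT check done in tcp_nv.c (the numbers and
names are made up for illustration; this is not the module code):

	/* Standalone sketch of NV's per-RTT congestion check. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long in_flight = 34 * 1460; /* bytes in flight when the acked pkt was sent */
		unsigned long long avg_rtt   = 40;        /* averaged RTT sample, usec */
		unsigned long long min_rtt   = 25;        /* recently observed minimum RTT, usec */
		unsigned long long mss       = 1460;
		unsigned long long nv_pad    = 10;        /* extra packets allowed above the estimate */
		unsigned long long snd_cwnd  = 40;

		/* rate in units of 100 bit/sec, as in tcp_nv.c */
		unsigned long long rate = in_flight * 8000000ULL / (avg_rtt * 100);

		/* cwnd that would carry this rate at the base (min) RTT */
		unsigned long long cwnd_by_slope = rate * min_rtt / (80000ULL * mss);
		unsigned long long max_win = cwnd_by_slope + nv_pad;

		if (snd_cwnd > max_win)
			printf("congestion: decrease cwnd toward %llu\n", max_win);
		else
			printf("no congestion: cwnd may grow (max_win=%llu)\n", max_win);
		return 0;
	}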

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/net/tcp.h          |   1 +
 net/ipv4/Kconfig           |  16 ++
 net/ipv4/Makefile          |   1 +
 net/ipv4/sysctl_net_ipv4.c |   9 +
 net/ipv4/tcp_input.c       |   2 +
 net/ipv4/tcp_nv.c          | 479 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 508 insertions(+)
 create mode 100644 net/ipv4/tcp_nv.c

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2e62efe..c0690ae 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -281,6 +281,7 @@ extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
 extern int sysctl_tcp_invalid_ratelimit;
+extern int sysctl_tcp_nv_enable;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6fb3c90..c37b374 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -539,6 +539,22 @@ config TCP_CONG_VEGAS
 	window. TCP Vegas should provide less packet loss, but it is
 	not as aggressive as TCP Reno.
 
+config TCP_CONG_NV
+       tristate "TCP NV"
+       default m
+       ---help---
+       TCP NV is a follow up to TCP Vegas. It has been modified to deal with
+       10G networks, measurement noise introduced by LRO, GRO and interrupt
+       coalescence. In addition, it will decrease its cwnd multiplicative
+       instead of linearly.
+
+       Note that in general congestion avoidance (cwnd decreased when # packets
+       queued grows) cannot coexist with congestion control (cwnd decreased only
+       when there is packet loss) due to fairness issues. One scenario when the
+       can coexist safely is when the CA flows have RTTs << CC flows RTTs.
+
+       For further details see http://www.brakmo.org/networking/tcp-nv/
+
 config TCP_CONG_SCALABLE
 	tristate "Scalable TCP"
 	default n
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index efc43f3..06f335f 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
 obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
 obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o
 obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
 obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 433231c..31846d5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -730,6 +730,15 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_ms_jiffies,
 	},
 	{
+		.procname	= "tcp_nv_enable",
+		.data		= &sysctl_tcp_nv_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},		
+	{
 		.procname	= "icmp_msgs_per_sec",
 		.data		= &sysctl_icmp_msgs_per_sec,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index aca4ae5..87560d9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -101,6 +101,8 @@ int sysctl_tcp_thin_dupack __read_mostly;
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
+int sysctl_tcp_nv_enable __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_nv_enable);
 
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
new file mode 100644
index 0000000..af451b6
--- /dev/null
+++ b/net/ipv4/tcp_nv.c
@@ -0,0 +1,479 @@
+/*
+ * TCP NV: TCP with Congestion Avoidance
+ *
+ * TCP-NV is a successor of TCP-Vegas that has been developed to
+ * deal with the issues that occur in modern networks. 
+ * Like TCP-Vegas, TCP-NV supports true congestion avoidance,
+ * the ability to detect congestion before packet losses occur.
+ * When congestion (queue buildup) starts to occur, TCP-NV
+ * predicts what the cwnd size should be for the current
+ * throughput and it reduces the cwnd proportionally to
+ * the difference between the current cwnd and the predicted cwnd.
+ * TCP-NV behaves like Reno when no congestion is detected, or when
+ * recovering from packet losses.
+ *
+ * TODO:
+ * 1) Modify the behavior so cwnd can grow faster under certain conditions
+ * 2) Add mechanism to deal with reverse congestion.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/math64.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+
+/* TCP NV parameters */
+static int nv_pad __read_mostly = 10;
+static int nv_pad_buffer __read_mostly = 2;
+static int nv_reset_period __read_mostly = 5;
+static int nv_min_cwnd = 10;
+static int nv_dec_eval_min_calls = 100;
+static int nv_ssthresh_eval_min_calls = 30;
+static int nv_rtt_min_cnt = 2;
+static int nv_cong_decrease_mult = 30*128/100;
+static int nv_ssthresh_factor = 8;
+static int nv_rtt_factor = 128;
+static int nv_rtt_cnt_dec_delta = 20; /* dec cwnd by this many RTTs */
+static int nv_dec_factor = 5;  /* actual value is factor/8 */
+static int nv_loss_dec_factor = 820; /* on loss reduce cwnd by 20% */
+static int nv_cwnd_growth_factor = 2; /* larger => cwnd grows slower */
+
+module_param(nv_pad, int, 0644);
+MODULE_PARM_DESC(nv_pad, "extra packets above congestion level");
+module_param(nv_pad_buffer, int, 0644);
+MODULE_PARM_DESC(nv_pad_buffer, "no growth buffer zone");
+module_param(nv_reset_period, int, 0644);
+MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)");
+module_param(nv_min_cwnd, int, 0644);
+MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value"
+		 " without losses");
+module_param(nv_dec_eval_min_calls, int, 0644);
+MODULE_PARM_DESC(nv_dec_eval_min_calls, "Wait for this many data points "
+		 "before declaring congestion (< 256)");
+module_param(nv_ssthresh_eval_min_calls, int, 0644);
+MODULE_PARM_DESC(nv_ssthresh_eval_min_calls, "Wait for this many data points "
+		 "before declaring congestion during initial slow-start");
+module_param(nv_rtt_min_cnt, int, 0644);
+MODULE_PARM_DESC(nv_rtt_min_cnt, "Wait for this many RTTs before declaring"
+		 " congestion (<64)");
+module_param(nv_cong_decrease_mult, int, 0644);
+MODULE_PARM_DESC(nv_cong_decrease_mult, "Congestion decrease factor");
+module_param(nv_ssthresh_factor, int, 0644);
+MODULE_PARM_DESC(nv_ssthresh_factor, "ssthresh factor");
+module_param(nv_rtt_factor, int, 0644);
+MODULE_PARM_DESC(nv_rtt_factor, "rtt averaging factor (0-256)");
+module_param(nv_rtt_cnt_dec_delta, int, 0644);
+MODULE_PARM_DESC(nv_rtt_cnt_dec_delta, "decrease cwnd for this many RTTs "
+		 "every 100 RTTs");
+module_param(nv_dec_factor, int, 0644);
+MODULE_PARM_DESC(nv_dec_factor, "decrease cwnd every ~192 RTTS by factor/8");
+module_param(nv_loss_dec_factor, int, 0644);
+MODULE_PARM_DESC(nv_loss_dec_factor, "on loss new cwnd = cwnd * this / 1024");
+module_param(nv_cwnd_growth_factor, int, 0644);
+MODULE_PARM_DESC(nv_cwnd_growth_factor, "larger => cwnd grows slower");
+
+/* TCP NV Parameters */
+struct tcpnv {
+	unsigned long nv_min_rtt_reset_jiffies;  /* when to switch to
+						  * nv_min_rtt_new */
+	u32 cnt;		/* increase cwnd by 1 after ACKs */
+	u32 loss_cwnd;	/* cwnd at last loss */
+	u8  nv_enable:1,
+		nv_allow_cwnd_growth:1, 	/* whether cwnd can grow */
+		nv_rtt_cnt:6;	/* RTTs without making ca decision */
+	u8  nv_eval_call_cnt;/* call count since last eval */
+	u8  nv_min_cwnd;	/* nv won't make a ca decision if cwnd is
+				 * smaller than this. It may grow to handle
+				 * TSO, LRO and interrupt coalescence because
+				 * with these a small cwnd cannot saturate
+				 * the link. Note that this is different from
+				 * sysctl_tcp_nv_min_cwnd */
+	u8  available;
+	u32 nv_last_rtt;	/* last rtt */
+	u32 nv_min_rtt;		/* active min rtt. Used to determine slope */
+	u32 nv_min_rtt_new;	/* min rtt for future use */
+	u32 nv_rtt_max_rate;  	/* max rate seen during current RTT */
+	u32 nv_rtt_start_seq;	/* current RTT ends when packet arrives
+				 * acking beyond nv_rtt_start_seq */
+	u32 nv_last_snd_una;	/* Previous value of tp->snd_una. It is
+				 * used to determine bytes acked since last
+				 * call to bictcp_acked */
+	u32 nv_no_cong_cnt;	/* Consecutive no congestion decisions */
+	u32 nv_rtt_cnt_dec;	/* RTTs since last temporary cwnd decrease */
+};
+
+#define NV_INIT_RTT	  0xffffffff
+#define NV_MIN_CWND	  4
+#define NV_MIN_CWND_GROW  2
+#define NV_TSO_CWND_BOUND 80
+
+static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	ca->loss_cwnd = 0;
+	ca->nv_no_cong_cnt = 0;
+	ca->cnt = 0;
+	ca->nv_rtt_cnt = 0;
+	ca->nv_rtt_cnt_dec = 0;
+	ca->nv_allow_cwnd_growth = 1;
+	ca->nv_last_rtt = 0;
+	ca->nv_rtt_max_rate = 0;
+	ca->nv_rtt_start_seq = tp->snd_una;
+	ca->nv_eval_call_cnt = 0;
+	ca->nv_last_snd_una = tp->snd_una;
+}
+
+static void tcpnv_init(struct sock *sk)
+{
+	struct tcpnv *ca = inet_csk_ca(sk);
+
+	tcpnv_reset(ca, sk);
+
+	ca->nv_min_rtt_reset_jiffies = jiffies + 2*HZ;
+	ca->nv_min_rtt = NV_INIT_RTT;
+	ca->nv_min_rtt_new = NV_INIT_RTT;
+	ca->nv_enable = sysctl_tcp_nv_enable;
+	ca->nv_min_cwnd = NV_MIN_CWND;
+	if (nv_dec_eval_min_calls > 255)
+		nv_dec_eval_min_calls = 255;
+	if (nv_rtt_min_cnt > 63)
+		nv_rtt_min_cnt = 63;
+}
+
+static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcpnv *ca = inet_csk_ca(sk);
+
+	if (!tcp_is_cwnd_limited(sk))
+		return;
+
+	/* Only grow cwnd if NV has not detected congestion */
+	if (sysctl_tcp_nv_enable && ca->nv_enable &&
+	    !ca->nv_allow_cwnd_growth)
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		acked = tcp_slow_start(tp, acked);
+		if (!acked)
+			return;
+	}
+	if (ca->cnt == 0 || !(sysctl_tcp_nv_enable || ca->nv_enable))
+		ca->cnt = tp->snd_cwnd;
+
+	tcp_cong_avoid_ai(tp, ca->cnt, acked);
+}
+
+static u32 tcpnv_recalc_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcpnv *ca = inet_csk_ca(sk);
+
+	ca->loss_cwnd = tp->snd_cwnd;
+	return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U); 
+}
+
+static u32 tcpnv_undo_cwnd(struct sock *sk)
+{
+	struct tcpnv *ca = inet_csk_ca(sk);
+
+	return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
+static void tcpnv_state(struct sock *sk, u8 new_state)
+{
+	struct tcpnv *ca = inet_csk_ca(sk);
+
+	if (new_state == TCP_CA_Open) {
+		ca->nv_enable = 1;
+		tcpnv_reset(ca, sk);
+	} else if (new_state == TCP_CA_Loss) {
+		ca->nv_enable = 0;
+	}
+}
+
+/* Do congestion avoidance calculations for TCP-NV
+ */
+static void tcpnv_acked(struct sock *sk, struct ack_sample sample)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcpnv *ca = inet_csk_ca(sk);
+	unsigned long now = jiffies;
+	s64 rate64 = 0;
+	u32 rate, max_win, cwnd_by_slope;
+	u32 avg_rtt;
+	u32 bytes_acked = 0;
+
+	/* Some calls are for duplicates without timestamps */
+	if (sample.rtt_us < 0)
+		return;
+
+	/* If not in TCP_CA_Open state, skip. */
+	if (icsk->icsk_ca_state != TCP_CA_Open)
+		return;
+
+	/* If NV mode is not enabled, behave like Reno */
+	if (!sysctl_tcp_nv_enable  ||  !ca->nv_enable) {
+		ca->nv_allow_cwnd_growth = 1;
+		return;
+	}
+
+	bytes_acked = tp->snd_una - ca->nv_last_snd_una;
+	ca->nv_last_snd_una = tp->snd_una;
+
+	if (sample.in_flight == 0)
+		return;
+
+	/* Calculate moving average of RTT */
+	if (nv_rtt_factor > 0) {
+		if (ca->nv_last_rtt > 0) {
+			avg_rtt = (((u64)sample.rtt_us) * nv_rtt_factor +
+				   ((u64)ca->nv_last_rtt)
+				   * (256 - nv_rtt_factor)) >> 8;
+		} else {
+			avg_rtt = sample.rtt_us;
+			ca->nv_min_rtt = avg_rtt << 1;
+		}
+		ca->nv_last_rtt = avg_rtt;
+	} else {
+		avg_rtt = sample.rtt_us;
+	}
+
+	/* rate in 100's bits per second */
+	rate64 = ((u64)sample.in_flight) * 8000000;
+	rate = (u32)div64_u64(rate64, (u64)(avg_rtt*100));
+
+	/* Remember the maximum rate seen during this RTT
+	 * Note: It may be more than one RTT. This function should be
+	 *       called at least nv_dec_eval_min_calls times.
+	 */
+	if (ca->nv_rtt_max_rate < rate)
+		ca->nv_rtt_max_rate = rate;
+
+	/* We have valid information, increment counter */
+	if (ca->nv_eval_call_cnt < 255)
+		ca->nv_eval_call_cnt++;
+
+	/* update min rtt if necessary */
+	if (avg_rtt < ca->nv_min_rtt)
+		ca->nv_min_rtt = avg_rtt;
+
+	/* update future min_rtt if necessary */
+	if (avg_rtt < ca->nv_min_rtt_new)
+		ca->nv_min_rtt_new = avg_rtt;
+
+	/* nv_min_rtt is updated with the minimum (possibly averaged) rtt
+	 * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a
+	 * warm reset). This new nv_min_rtt will continue to be updated
+	 * and be used for another sysctl_tcp_nv_reset_period seconds,
+	 * when it will be updated again.
+	 * In practice we introduce some randomness, so the actual period used
+	 * is chosen randomly from the range:
+	 *   [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4)
+	 */
+	if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) {
+		unsigned char rand;
+		ca->nv_min_rtt = ca->nv_min_rtt_new;
+		ca->nv_min_rtt_new = NV_INIT_RTT;
+		get_random_bytes(&rand, 1);
+		ca->nv_min_rtt_reset_jiffies =
+			now + ((nv_reset_period*(384 + rand)*HZ)>>9);
+		/* Every so often we decrease nv_min_cwnd in case previous
+		 *  value is no longer accurate.
+		 */
+		ca->nv_min_cwnd = max(ca->nv_min_cwnd/2, NV_MIN_CWND);
+	}
+
+	/* Once per RTT check if we need to do congestion avoidance */
+	if (before(ca->nv_rtt_start_seq, tp->snd_una)) {
+		ca->nv_rtt_start_seq = tp->snd_nxt;
+		if (ca->nv_rtt_cnt < 63)
+			/* Increase counter for RTTs without CA decision */
+			ca->nv_rtt_cnt++;
+		if (ca->nv_rtt_cnt_dec < 255)
+			/* Increase counter for temporary cwnd decrease */
+			ca->nv_rtt_cnt_dec++;
+
+		/* If this function is only called once within an RTT
+		 * the cwnd is probably too small (in some cases due to
+		 * tso, lro or interrupt coalescence), so we increase
+		 * nv_min_cwnd.
+		 */
+		if (ca->nv_eval_call_cnt == 1
+		    && bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache
+		    && ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)
+		    && ca->nv_rtt_cnt_dec < 192) {
+			ca->nv_min_cwnd = min(ca->nv_min_cwnd
+					      + NV_MIN_CWND_GROW,
+					      NV_TSO_CWND_BOUND + 1);
+			ca->nv_rtt_start_seq = tp->snd_nxt +
+				ca->nv_min_cwnd*tp->mss_cache;
+			ca->nv_eval_call_cnt = 0;
+			ca->nv_allow_cwnd_growth = 1;
+			return;
+		}
+
+		/* Every 192 to 320 RTTs decrease cwnd to get better min RTT
+		 * measurement. In practice we accomplish this by initializing
+		 * nv_rtt_cnt_dec randomly from the range [0, 128) and
+		 * stopping at 320.
+		 * We keep the value low for nv_rtt_cnt_dec_delta RTTs and then
+		 * we restore cwnd to its previous value (by setting
+		 * ssthresh to the previous value).
+		 */
+		if (ca->nv_rtt_cnt_dec == 320) {
+			/* decrease cwnd and ssthresh */
+			tp->snd_cwnd =
+				max((unsigned int)nv_min_cwnd,
+				    ((tp->snd_cwnd * nv_dec_factor) >> 3));
+			tp->snd_ssthresh =
+				max(tp->snd_cwnd,
+				    ((tp->snd_ssthresh * nv_dec_factor) >> 3));
+			ca->nv_allow_cwnd_growth = 0;
+			return;
+		} else if (ca->nv_rtt_cnt_dec > 320) {
+			if (ca->nv_rtt_cnt_dec - 320 >= nv_rtt_cnt_dec_delta) {
+				/* Restore ssthresh to restore cwnd */
+				unsigned char rand;
+				get_random_bytes(&rand, 1);
+				ca->nv_rtt_cnt_dec = rand >> 1;
+				tp->snd_ssthresh = (tp->snd_ssthresh << 3)
+					/ nv_dec_factor;
+				ca->nv_allow_cwnd_growth = 1;
+				ca->nv_no_cong_cnt = 0;
+			}
+			return;
+		}
+
+		/* Find the ideal cwnd for current rate from slope
+		 * slope = 80000.0 * mss / nv_min_rtt
+		 * cwnd_by_slope = nv_rtt_max_rate / slope
+		 */
+		cwnd_by_slope = (u32)
+			div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt,
+				  (u64)(80000 * tp->mss_cache));
+		max_win = cwnd_by_slope + nv_pad;
+
+		/* If cwnd > max_win, decrease cwnd
+		 * if cwnd < max_win, grow cwnd
+		 * else leave the same
+		 */
+		if (tp->snd_cwnd > max_win) {
+			/* there is congestion, check that it is ok
+			 * to make a CA decision
+			 * 1. We should have at least nv_dec_eval_min_calls
+			 *    data points before making a CA decision
+			 * 2. We only make a congestion decision after
+			 *    nv_rtt_min_cnt RTTs
+			 */
+			if (ca->nv_rtt_cnt < nv_rtt_min_cnt)
+				return;
+			else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
+				if (ca->nv_eval_call_cnt <
+				    nv_ssthresh_eval_min_calls)
+					return;
+			} else if (ca->nv_eval_call_cnt <
+				   nv_dec_eval_min_calls) {
+				return;
+			}
+
+			/* We have enough data to determine we are congested */
+			ca->nv_allow_cwnd_growth = 0;
+			tp->snd_ssthresh =
+				(nv_ssthresh_factor * max_win) >> 3;
+			if (tp->snd_cwnd - max_win > 2) {
+				/* gap > 2, we do exponential cwnd decrease */
+				int dec;
+				dec = max(2U, ((tp->snd_cwnd - max_win) *
+					       nv_cong_decrease_mult) >> 7);
+				tp->snd_cwnd -= dec;
+			} else if (nv_cong_decrease_mult > 0) {
+				tp->snd_cwnd = max_win;
+			}
+			ca->cnt = tp->snd_cwnd;
+			ca->nv_no_cong_cnt = 0;
+		} else if (tp->snd_cwnd <=  max_win - nv_pad_buffer) {
+			/* We allow growth of cwnd every RTT since we would
+			 * have grown even if we waited (just slower)
+			 */
+			ca->nv_allow_cwnd_growth = 1;
+			ca->nv_no_cong_cnt++;
+			if (nv_cwnd_growth_factor > 0 &&
+			    ca->nv_no_cong_cnt > nv_cwnd_growth_factor) {
+				ca->cnt = max(ca->cnt >> 1, (u32) 4);
+				ca->nv_no_cong_cnt = 0;
+			}
+		} else {
+			ca->nv_allow_cwnd_growth = 0;
+		}
+
+		/* update state */
+		ca->nv_eval_call_cnt = 0;
+		ca->nv_rtt_cnt = 0;
+		ca->nv_rtt_max_rate = 0;
+
+		/* Don't want to make cwnd < nv_min_cwnd
+		 * (it wasn't before, if it is now is because nv
+		 *  decreased it).
+		 */
+		if (tp->snd_cwnd < nv_min_cwnd)
+			tp->snd_cwnd = nv_min_cwnd;
+
+	}
+}
+
+/* Extract info for Tcp socket info provided via netlink */
+size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr,
+		       union tcp_cc_info *info)
+{
+	const struct tcpnv *ca = inet_csk_ca(sk);
+
+	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+		info->vegas.tcpv_enabled = ca->nv_enable
+			&& sysctl_tcp_nv_enable;
+		info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt;
+		info->vegas.tcpv_rtt = ca->nv_last_rtt;
+		info->vegas.tcpv_minrtt = ca->nv_min_rtt;
+
+		*attr = INET_DIAG_VEGASINFO;
+		return sizeof(struct tcpvegas_info);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tcpnv_get_info);
+
+static struct tcp_congestion_ops tcpnv __read_mostly = {
+	.init		= tcpnv_init,
+	.ssthresh	= tcpnv_recalc_ssthresh,
+	.cong_avoid	= tcpnv_cong_avoid,
+	.set_state	= tcpnv_state,
+	.undo_cwnd	= tcpnv_undo_cwnd,
+	.pkts_acked     = tcpnv_acked,
+	.get_info	= tcpnv_get_info,
+
+	.owner		= THIS_MODULE,
+	.name		= "nv",
+};
+
+static int __init tcpnv_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE);
+
+	return tcp_register_congestion_control(&tcpnv);
+}
+
+static void __exit tcpnv_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcpnv);
+}
+
+module_init(tcpnv_register);
+module_exit(tcpnv_unregister);
+
+MODULE_AUTHOR("Lawrence Brakmo");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP NV");
+MODULE_VERSION("1.0");
-- 
1.8.1


* Re: [RFC PATCH v2 net-next 1/3] tcp: replace cnt & rtt with struct in pkts_acked()
  2015-07-22  4:21 ` [RFC PATCH v2 net-next 1/3] tcp: replace cnt & rtt with struct in pkts_acked() Lawrence Brakmo
@ 2015-07-22  5:46   ` Eric Dumazet
  2015-07-22 11:04     ` Neal Cardwell
  0 siblings, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2015-07-22  5:46 UTC (permalink / raw)
  To: Lawrence Brakmo; +Cc: netdev, Kernel Team, Neal Cardwell

On Tue, 2015-07-21 at 21:21 -0700, Lawrence Brakmo wrote:
> Replace 2 arguments (cnt and rtt) in the congestion control modules'
> pkts_acked() function with a struct. This will allow adding more
> information without having to modify existing congestion control
> modules (tcp_nv in particular needs bytes in flight when packet
> was sent).
> 
> This was proposed by Neal Cardwell in his comments to the tcp_nv patch.

Are you sure Neal suggested to pass a struct as argument ?

It was probably a struct pointer instead.
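
For illustration, the pointer-based variant would look roughly like this,
with the sample built once in tcp_clean_rtx_queue() and passed by reference:

	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);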


* Re: [RFC PATCH v2 net-next 3/3] tcp: add NV congestion control
  2015-07-22  4:21 ` [RFC PATCH v2 net-next 3/3] tcp: add NV congestion control Lawrence Brakmo
@ 2015-07-22  6:50   ` Yuchung Cheng
  2015-07-22 11:27     ` Neal Cardwell
  2015-07-24  0:47     ` Lawrence Brakmo
  0 siblings, 2 replies; 9+ messages in thread
From: Yuchung Cheng @ 2015-07-22  6:50 UTC (permalink / raw)
  To: Lawrence Brakmo; +Cc: netdev, Kernel Team, Neal Cardwell, Eric Dumazet

On Tue, Jul 21, 2015 at 9:21 PM, Lawrence Brakmo <brakmo@fb.com> wrote:
> This is a request for comments.
>
> TCP-NV (New Vegas) is a major update to TCP-Vegas. An earlier version of
> NV was presented at 2010's LPC (slides). It is a delay-based
> congestion avoidance algorithm for the data center. This version has been tested
> within a 10G rack where the HW RTTs are 20-50us.
>
> A description of TCP-NV, including implementation and experimental
> results, can be found at:
> http://www.brakmo.org/networking/tcp-nv/TCPNV.html
>
> The current version includes many module parameters to support
> experimentation with the parameters.
>
> Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
> ---
>  include/net/tcp.h          |   1 +
>  net/ipv4/Kconfig           |  16 ++
>  net/ipv4/Makefile          |   1 +
>  net/ipv4/sysctl_net_ipv4.c |   9 +
>  net/ipv4/tcp_input.c       |   2 +
>  net/ipv4/tcp_nv.c          | 479 +++++++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 508 insertions(+)
>  create mode 100644 net/ipv4/tcp_nv.c
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 2e62efe..c0690ae 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -281,6 +281,7 @@ extern unsigned int sysctl_tcp_notsent_lowat;
>  extern int sysctl_tcp_min_tso_segs;
>  extern int sysctl_tcp_autocorking;
>  extern int sysctl_tcp_invalid_ratelimit;
> +extern int sysctl_tcp_nv_enable;
>
>  extern atomic_long_t tcp_memory_allocated;
>  extern struct percpu_counter tcp_sockets_allocated;
> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
> index 6fb3c90..c37b374 100644
> --- a/net/ipv4/Kconfig
> +++ b/net/ipv4/Kconfig
> @@ -539,6 +539,22 @@ config TCP_CONG_VEGAS
>         window. TCP Vegas should provide less packet loss, but it is
>         not as aggressive as TCP Reno.
>
> +config TCP_CONG_NV
> +       tristate "TCP NV"
> +       default m
> +       ---help---
> +       TCP NV is a follow up to TCP Vegas. It has been modified to deal with
> +       10G networks, measurement noise introduced by LRO, GRO and interrupt
> +       coalescence. In addition, it will decrease its cwnd multiplicative
multiplicatively

> +       instead of linearly.
> +
> +       Note that in general congestion avoidance (cwnd decreased when # packets
> +       queued grows) cannot coexist with congestion control (cwnd decreased only
> +       when there is packet loss) due to fairness issues. One scenario when the
s/the/they
> +       can coexist safely is when the CA flows have RTTs << CC flows RTTs.
> +
> +       For further details see http://www.brakmo.org/networking/tcp-nv/
> +
>  config TCP_CONG_SCALABLE
>         tristate "Scalable TCP"
>         default n
> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
> index efc43f3..06f335f 100644
> --- a/net/ipv4/Makefile
> +++ b/net/ipv4/Makefile
> @@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
>  obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
>  obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
>  obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
> +obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o
>  obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
>  obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
>  obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 433231c..31846d5 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -730,6 +730,15 @@ static struct ctl_table ipv4_table[] = {
>                 .proc_handler   = proc_dointvec_ms_jiffies,
>         },
>         {
> +               .procname       = "tcp_nv_enable",
> +               .data           = &sysctl_tcp_nv_enable,
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
> +               .proc_handler   = proc_dointvec_minmax,
> +               .extra1         = &zero,
> +               .extra2         = &one,
> +       },
> +       {
>                 .procname       = "icmp_msgs_per_sec",
>                 .data           = &sysctl_icmp_msgs_per_sec,
>                 .maxlen         = sizeof(int),
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index aca4ae5..87560d9 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -101,6 +101,8 @@ int sysctl_tcp_thin_dupack __read_mostly;
>  int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
>  int sysctl_tcp_early_retrans __read_mostly = 3;
>  int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
> +int sysctl_tcp_nv_enable __read_mostly = 1;
> +EXPORT_SYMBOL(sysctl_tcp_nv_enable);
>
>  #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
>  #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
> diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
> new file mode 100644
> index 0000000..af451b6
> --- /dev/null
> +++ b/net/ipv4/tcp_nv.c
> @@ -0,0 +1,479 @@
> +/*
> + * TCP NV: TCP with Congestion Avoidance
> + *
> + * TCP-NV is a successor of TCP-Vegas that has been developed to
> + * deal with the issues that occur in modern networks.
> + * Like TCP-Vegas, TCP-NV supports true congestion avoidance,
> + * the ability to detect congestion before packet losses occur.
> + * When congestion (queue buildup) starts to occur, TCP-NV
> + * predicts what the cwnd size should be for the current
> + * throughput and it reduces the cwnd proportionally to
> + * the difference between the current cwnd and the predicted cwnd.
> + * TCP-NV behaves like Reno when no congestion is detected, or when
> + * recovering from packet losses.
what happens when loss is not related to congestion?
> + *
> + * TODO:
> + * 1) Modify the behavior so cwnd can grow faster under certain conditions
> + * 2) Add mechanism to deal with reverse congestion.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/math64.h>
> +#include <net/tcp.h>
> +#include <linux/inet_diag.h>
> +
> +/* TCP NV parameters */
> +static int nv_pad __read_mostly = 10;
> +static int nv_pad_buffer __read_mostly = 2;
> +static int nv_reset_period __read_mostly = 5;
> +static int nv_min_cwnd = 10;
> +static int nv_dec_eval_min_calls = 100;
> +static int nv_ssthresh_eval_min_calls = 30;
> +static int nv_rtt_min_cnt = 2;
> +static int nv_cong_decrease_mult = 30*128/100;
> +static int nv_ssthresh_factor = 8;
> +static int nv_rtt_factor = 128;
> +static int nv_rtt_cnt_dec_delta = 20; /* dec cwnd by this many RTTs */
> +static int nv_dec_factor = 5;  /* actual value is factor/8 */
> +static int nv_loss_dec_factor = 820; /* on loss reduce cwnd by 20% */
> +static int nv_cwnd_growth_factor = 2; /* larger => cwnd grows slower */
> +
> +module_param(nv_pad, int, 0644);
> +MODULE_PARM_DESC(nv_pad, "extra packets above congestion level");
> +module_param(nv_pad_buffer, int, 0644);
> +MODULE_PARM_DESC(nv_pad_buffer, "no growth buffer zone");
> +module_param(nv_reset_period, int, 0644);
> +MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)");
> +module_param(nv_min_cwnd, int, 0644);
> +MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value"
> +                " without losses");
> +module_param(nv_dec_eval_min_calls, int, 0644);
> +MODULE_PARM_DESC(nv_dec_eval_min_calls, "Wait for this many data points "
> +                "before declaring congestion (< 256)");
> +module_param(nv_ssthresh_eval_min_calls, int, 0644);
> +MODULE_PARM_DESC(nv_ssthresh_eval_min_calls, "Wait for this many data points "
> +                "before declaring congestion during initial slow-start");
> +module_param(nv_rtt_min_cnt, int, 0644);
> +MODULE_PARM_DESC(nv_rtt_min_cnt, "Wait for this many RTTs before declaring"
> +                " congestion (<64)");
> +module_param(nv_cong_decrease_mult, int, 0644);
> +MODULE_PARM_DESC(nv_cong_decrease_mult, "Congestion decrease factor");
> +module_param(nv_ssthresh_factor, int, 0644);
> +MODULE_PARM_DESC(nv_ssthresh_factor, "ssthresh factor");
> +module_param(nv_rtt_factor, int, 0644);
> +MODULE_PARM_DESC(nv_rtt_factor, "rtt averaging factor (0-256)");
> +module_param(nv_rtt_cnt_dec_delta, int, 0644);
> +MODULE_PARM_DESC(nv_rtt_cnt_dec_delta, "decrease cwnd for this many RTTs "
> +                "every 100 RTTs");
> +module_param(nv_dec_factor, int, 0644);
> +MODULE_PARM_DESC(nv_dec_factor, "decrease cwnd every ~192 RTTS by factor/8");
> +module_param(nv_loss_dec_factor, int, 0644);
> +MODULE_PARM_DESC(nv_loss_dec_factor, "on loss new cwnd = cwnd * this / 1024");
> +module_param(nv_cwnd_growth_factor, int, 0644);
> +MODULE_PARM_DESC(nv_cwnd_growth_factor, "larger => cwnd grows slower");
> +
> +/* TCP NV Parameters */
> +struct tcpnv {
> +       unsigned long nv_min_rtt_reset_jiffies;  /* when to switch to
> +                                                 * nv_min_rtt_new */
> +       u32 cnt;                /* increase cwnd by 1 after ACKs */
> +       u32 loss_cwnd;  /* cwnd at last loss */
> +       u8  nv_enable:1,
> +               nv_allow_cwnd_growth:1,         /* whether cwnd can grow */
> +               nv_rtt_cnt:6;   /* RTTs without making ca decision */
> +       u8  nv_eval_call_cnt;/* call count since last eval */
> +       u8  nv_min_cwnd;        /* nv won't make a ca decision if cwnd is
> +                                * smaller than this. It may grow to handle
> +                                * TSO, LRO and interrupt coalescence because
> +                                * with these a small cwnd cannot saturate
> +                                * the link. Note that this is different from
> +                                * sysctl_tcp_nv_min_cwnd */
> +       u8  available;
> +       u32 nv_last_rtt;        /* last rtt */
> +       u32 nv_min_rtt;         /* active min rtt. Used to determine slope */
> +       u32 nv_min_rtt_new;     /* min rtt for future use */
> +       u32 nv_rtt_max_rate;    /* max rate seen during current RTT */
> +       u32 nv_rtt_start_seq;   /* current RTT ends when packet arrives
> +                                * acking beyond nv_rtt_start_seq */
> +       u32 nv_last_snd_una;    /* Previous value of tp->snd_una. It is
> +                                * used to determine bytes acked since last
> +                                * call to bictcp_acked */
> +       u32 nv_no_cong_cnt;     /* Consecutive no congestion decisions */
> +       u32 nv_rtt_cnt_dec;     /* RTTs since last temporary cwnd decrease */
> +};
> +
> +#define NV_INIT_RTT      0xffffffff
> +#define NV_MIN_CWND      4
> +#define NV_MIN_CWND_GROW  2
> +#define NV_TSO_CWND_BOUND 80
> +
> +static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
> +{
> +       struct tcp_sock *tp = tcp_sk(sk);
> +
> +       ca->loss_cwnd = 0;
> +       ca->nv_no_cong_cnt = 0;
> +       ca->cnt = 0;
> +       ca->nv_rtt_cnt = 0;
> +       ca->nv_rtt_cnt_dec = 0;
> +       ca->nv_allow_cwnd_growth = 1;
> +       ca->nv_last_rtt = 0;
> +       ca->nv_rtt_max_rate = 0;
> +       ca->nv_rtt_start_seq = tp->snd_una;
> +       ca->nv_eval_call_cnt = 0;
> +       ca->nv_last_snd_una = tp->snd_una;
> +}
> +
> +static void tcpnv_init(struct sock *sk)
> +{
> +       struct tcpnv *ca = inet_csk_ca(sk);
> +
> +       tcpnv_reset(ca, sk);
> +
> +       ca->nv_min_rtt_reset_jiffies = jiffies + 2*HZ;
> +       ca->nv_min_rtt = NV_INIT_RTT;
> +       ca->nv_min_rtt_new = NV_INIT_RTT;
> +       ca->nv_enable = sysctl_tcp_nv_enable;
> +       ca->nv_min_cwnd = NV_MIN_CWND;
> +       if (nv_dec_eval_min_calls > 255)
> +               nv_dec_eval_min_calls = 255;
> +       if (nv_rtt_min_cnt > 63)
> +               nv_rtt_min_cnt = 63;
> +}
> +
> +static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
> +{
> +       struct tcp_sock *tp = tcp_sk(sk);
> +       struct tcpnv *ca = inet_csk_ca(sk);
> +
> +       if (!tcp_is_cwnd_limited(sk))
> +               return;
> +
> +       /* Only grow cwnd if NV has not detected congestion */
> +       if (sysctl_tcp_nv_enable && ca->nv_enable &&
> +           !ca->nv_allow_cwnd_growth)
> +               return;
> +
> +       if (tp->snd_cwnd <= tp->snd_ssthresh) {
> +               acked = tcp_slow_start(tp, acked);
> +               if (!acked)
> +                       return;
> +       }
> +       if (ca->cnt == 0 || !(sysctl_tcp_nv_enable || ca->nv_enable))
> +               ca->cnt = tp->snd_cwnd;
> +
> +       tcp_cong_avoid_ai(tp, ca->cnt, acked);
> +}
> +
> +static u32 tcpnv_recalc_ssthresh(struct sock *sk)
> +{
> +       const struct tcp_sock *tp = tcp_sk(sk);
> +       struct tcpnv *ca = inet_csk_ca(sk);
> +
> +       ca->loss_cwnd = tp->snd_cwnd;
> +       return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U);
> +}
> +
> +static u32 tcpnv_undo_cwnd(struct sock *sk)
> +{
> +       struct tcpnv *ca = inet_csk_ca(sk);
> +
> +       return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
> +}
> +
> +static void tcpnv_state(struct sock *sk, u8 new_state)
> +{
> +       struct tcpnv *ca = inet_csk_ca(sk);
> +
> +       if (new_state == TCP_CA_Open) {
> +               ca->nv_enable = 1;
> +               tcpnv_reset(ca, sk);
> +       } else if (new_state == TCP_CA_Loss) {
> +               ca->nv_enable = 0;
> +       }
> +}
> +
> +/* Do congestion avoidance calculations for TCP-NV
> + */
> +static void tcpnv_acked(struct sock *sk, struct ack_sample sample)
> +{
> +       const struct inet_connection_sock *icsk = inet_csk(sk);
> +       struct tcp_sock *tp = tcp_sk(sk);
> +       struct tcpnv *ca = inet_csk_ca(sk);
> +       unsigned long now = jiffies;
> +       s64 rate64 = 0;
> +       u32 rate, max_win, cwnd_by_slope;
> +       u32 avg_rtt;
> +       u32 bytes_acked = 0;
> +
> +       /* Some calls are for duplicates without timestamps */
> +       if (sample.rtt_us < 0)
> +               return;
> +
> +       /* If not in TCP_CA_Open state, skip. */
> +       if (icsk->icsk_ca_state != TCP_CA_Open)
> +               return;
> +
> +       /* If NV mode is not enabled, behave like Reno */
> +       if (!sysctl_tcp_nv_enable  ||  !ca->nv_enable) {
> +               ca->nv_allow_cwnd_growth = 1;
what's the use case of using NV with sysctl_tcp_nv_enable=0?

> +               return;
> +       }
> +
> +       bytes_acked = tp->snd_una - ca->nv_last_snd_una;
> +       ca->nv_last_snd_una = tp->snd_una;
> +
> +       if (sample.in_flight == 0)
> +               return;
> +
> +       /* Calculate moving average of RTT */
> +       if (nv_rtt_factor > 0) {
> +               if (ca->nv_last_rtt > 0) {
> +                       avg_rtt = (((u64)sample.rtt_us) * nv_rtt_factor +
> +                                  ((u64)ca->nv_last_rtt)
> +                                  * (256 - nv_rtt_factor)) >> 8;
> +               } else {
> +                       avg_rtt = sample.rtt_us;
> +                       ca->nv_min_rtt = avg_rtt << 1;
> +               }
> +               ca->nv_last_rtt = avg_rtt;
> +       } else {
> +               avg_rtt = sample.rtt_us;
> +       }
> +
> +       /* rate in 100's bits per second */
> +       rate64 = ((u64)sample.in_flight) * 8000000;
> +       rate = (u32)div64_u64(rate64, (u64)(avg_rtt*100));
> +
> +       /* Remember the maximum rate seen during this RTT
> +        * Note: It may be more than one RTT. This function should be
> +        *       called at least nv_dec_eval_min_calls times.
> +        */
> +       if (ca->nv_rtt_max_rate < rate)
> +               ca->nv_rtt_max_rate = rate;
> +
> +       /* We have valid information, increment counter */
> +       if (ca->nv_eval_call_cnt < 255)
> +               ca->nv_eval_call_cnt++;
> +
> +       /* update min rtt if necessary */
> +       if (avg_rtt < ca->nv_min_rtt)
> +               ca->nv_min_rtt = avg_rtt;
> +
> +       /* update future min_rtt if necessary */
> +       if (avg_rtt < ca->nv_min_rtt_new)
> +               ca->nv_min_rtt_new = avg_rtt;
> +
> +       /* nv_min_rtt is updated with the minimum (possibly averaged) rtt
> +        * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a
> +        * warm reset). This new nv_min_rtt will continue to be updated
> +        * and be used for another sysctl_tcp_nv_reset_period seconds,
> +        * when it will be updated again.
> +        * In practice we introduce some randomness, so the actual period used
> +        * is chosen randomly from the range:
> +        *   [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4)
> +        */
> +       if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) {
> +               unsigned char rand;
> +               ca->nv_min_rtt = ca->nv_min_rtt_new;
> +               ca->nv_min_rtt_new = NV_INIT_RTT;
> +               get_random_bytes(&rand, 1);
> +               ca->nv_min_rtt_reset_jiffies =
> +                       now + ((nv_reset_period*(384 + rand)*HZ)>>9);
> +               /* Every so often we decrease nv_min_cwnd in case previous
> +                *  value is no longer accurate.
> +                */
> +               ca->nv_min_cwnd = max(ca->nv_min_cwnd/2, NV_MIN_CWND);
> +       }
> +
> +       /* Once per RTT check if we need to do congestion avoidance */
> +       if (before(ca->nv_rtt_start_seq, tp->snd_una)) {
> +               ca->nv_rtt_start_seq = tp->snd_nxt;
> +               if (ca->nv_rtt_cnt < 63)
> +                       /* Increase counter for RTTs without CA decision */
> +                       ca->nv_rtt_cnt++;
> +               if (ca->nv_rtt_cnt_dec < 255)
> +                       /* Increase counter for temporary cwnd decrease */
> +                       ca->nv_rtt_cnt_dec++;
> +
> +               /* If this function is only called once within an RTT
> +                * the cwnd is probably too small (in some cases due to
> +                * tso, lro or interrupt coalescence), so we increase
> +                * nv_min_cwnd.
> +                */
> +               if (ca->nv_eval_call_cnt == 1
> +                   && bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache
> +                   && ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)
> +                   && ca->nv_rtt_cnt_dec < 192) {
> +                       ca->nv_min_cwnd = min(ca->nv_min_cwnd
> +                                             + NV_MIN_CWND_GROW,
> +                                             NV_TSO_CWND_BOUND + 1);
> +                       ca->nv_rtt_start_seq = tp->snd_nxt +
> +                               ca->nv_min_cwnd*tp->mss_cache;
> +                       ca->nv_eval_call_cnt = 0;
> +                       ca->nv_allow_cwnd_growth = 1;
> +                       return;
> +               }
> +
> +               /* Every 192 to 320 RTTs decrease cwnd to get better min RTT
> +                * measurement. In practice we accomplish this by initializing
> +                * nv_rtt_cnt_dec randomly from the range [0, 128) and
> +                * stopping at 320.
> +                * We keep the value low for nv_rtt_cnt_dec_delta RTTs and then
> +                * we restore cwnd to its previous value (by setting
> +                * ssthresh to the previous value).
> +                */
> +               if (ca->nv_rtt_cnt_dec == 320) {
> +                       /* decrease cwnd and ssthresh */
> +                       tp->snd_cwnd =
> +                               max((unsigned int)nv_min_cwnd,
> +                                   ((tp->snd_cwnd * nv_dec_factor) >> 3));
> +                       tp->snd_ssthresh =
> +                               max(tp->snd_cwnd,
> +                                   ((tp->snd_ssthresh * nv_dec_factor) >> 3));
> +                       ca->nv_allow_cwnd_growth = 0;
> +                       return;
> +               } else if (ca->nv_rtt_cnt_dec > 320) {
> +                       if (ca->nv_rtt_cnt_dec - 320 >= nv_rtt_cnt_dec_delta) {
> +                               /* Restore ssthresh to restore cwnd */
> +                               unsigned char rand;
> +                               get_random_bytes(&rand, 1);
> +                               ca->nv_rtt_cnt_dec = rand >> 1;
> +                               tp->snd_ssthresh = (tp->snd_ssthresh << 3)
> +                                       / nv_dec_factor;
> +                               ca->nv_allow_cwnd_growth = 1;
> +                               ca->nv_no_cong_cnt = 0;
> +                       }
> +                       return;
> +               }
> +
> +               /* Find the ideal cwnd for current rate from slope
> +                * slope = 80000.0 * mss / nv_min_rtt
> +                * cwnd_by_slope = nv_rtt_max_rate / slope
> +                */
> +               cwnd_by_slope = (u32)
> +                       div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt,
> +                                 (u64)(80000 * tp->mss_cache));
> +               max_win = cwnd_by_slope + nv_pad;
> +
> +               /* If cwnd > max_win, decrease cwnd
> +                * if cwnd < max_win, grow cwnd
> +                * else leave the same
> +                */
> +               if (tp->snd_cwnd > max_win) {
> +                       /* there is congestion, check that it is ok
> +                        * to make a CA decision
> +                        * 1. We should have at least nv_dec_eval_min_calls
> +                        *    data points before making a CA  decision
> +                        * 2. We only make a congestion decision after
> +                        *    nv_rtt_min_cnt RTTs
> +                        */
> +                       if (ca->nv_rtt_cnt < nv_rtt_min_cnt)
> +                               return;
> +                       else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
> +                               if (ca->nv_eval_call_cnt <
> +                                   nv_ssthresh_eval_min_calls)
> +                                       return;
> +                       } else if (ca->nv_eval_call_cnt <
> +                                  nv_dec_eval_min_calls) {
> +                               return;
> +                       }
> +
> +                       /* We have enough data to determine we are congested */
> +                       ca->nv_allow_cwnd_growth = 0;
> +                       tp->snd_ssthresh =
> +                               (nv_ssthresh_factor * max_win) >> 3;
> +                       if (tp->snd_cwnd - max_win > 2) {
> +                               /* gap > 2, we do exponential cwnd decrease */
> +                               int dec;
> +                               dec = max(2U, ((tp->snd_cwnd - max_win) *
> +                                              nv_cong_decrease_mult) >> 7);
> +                               tp->snd_cwnd -= dec;
> +                       } else if (nv_cong_decrease_mult > 0) {
> +                               tp->snd_cwnd = max_win;
> +                       }
> +                       ca->cnt = tp->snd_cwnd;
> +                       ca->nv_no_cong_cnt = 0;
> +               } else if (tp->snd_cwnd <=  max_win - nv_pad_buffer) {
> +                       /* We allow growth of cwnd every RTT since we would
> +                        * have grown even if we waited (just slower)
> +                        */
> +                       ca->nv_allow_cwnd_growth = 1;
> +                       ca->nv_no_cong_cnt++;
> +                       if (nv_cwnd_growth_factor > 0 &&
> +                           ca->nv_no_cong_cnt > nv_cwnd_growth_factor) {
> +                               ca->cnt = max(ca->cnt >> 1, (u32) 4);
> +                               ca->nv_no_cong_cnt = 0;
> +                       }
> +               } else {
> +                       ca->nv_allow_cwnd_growth = 0;
> +               }
> +
> +               /* update state */
> +               ca->nv_eval_call_cnt = 0;
> +               ca->nv_rtt_cnt = 0;
> +               ca->nv_rtt_max_rate = 0;
> +
> +               /* Don't want to make cwnd < nv_min_cwnd
> +                * (it wasn't before; if it is now, it is because NV
> +                *  decreased it).
> +                */
> +               if (tp->snd_cwnd < nv_min_cwnd)
> +                       tp->snd_cwnd = nv_min_cwnd;
> +
> +  }
> +}
> +
> +/* Extract TCP socket info provided via netlink */
> +size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr,
> +                      union tcp_cc_info *info)
> +{
> +       const struct tcpnv *ca = inet_csk_ca(sk);
> +
> +       if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
> +               info->vegas.tcpv_enabled = ca->nv_enable
> +                       && sysctl_tcp_nv_enable;
> +               info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt;
> +               info->vegas.tcpv_rtt = ca->nv_last_rtt;
> +               info->vegas.tcpv_minrtt = ca->nv_min_rtt;
> +
> +               *attr = INET_DIAG_VEGASINFO;
> +               return sizeof(struct tcpvegas_info);
> +       }
> +       return 0;
> +}
> +EXPORT_SYMBOL_GPL(tcpnv_get_info);
> +
> +static struct tcp_congestion_ops tcpnv __read_mostly = {
> +       .init           = tcpnv_init,
> +       .ssthresh       = tcpnv_recalc_ssthresh,
> +       .cong_avoid     = tcpnv_cong_avoid,
> +       .set_state      = tcpnv_state,
> +       .undo_cwnd      = tcpnv_undo_cwnd,
> +       .pkts_acked     = tcpnv_acked,
> +       .get_info       = tcpnv_get_info,
> +
> +       .owner          = THIS_MODULE,
> +       .name           = "nv",
> +};
> +
> +static int __init tcpnv_register(void)
> +{
> +       BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE);
> +
> +       return tcp_register_congestion_control(&tcpnv);
> +}
> +
> +static void __exit tcpnv_unregister(void)
> +{
> +       tcp_unregister_congestion_control(&tcpnv);
> +}
> +
> +module_init(tcpnv_register);
> +module_exit(tcpnv_unregister);
> +
> +MODULE_AUTHOR("Lawrence Brakmo");
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("TCP NV");
> +MODULE_VERSION("1.0");
> --
> 1.8.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH v2 net-next 1/3] tcp: replace cnt & rtt with struct in pkts_acked()
  2015-07-22  5:46   ` Eric Dumazet
@ 2015-07-22 11:04     ` Neal Cardwell
  0 siblings, 0 replies; 9+ messages in thread
From: Neal Cardwell @ 2015-07-22 11:04 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Lawrence Brakmo, netdev, Kernel Team

On Wed, Jul 22, 2015 at 1:46 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2015-07-21 at 21:21 -0700, Lawrence Brakmo wrote:
>> Replace 2 arguments (cnt and rtt) in the congestion control modules'
>> pkts_acked() function with a struct. This will allow adding more
>> information without having to modify existing congestion control
>> modules (tcp_nv in particular needs bytes in flight when packet
>> was sent).
>>
>> This was proposed by Neal Cardwell in his comments to the tcp_nv patch.
>
> Are you sure Neal suggested to pass a struct as argument ?
>
> It was probably a struct pointer instead.

Yes, I may not have specified, but I definitely agree we should pass
the struct by pointer, as the intent is that the struct may grow over
time, and we don't want to pay more overhead to copy the struct for
the call.
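
To make the shape concrete, here is a minimal sketch of the pointer-based
interface (untested; the fields are just what this thread has discussed so
far, not a final layout):

struct ack_sample {
	u32 pkts_acked;	/* packets newly acked by this ACK */
	s32 rtt_us;	/* RTT sample in usec, or negative if unavailable */
};

/* in struct tcp_congestion_ops */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);

Modules then read sample->rtt_us etc. through the const pointer, and new
fields (such as the in_flight value NV needs) can be appended later without
touching every congestion control module or copying a growing struct on
each call.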

neal

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH v2 net-next 3/3] tcp: add NV congestion control
  2015-07-22  6:50   ` Yuchung Cheng
@ 2015-07-22 11:27     ` Neal Cardwell
  2015-07-24  0:47     ` Lawrence Brakmo
  1 sibling, 0 replies; 9+ messages in thread
From: Neal Cardwell @ 2015-07-22 11:27 UTC (permalink / raw)
  To: Yuchung Cheng; +Cc: Lawrence Brakmo, netdev, Kernel Team, Eric Dumazet

On Wed, Jul 22, 2015 at 2:50 AM, Yuchung Cheng <ycheng@google.com> wrote:
> On Tue, Jul 21, 2015 at 9:21 PM, Lawrence Brakmo <brakmo@fb.com> wrote:
>> This is a request for comments.
>>
>> TCP-NV (New Vegas) is a major update to TCP-Vegas. An earlier version of
>> NV was presented at 2010's LPC (slides). It is a delay-based
>> congestion avoidance for the data center. This version has been tested
>> within a 10G rack where the HW RTTs are 20-50us.
>>
>> A description of TCP-NV, including implementation and experimental
>> results, can be found at:
>> http://www.brakmo.org/networking/tcp-nv/TCPNV.html
>>
>> The current version includes many module parameters to support
>> experimentation with the parameters.
...
>> +extern int sysctl_tcp_nv_enable;

The sysctl_tcp_nv_enable only seems to be used within the NV module.
Can it be a module parameter instead of sysctl?
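
For illustration, a minimal sketch of the module-parameter variant
(assuming the existing 0644 mode is kept; the name here is only
illustrative):

static int nv_enable __read_mostly = 1;
module_param(nv_enable, int, 0644);
MODULE_PARM_DESC(nv_enable, "enable NV congestion-avoidance behavior");

That keeps the knob local to tcp_nv.c, drops the tcp_input.c export and
the sysctl table entry, and it would still be togglable at run time via
/sys/module/tcp_nv/parameters/nv_enable.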

neal

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH v2 net-next 3/3] tcp: add NV congestion control
  2015-07-22  6:50   ` Yuchung Cheng
  2015-07-22 11:27     ` Neal Cardwell
@ 2015-07-24  0:47     ` Lawrence Brakmo
  1 sibling, 0 replies; 9+ messages in thread
From: Lawrence Brakmo @ 2015-07-24  0:47 UTC (permalink / raw)
  To: Yuchung Cheng; +Cc: netdev, Kernel Team, Neal Cardwell, Eric Dumazet

Thank you all for your comments; I'm currently testing the changes.
Other comments inline.

On 7/21/15, 11:50 PM, "Yuchung Cheng" <ycheng@google.com> wrote:

>On Tue, Jul 21, 2015 at 9:21 PM, Lawrence Brakmo <brakmo@fb.com> wrote:
>> This is a request for comments.
>>
>> TCP-NV (New Vegas) is a major update to TCP-Vegas. An earlier version of
>> NV was presented at 2010's LPC (slides). It is a delay-based
>> congestion avoidance for the data center. This version has been tested
>> within a 10G rack where the HW RTTs are 20-50us.
>>
>> A description of TCP-NV, including implementation and experimental
>> results, can be found at:
>> http://www.brakmo.org/networking/tcp-nv/TCPNV.html
>>
>> The current version includes many module parameters to support
>> experimentation with the parameters.
>>
>> Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
>> ---
>>  include/net/tcp.h          |   1 +
>>  net/ipv4/Kconfig           |  16 ++
>>  net/ipv4/Makefile          |   1 +
>>  net/ipv4/sysctl_net_ipv4.c |   9 +
>>  net/ipv4/tcp_input.c       |   2 +
>>  net/ipv4/tcp_nv.c          | 479
>>+++++++++++++++++++++++++++++++++++++++++++++
>>  6 files changed, 508 insertions(+)
>>  create mode 100644 net/ipv4/tcp_nv.c
>>
>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>> index 2e62efe..c0690ae 100644
>> --- a/include/net/tcp.h
>> +++ b/include/net/tcp.h
>> @@ -281,6 +281,7 @@ extern unsigned int sysctl_tcp_notsent_lowat;
>>  extern int sysctl_tcp_min_tso_segs;
>>  extern int sysctl_tcp_autocorking;
>>  extern int sysctl_tcp_invalid_ratelimit;
>> +extern int sysctl_tcp_nv_enable;
>>
>>  extern atomic_long_t tcp_memory_allocated;
>>  extern struct percpu_counter tcp_sockets_allocated;
>> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
>> index 6fb3c90..c37b374 100644
>> --- a/net/ipv4/Kconfig
>> +++ b/net/ipv4/Kconfig
>> @@ -539,6 +539,22 @@ config TCP_CONG_VEGAS
>>         window. TCP Vegas should provide less packet loss, but it is
>>         not as aggressive as TCP Reno.
>>
>> +config TCP_CONG_NV
>> +       tristate "TCP NV"
>> +       default m
>> +       ---help---
>> +       TCP NV is a follow up to TCP Vegas. It has been modified to
>>deal with
>> +       10G networks, measurement noise introduced by LRO, GRO and
>>interrupt
>> +       coalescence. In addition, it will decrease its cwnd
>>multiplicative
>multiplicatively
>
>> +       instead of linearly.
>> +
>> +       Note that in general congestion avoidance (cwnd decreased when
>># packets
>> +       queued grows) cannot coexist with congestion control (cwnd
>>decreased only
>> +       when there is packet loss) due to fairness issues. One scenario
>>when the
>s/the/they
>> +       can coexist safely is when the CA flows have RTTs << CC flows
>>RTTs.
>> +
>> +       For further details see http://www.brakmo.org/networking/tcp-nv/
>> +
>>  config TCP_CONG_SCALABLE
>>         tristate "Scalable TCP"
>>         default n
>> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
>> index efc43f3..06f335f 100644
>> --- a/net/ipv4/Makefile
>> +++ b/net/ipv4/Makefile
>> @@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
>>  obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
>>  obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
>>  obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
>> +obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o
>>  obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
>>  obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
>>  obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index 433231c..31846d5 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -730,6 +730,15 @@ static struct ctl_table ipv4_table[] = {
>>                 .proc_handler   = proc_dointvec_ms_jiffies,
>>         },
>>         {
>> +               .procname       = "tcp_nv_enable",
>> +               .data           = &sysctl_tcp_nv_enable,
>> +               .maxlen         = sizeof(int),
>> +               .mode           = 0644,
>> +               .proc_handler   = proc_dointvec_minmax,
>> +               .extra1         = &zero,
>> +               .extra2         = &one,
>> +       },
>> +       {
>>                 .procname       = "icmp_msgs_per_sec",
>>                 .data           = &sysctl_icmp_msgs_per_sec,
>>                 .maxlen         = sizeof(int),
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index aca4ae5..87560d9 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -101,6 +101,8 @@ int sysctl_tcp_thin_dupack __read_mostly;
>>  int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
>>  int sysctl_tcp_early_retrans __read_mostly = 3;
>>  int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
>> +int sysctl_tcp_nv_enable __read_mostly = 1;
>> +EXPORT_SYMBOL(sysctl_tcp_nv_enable);
>>
>>  #define FLAG_DATA              0x01 /* Incoming frame contained data.
>>        */
>>  #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a
>>window update.       */
>> diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
>> new file mode 100644
>> index 0000000..af451b6
>> --- /dev/null
>> +++ b/net/ipv4/tcp_nv.c
>> @@ -0,0 +1,479 @@
>> +/*
>> + * TCP NV: TCP with Congestion Avoidance
>> + *
>> + * TCP-NV is a successor of TCP-Vegas that has been developed to
>> + * deal with the issues that occur in modern networks.
>> + * Like TCP-Vegas, TCP-NV supports true congestion avoidance,
>> + * the ability to detect congestion before packet losses occur.
>> + * When congestion (queue buildup) starts to occur, TCP-NV
>> + * predicts what the cwnd size should be for the current
>> + * throughput and it reduces the cwnd proportionally to
>> + * the difference between the current cwnd and the predicted cwnd.
>> + * TCP-NV behaves like Reno when no congestion is detected, or when
>> + * recovering from packet losses.
>what happens when loss is not related to congestion?

This version makes no distinction. I will update a patch later (after I've
done enough testing) that adds the option of not decreasing cwnd in some
cases (such as no congestion detected) as long as they are reasonable.
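
One possible shape, just to make that option concrete (hypothetical and
untested; the knob name is made up and is not part of this patch):

static int nv_loss_dec_enable __read_mostly = 1;
module_param(nv_loss_dec_enable, int, 0644);
MODULE_PARM_DESC(nv_loss_dec_enable,
		 "if 0, don't reduce cwnd on loss while NV sees no congestion");

static u32 tcpnv_recalc_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcpnv *ca = inet_csk_ca(sk);

	ca->loss_cwnd = tp->snd_cwnd;

	/* Loss arrived while NV itself saw no queue buildup
	 * (cwnd growth still allowed): optionally skip the 20% cut.
	 */
	if (!nv_loss_dec_enable && ca->nv_allow_cwnd_growth)
		return tp->snd_cwnd;

	return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U);
}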

>> + *
>> + * TODO:
>> + * 1) Modify the behavior so cwnd can grow faster under certain
>>conditions
>> + * 2) Add mechanism to deal with reverse congestion.
>> + */
>> +
>> +#include <linux/mm.h>
>> +#include <linux/module.h>
>> +#include <linux/math64.h>
>> +#include <net/tcp.h>
>> +#include <linux/inet_diag.h>
>> +
>> +/* TCP NV parameters */
>> +static int nv_pad __read_mostly = 10;
>> +static int nv_pad_buffer __read_mostly = 2;
>> +static int nv_reset_period __read_mostly = 5;
>> +static int nv_min_cwnd = 10;
>> +static int nv_dec_eval_min_calls = 100;
>> +static int nv_ssthresh_eval_min_calls = 30;
>> +static int nv_rtt_min_cnt = 2;
>> +static int nv_cong_decrease_mult = 30*128/100;
>> +static int nv_ssthresh_factor = 8;
>> +static int nv_rtt_factor = 128;
>> +static int nv_rtt_cnt_dec_delta = 20; /* dec cwnd by this many RTTs */
>> +static int nv_dec_factor = 5;  /* actual value is factor/8 */
>> +static int nv_loss_dec_factor = 820; /* on loss reduce cwnd by 20% */
>> +static int nv_cwnd_growth_factor = 2; /* larger => cwnd grows slower */
>> +
>> +module_param(nv_pad, int, 0644);
>> +MODULE_PARM_DESC(nv_pad, "extra packets above congestion level");
>> +module_param(nv_pad_buffer, int, 0644);
>> +MODULE_PARM_DESC(nv_pad_buffer, "no growth buffer zone");
>> +module_param(nv_reset_period, int, 0644);
>> +MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)");
>> +module_param(nv_min_cwnd, int, 0644);
>> +MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this
>>value"
>> +                " without losses");
>> +module_param(nv_dec_eval_min_calls, int, 0644);
>> +MODULE_PARM_DESC(nv_dec_eval_min_calls, "Wait for this many data
>>points "
>> +                "before declaring congestion (< 256)");
>> +module_param(nv_ssthresh_eval_min_calls, int, 0644);
>> +MODULE_PARM_DESC(nv_ssthresh_eval_min_calls, "Wait for this many data
>>points "
>> +                "before declaring congestion during initial
>>slow-start");
>> +module_param(nv_rtt_min_cnt, int, 0644);
>> +MODULE_PARM_DESC(nv_rtt_min_cnt, "Wait for this many RTTs before
>>declaring"
>> +                " congestion (<64)");
>> +module_param(nv_cong_decrease_mult, int, 0644);
>> +MODULE_PARM_DESC(nv_cong_decrease_mult, "Congestion decrease factor");
>> +module_param(nv_ssthresh_factor, int, 0644);
>> +MODULE_PARM_DESC(nv_ssthresh_factor, "ssthresh factor");
>> +module_param(nv_rtt_factor, int, 0644);
>> +MODULE_PARM_DESC(nv_rtt_factor, "rtt averaging factor (0-256)");
>> +module_param(nv_rtt_cnt_dec_delta, int, 0644);
>> +MODULE_PARM_DESC(nv_rtt_cnt_dec_delta, "decrease cwnd for this many
>>RTTs "
>> +                "every 100 RTTs");
>> +module_param(nv_dec_factor, int, 0644);
>> +MODULE_PARM_DESC(nv_dec_factor, "decrease cwnd every ~192 RTTS by
>>factor/8");
>> +module_param(nv_loss_dec_factor, int, 0644);
>> +MODULE_PARM_DESC(nv_loss_dec_factor, "on loss new cwnd = cwnd * this /
>>1024");
>> +module_param(nv_cwnd_growth_factor, int, 0644);
>> +MODULE_PARM_DESC(nv_cwnd_growth_factor, "larger => cwnd grows slower");
>> +
>> +/* TCP NV Parameters */
>> +struct tcpnv {
>> +       unsigned long nv_min_rtt_reset_jiffies;  /* when to switch to
>> +                                                 * nv_min_rtt_new */
>> +       u32 cnt;                /* increase cwnd by 1 after ACKs */
>> +       u32 loss_cwnd;  /* cwnd at last loss */
>> +       u8  nv_enable:1,
>> +               nv_allow_cwnd_growth:1,         /* whether cwnd can
>>grow */
>> +               nv_rtt_cnt:6;   /* RTTs without making ca decision */
>> +       u8  nv_eval_call_cnt;/* call count since last eval */
>> +       u8  nv_min_cwnd;        /* nv won't make a ca decision if cwnd
>>is
>> +                                * smaller than this. It may grow to
>>handle
>> +                                * TSO, LRO and interrupt coalescence
>>because
>> +                                * with these a small cwnd cannot
>>saturate
>> +                                * the link. Note that this is
>>different from
>> +                                * sysctl_tcp_nv_min_cwnd */
>> +       u8  available;
>> +       u32 nv_last_rtt;        /* last rtt */
>> +       u32 nv_min_rtt;         /* active min rtt. Used to determine
>>slope */
>> +       u32 nv_min_rtt_new;     /* min rtt for future use */
>> +       u32 nv_rtt_max_rate;    /* max rate seen during current RTT */
>> +       u32 nv_rtt_start_seq;   /* current RTT ends when packet arrives
>> +                                * acking beyond nv_rtt_start_seq */
>> +       u32 nv_last_snd_una;    /* Previous value of tp->snd_una. It is
>> +                                * used to determine bytes acked since
>>last
>> +                                * call to bictcp_acked */
>> +       u32 nv_no_cong_cnt;     /* Consecutive no congestion decisions
>>*/
>> +       u32 nv_rtt_cnt_dec;     /* RTTs since last temporary cwnd
>>decrease */
>> +};
>> +
>> +#define NV_INIT_RTT      0xffffffff
>> +#define NV_MIN_CWND      4
>> +#define NV_MIN_CWND_GROW  2
>> +#define NV_TSO_CWND_BOUND 80
>> +
>> +static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
>> +{
>> +       struct tcp_sock *tp = tcp_sk(sk);
>> +
>> +       ca->loss_cwnd = 0;
>> +       ca->nv_no_cong_cnt = 0;
>> +       ca->cnt = 0;
>> +       ca->nv_rtt_cnt = 0;
>> +       ca->nv_rtt_cnt_dec = 0;
>> +       ca->nv_allow_cwnd_growth = 1;
>> +       ca->nv_last_rtt = 0;
>> +       ca->nv_rtt_max_rate = 0;
>> +       ca->nv_rtt_start_seq = tp->snd_una;
>> +       ca->nv_eval_call_cnt = 0;
>> +       ca->nv_last_snd_una = tp->snd_una;
>> +}
>> +
>> +static void tcpnv_init(struct sock *sk)
>> +{
>> +       struct tcpnv *ca = inet_csk_ca(sk);
>> +
>> +       tcpnv_reset(ca, sk);
>> +
>> +       ca->nv_min_rtt_reset_jiffies = jiffies + 2*HZ;
>> +       ca->nv_min_rtt = NV_INIT_RTT;
>> +       ca->nv_min_rtt_new = NV_INIT_RTT;
>> +       ca->nv_enable = sysctl_tcp_nv_enable;
>> +       ca->nv_min_cwnd = NV_MIN_CWND;
>> +       if (nv_dec_eval_min_calls > 255)
>> +               nv_dec_eval_min_calls = 255;
>> +       if (nv_rtt_min_cnt > 63)
>> +               nv_rtt_min_cnt = 63;
>> +}
>> +
>> +static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
>> +{
>> +       struct tcp_sock *tp = tcp_sk(sk);
>> +       struct tcpnv *ca = inet_csk_ca(sk);
>> +
>> +       if (!tcp_is_cwnd_limited(sk))
>> +               return;
>> +
>> +       /* Only grow cwnd if NV has not detected congestion */
>> +       if (sysctl_tcp_nv_enable && ca->nv_enable &&
>> +           !ca->nv_allow_cwnd_growth)
>> +               return;
>> +
>> +       if (tp->snd_cwnd <= tp->snd_ssthresh) {
>> +               acked = tcp_slow_start(tp, acked);
>> +               if (!acked)
>> +                       return;
>> +       }
>> +       if (ca->cnt == 0 || !(sysctl_tcp_nv_enable || ca->nv_enable))
>> +               ca->cnt = tp->snd_cwnd;
>> +
>> +       tcp_cong_avoid_ai(tp, ca->cnt, acked);
>> +}
>> +
>> +static u32 tcpnv_recalc_ssthresh(struct sock *sk)
>> +{
>> +       const struct tcp_sock *tp = tcp_sk(sk);
>> +       struct tcpnv *ca = inet_csk_ca(sk);
>> +
>> +       ca->loss_cwnd = tp->snd_cwnd;
>> +       return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U);
>> +}
>> +
>> +static u32 tcpnv_undo_cwnd(struct sock *sk)
>> +{
>> +       struct tcpnv *ca = inet_csk_ca(sk);
>> +
>> +       return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
>> +}
>> +
>> +static void tcpnv_state(struct sock *sk, u8 new_state)
>> +{
>> +       struct tcpnv *ca = inet_csk_ca(sk);
>> +
>> +       if (new_state == TCP_CA_Open) {
>> +               ca->nv_enable = 1;
>> +               tcpnv_reset(ca, sk);
>> +       } else if (new_state == TCP_CA_Loss) {
>> +               ca->nv_enable = 0;
>> +       }
>> +}
>> +
>> +/* Do congestion avoidance calculations for TCP-NV
>> + */
>> +static void tcpnv_acked(struct sock *sk, struct ack_sample sample)
>> +{
>> +       const struct inet_connection_sock *icsk = inet_csk(sk);
>> +       struct tcp_sock *tp = tcp_sk(sk);
>> +       struct tcpnv *ca = inet_csk_ca(sk);
>> +       unsigned long now = jiffies;
>> +       s64 rate64 = 0;
>> +       u32 rate, max_win, cwnd_by_slope;
>> +       u32 avg_rtt;
>> +       u32 bytes_acked = 0;
>> +
>> +       /* Some calls are for duplicates without timestamps */
>> +       if (sample.rtt_us < 0)
>> +               return;
>> +
>> +       /* If not in TCP_CA_Open state, skip. */
>> +       if (icsk->icsk_ca_state != TCP_CA_Open)
>> +               return;
>> +
>> +       /* If NV mode is not enabled, behave like Reno */
>> +       if (!sysctl_tcp_nv_enable  ||  !ca->nv_enable) {
>> +               ca->nv_allow_cwnd_growth = 1;
>what's the use case of using NV with sysctl_tcp_nv_enable=0?

I'm paranoid; this is to turn NV behavior off in case it starts acting
badly.
I've done some testing within a rack and have plans to do more extensive
testing.

As per Neal's comment, I've changed it to a module parameter, making NV
even less intrusive.
 
>
>> +               return;
>> +       }
>> +
>> +       bytes_acked = tp->snd_una - ca->nv_last_snd_una;
>> +       ca->nv_last_snd_una = tp->snd_una;
>> +
>> +       if (sample.in_flight == 0)
>> +               return;
>> +
>> +       /* Calculate moving average of RTT */
>> +       if (nv_rtt_factor > 0) {
>> +               if (ca->nv_last_rtt > 0) {
>> +                       avg_rtt = (((u64)sample.rtt_us) * nv_rtt_factor
>>+
>> +                                  ((u64)ca->nv_last_rtt)
>> +                                  * (256 - nv_rtt_factor)) >> 8;
>> +               } else {
>> +                       avg_rtt = sample.rtt_us;
>> +                       ca->nv_min_rtt = avg_rtt << 1;
>> +               }
>> +               ca->nv_last_rtt = avg_rtt;
>> +       } else {
>> +               avg_rtt = sample.rtt_us;
>> +       }
>> +
>> +       /* rate in 100's bits per second */
>> +       rate64 = ((u64)sample.in_flight) * 8000000;
>> +       rate = (u32)div64_u64(rate64, (u64)(avg_rtt*100));
>> +
>> +       /* Remember the maximum rate seen during this RTT
>> +        * Note: It may be more than one RTT. This function should be
>> +        *       called at least nv_dec_eval_min_calls times.
>> +        */
>> +       if (ca->nv_rtt_max_rate < rate)
>> +               ca->nv_rtt_max_rate = rate;
>> +
>> +       /* We have valid information, increment counter */
>> +       if (ca->nv_eval_call_cnt < 255)
>> +               ca->nv_eval_call_cnt++;
>> +
>> +       /* update min rtt if necessary */
>> +       if (avg_rtt < ca->nv_min_rtt)
>> +               ca->nv_min_rtt = avg_rtt;
>> +
>> +       /* update future min_rtt if necessary */
>> +       if (avg_rtt < ca->nv_min_rtt_new)
>> +               ca->nv_min_rtt_new = avg_rtt;
>> +
>> +       /* nv_min_rtt is updated with the minimum (possibly averaged)
>>rtt
>> +        * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a
>> +        * warm reset). This new nv_min_rtt will continue to be
>>updated
>> +        * and be used for another sysctl_tcp_nv_reset_period seconds,
>> +        * when it will be updated again.
>> +        * In practice we introduce some randomness, so the actual
>>period used
>> +        * is chosen randomly from the range:
>> +        *   [sysctl_tcp_nv_reset_period*3/4,
>>sysctl_tcp_nv_reset_period*5/4)
>> +        */
>> +       if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) {
>> +               unsigned char rand;
>> +               ca->nv_min_rtt = ca->nv_min_rtt_new;
>> +               ca->nv_min_rtt_new = NV_INIT_RTT;
>> +               get_random_bytes(&rand, 1);
>> +               ca->nv_min_rtt_reset_jiffies =
>> +                       now + ((nv_reset_period*(384 + rand)*HZ)>>9);
>> +               /* Every so often we decrease nv_min_cwnd in case
>>previous
>> +                *  value is no longer accurate.
>> +                */
>> +               ca->nv_min_cwnd = max(ca->nv_min_cwnd/2, NV_MIN_CWND);
>> +       }
>> +
>> +       /* Once per RTT check if we need to do congestion avoidance */
>> +       if (before(ca->nv_rtt_start_seq, tp->snd_una)) {
>> +               ca->nv_rtt_start_seq = tp->snd_nxt;
>> +               if (ca->nv_rtt_cnt < 63)
>> +                       /* Increase counter for RTTs without CA
>>decision */
>> +                       ca->nv_rtt_cnt++;
>> +               if (ca->nv_rtt_cnt_dec < 255)
>> +                       /* Increase counter for temporary cwnd decrease
>>*/
>> +                       ca->nv_rtt_cnt_dec++;
>> +
>> +               /* If this function is only called once within an RTT
>> +                * the cwnd is probably too small (in some cases due to
>> +                * tso, lro or interrupt coalescence), so we increase
>> +                * nv_min_cwnd.
>> +                */
>> +               if (ca->nv_eval_call_cnt == 1
>> +                   && bytes_acked >= (ca->nv_min_cwnd - 1) *
>>tp->mss_cache
>> +                   && ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)
>> +                   && ca->nv_rtt_cnt_dec < 192) {
>> +                       ca->nv_min_cwnd = min(ca->nv_min_cwnd
>> +                                             + NV_MIN_CWND_GROW,
>> +                                             NV_TSO_CWND_BOUND + 1);
>> +                       ca->nv_rtt_start_seq = tp->snd_nxt +
>> +                               ca->nv_min_cwnd*tp->mss_cache;
>> +                       ca->nv_eval_call_cnt = 0;
>> +                       ca->nv_allow_cwnd_growth = 1;
>> +                       return;
>> +               }
>> +
>> +               /* Every 192 to 320 RTTs decrease cwnd to get better
>>min RTT
>> +                * measurement. In practice we accomplish this by
>>initializing
>> +                * nv_rtt_cnt_dec randomly from the range [0, 128) and
>> +                * stopping at 320.
>> +                * We keep the value low for nv_rtt_cnt_dec_delta RTTs
>>and then
>> +                * we restore cwnd to its previous value (by setting
>> +                * ssthresh to the previous value).
>> +                */
>> +               if (ca->nv_rtt_cnt_dec == 320) {
>> +                       /* decrease cwnd and ssthresh */
>> +                       tp->snd_cwnd =
>> +                               max((unsigned int)nv_min_cwnd,
>> +                                   ((tp->snd_cwnd * nv_dec_factor) >>
>>3));
>> +                       tp->snd_ssthresh =
>> +                               max(tp->snd_cwnd,
>> +                                   ((tp->snd_ssthresh * nv_dec_factor)
>>>> 3));
>> +                       ca->nv_allow_cwnd_growth = 0;
>> +                       return;
>> +               } else if (ca->nv_rtt_cnt_dec > 320) {
>> +                       if (ca->nv_rtt_cnt_dec - 320 >=
>>nv_rtt_cnt_dec_delta) {
>> +                               /* Restore ssthresh to restore cwnd */
>> +                               unsigned char rand;
>> +                               get_random_bytes(&rand, 1);
>> +                               ca->nv_rtt_cnt_dec = rand >> 1;
>> +                               tp->snd_ssthresh = (tp->snd_ssthresh <<
>>3)
>> +                                       / nv_dec_factor;
>> +                               ca->nv_allow_cwnd_growth = 1;
>> +                               ca->nv_no_cong_cnt = 0;
>> +                       }
>> +                       return;
>> +               }
>> +
>> +               /* Find the ideal cwnd for current rate from slope
>> +                * slope = 80000.0 * mss / nv_min_rtt
>> +                * cwnd_by_slope = nv_rtt_max_rate / slope
>> +                */
>> +               cwnd_by_slope = (u32)
>> +                       div64_u64(((u64)ca->nv_rtt_max_rate) *
>>ca->nv_min_rtt,
>> +                                 (u64)(80000 * tp->mss_cache));
>> +               max_win = cwnd_by_slope + nv_pad;
>> +
>> +               /* If cwnd > max_win, decrease cwnd
>> +                * if cwnd < max_win, grow cwnd
>> +                * else leave the same
>> +                */
>> +               if (tp->snd_cwnd > max_win) {
>> +                       /* there is congestion, check that it is ok
>> +                        * to make a CA decision
>> +                        * 1. We should have at least
>>nv_dec_eval_min_calls
>> +                        *    data points before making a CA  decision
>> +                        * 2. We only make a congestion decision after
>> +                        *    nv_rtt_min_cnt RTTs
>> +                        */
>> +                       if (ca->nv_rtt_cnt < nv_rtt_min_cnt)
>> +                               return;
>> +                       else if (tp->snd_ssthresh ==
>>TCP_INFINITE_SSTHRESH) {
>> +                               if (ca->nv_eval_call_cnt <
>> +                                   nv_ssthresh_eval_min_calls)
>> +                                       return;
>> +                       } else if (ca->nv_eval_call_cnt <
>> +                                  nv_dec_eval_min_calls) {
>> +                               return;
>> +                       }
>> +
>> +                       /* We have enough data to determine we are
>>congested */
>> +                       ca->nv_allow_cwnd_growth = 0;
>> +                       tp->snd_ssthresh =
>> +                               (nv_ssthresh_factor * max_win) >> 3;
>> +                       if (tp->snd_cwnd - max_win > 2) {
>> +                               /* gap > 2, we do exponential cwnd
>>decrease */
>> +                               int dec;
>> +                               dec = max(2U, ((tp->snd_cwnd - max_win)
>>*
>> +                                              nv_cong_decrease_mult)
>>>> 7);
>> +                               tp->snd_cwnd -= dec;
>> +                       } else if (nv_cong_decrease_mult > 0) {
>> +                               tp->snd_cwnd = max_win;
>> +                       }
>> +                       ca->cnt = tp->snd_cwnd;
>> +                       ca->nv_no_cong_cnt = 0;
>> +               } else if (tp->snd_cwnd <=  max_win - nv_pad_buffer) {
>> +                       /* We allow growth of cwnd every RTT since we
>>would
>> +                        * have grown even if we waited (just slower)
>> +                        */
>> +                       ca->nv_allow_cwnd_growth = 1;
>> +                       ca->nv_no_cong_cnt++;
>> +                       if (nv_cwnd_growth_factor > 0 &&
>> +                           ca->nv_no_cong_cnt > nv_cwnd_growth_factor)
>>{
>> +                               ca->cnt = max(ca->cnt >> 1, (u32) 4);
>> +                               ca->nv_no_cong_cnt = 0;
>> +                       }
>> +               } else {
>> +                       ca->nv_allow_cwnd_growth = 0;
>> +               }
>> +
>> +               /* update state */
>> +               ca->nv_eval_call_cnt = 0;
>> +               ca->nv_rtt_cnt = 0;
>> +               ca->nv_rtt_max_rate = 0;
>> +
>> +               /* Don't want to make cwnd < nv_min_cwnd
>> +                * (it wasn't before; if it is now, it is because NV
>> +                *  decreased it).
>> +                */
>> +               if (tp->snd_cwnd < nv_min_cwnd)
>> +                       tp->snd_cwnd = nv_min_cwnd;
>> +
>> +  }
>> +}
>> +
>> +/* Extract info for Tcp socket info provided via netlink */
>> +size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr,
>> +                      union tcp_cc_info *info)
>> +{
>> +       const struct tcpnv *ca = inet_csk_ca(sk);
>> +
>> +       if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
>> +               info->vegas.tcpv_enabled = ca->nv_enable
>> +                       && sysctl_tcp_nv_enable;
>> +               info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt;
>> +               info->vegas.tcpv_rtt = ca->nv_last_rtt;
>> +               info->vegas.tcpv_minrtt = ca->nv_min_rtt;
>> +
>> +               *attr = INET_DIAG_VEGASINFO;
>> +               return sizeof(struct tcpvegas_info);
>> +       }
>> +       return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(tcpnv_get_info);
>> +
>> +static struct tcp_congestion_ops tcpnv __read_mostly = {
>> +       .init           = tcpnv_init,
>> +       .ssthresh       = tcpnv_recalc_ssthresh,
>> +       .cong_avoid     = tcpnv_cong_avoid,
>> +       .set_state      = tcpnv_state,
>> +       .undo_cwnd      = tcpnv_undo_cwnd,
>> +       .pkts_acked     = tcpnv_acked,
>> +       .get_info       = tcpnv_get_info,
>> +
>> +       .owner          = THIS_MODULE,
>> +       .name           = "nv",
>> +};
>> +
>> +static int __init tcpnv_register(void)
>> +{
>> +       BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE);
>> +
>> +       return tcp_register_congestion_control(&tcpnv);
>> +}
>> +
>> +static void __exit tcpnv_unregister(void)
>> +{
>> +       tcp_unregister_congestion_control(&tcpnv);
>> +}
>> +
>> +module_init(tcpnv_register);
>> +module_exit(tcpnv_unregister);
>> +
>> +MODULE_AUTHOR("Lawrence Brakmo");
>> +MODULE_LICENSE("GPL");
>> +MODULE_DESCRIPTION("TCP NV");
>> +MODULE_VERSION("1.0");
>> --
>> 1.8.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2015-07-24  0:47 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-07-22  4:21 [RFC PATCH v2 net-next 0/3] tcp: add NV congestion control Lawrence Brakmo
2015-07-22  4:21 ` [RFC PATCH v2 net-next 1/3] tcp: replace cnt & rtt with struct in pkts_acked() Lawrence Brakmo
2015-07-22  5:46   ` Eric Dumazet
2015-07-22 11:04     ` Neal Cardwell
2015-07-22  4:21 ` [RFC PATCH v2 net-next 2/3] tcp: add in_flight to tcp_skb_cb Lawrence Brakmo
2015-07-22  4:21 ` [RFC PATCH v2 net-next 3/3] tcp: add NV congestion control Lawrence Brakmo
2015-07-22  6:50   ` Yuchung Cheng
2015-07-22 11:27     ` Neal Cardwell
2015-07-24  0:47     ` Lawrence Brakmo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).