netdev.vger.kernel.org archive mirror
* [RFC PATCH v4 net-next 0/4] tcp: add NV congestion control
@ 2015-07-25  2:47 Lawrence Brakmo
  2015-07-25  2:47 ` [RFC PATCH v4 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked() Lawrence Brakmo
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Lawrence Brakmo @ 2015-07-25  2:47 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng

This patchset adds support for NV congestion control.

The first patch replaces two arguments with a struct in pkts_acked().
The second patch refactors struct tcp_skb_cb into separate areas for
incoming and outgoing skbs.
The third patch adds in_flight to tcp_skb_cb's tx section.
The fourth patch adds NV congestion control support.

[RFC PATCH v4 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked()
[RFC PATCH v4 net-next 2/4] tcp: refactor struct tcp_skb_cb
[RFC PATCH v4 net-next 3/4] tcp: add in_flight to tcp_skb_cb
[RFC PATCH v4 net-next 4/4] tcp: add NV congestion control

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>

include/net/tcp.h       |  20 ++-
net/ipv4/Kconfig        |  16 ++
net/ipv4/Makefile       |   1 +
net/ipv4/tcp_bic.c      |   6 +-
net/ipv4/tcp_cdg.c      |  14 +-
net/ipv4/tcp_cubic.c    |   6 +-
net/ipv4/tcp_htcp.c     |  10 +-
net/ipv4/tcp_illinois.c |  20 +--
net/ipv4/tcp_input.c    |  10 +-
net/ipv4/tcp_lp.c       |   6 +-
net/ipv4/tcp_nv.c       | 479 ++++++++++++++++++++++++++++++++++++++++++++++++++++
net/ipv4/tcp_output.c   |   4 +-
net/ipv4/tcp_vegas.c    |   6 +-
net/ipv4/tcp_vegas.h    |   2 +-
net/ipv4/tcp_veno.c     |   6 +-
net/ipv4/tcp_westwood.c |   6 +-
net/ipv4/tcp_yeah.c     |   6 +-
17 files changed, 567 insertions(+), 51 deletions(-)


* [RFC PATCH v4 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked()
  2015-07-25  2:47 [RFC PATCH v4 net-next 0/4] tcp: add NV congestion control Lawrence Brakmo
@ 2015-07-25  2:47 ` Lawrence Brakmo
  2015-07-25  6:22   ` Eric Dumazet
  2015-07-27 18:46   ` Stephen Hemminger
  2015-07-25  2:47 ` [RFC PATCH v4 net-next 2/4] tcp: refactor struct tcp_skb_cb Lawrence Brakmo
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 9+ messages in thread
From: Lawrence Brakmo @ 2015-07-25  2:47 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng

Replace 2 arguments (cnt and rtt) in the congestion control modules'
pkts_acked() function with a struct. This will allow adding more
information without having to modify existing congestion control
modules (tcp_nv in particular needs bytes in flight when packet
was sent).

As proposed by Neal Cardwell in his comments to the tcp_nv patch.
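
For illustration only (this demo module and its fields are hypothetical,
not part of the patch), a congestion control module's pkts_acked() hook
under the new struct-based signature would look roughly like:

/* Hypothetical sketch: ack accounting with the new struct ack_sample */
struct demo {
	u32 acked;	/* packets acked by the most recent ACK */
	u32 last_rtt;	/* most recent valid RTT sample, in usec */
};

static void demo_pkts_acked(struct sock *sk, struct ack_sample *sample)
{
	struct demo *ca = inet_csk_ca(sk);

	ca->acked = sample->pkts_acked;

	/* rtt_us < 0 means no valid RTT sample (e.g. a duplicate ACK) */
	if (sample->rtt_us < 0)
		return;

	ca->last_rtt = sample->rtt_us;
}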

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/net/tcp.h       |  7 ++++++-
 net/ipv4/tcp_bic.c      |  6 +++---
 net/ipv4/tcp_cdg.c      | 14 +++++++-------
 net/ipv4/tcp_cubic.c    |  6 +++---
 net/ipv4/tcp_htcp.c     | 10 +++++-----
 net/ipv4/tcp_illinois.c | 20 ++++++++++----------
 net/ipv4/tcp_input.c    |  7 +++++--
 net/ipv4/tcp_lp.c       |  6 +++---
 net/ipv4/tcp_vegas.c    |  6 +++---
 net/ipv4/tcp_vegas.h    |  2 +-
 net/ipv4/tcp_veno.c     |  6 +++---
 net/ipv4/tcp_westwood.c |  6 +++---
 net/ipv4/tcp_yeah.c     |  6 +++---
 13 files changed, 55 insertions(+), 47 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 364426a..1e6c5b04 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -834,6 +834,11 @@ enum tcp_ca_ack_event_flags {
 
 union tcp_cc_info;
 
+struct ack_sample {
+	u32 pkts_acked;
+	s32 rtt_us;
+};
+
 struct tcp_congestion_ops {
 	struct list_head	list;
 	u32 key;
@@ -857,7 +862,7 @@ struct tcp_congestion_ops {
 	/* new value of cwnd after loss (optional) */
 	u32  (*undo_cwnd)(struct sock *sk);
 	/* hook for packet ack accounting (optional) */
-	void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us);
+	void (*pkts_acked)(struct sock *sk, struct ack_sample *sample);
 	/* get info for inet_diag (optional) */
 	size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
 			   union tcp_cc_info *info);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index fd1405d..f237691 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -197,15 +197,15 @@ static void bictcp_state(struct sock *sk, u8 new_state)
 /* Track delayed acknowledgment ratio using sliding window
  * ratio = (15*ratio + sample) / 16
  */
-static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
+static void bictcp_acked(struct sock *sk, struct ack_sample *sample)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (icsk->icsk_ca_state == TCP_CA_Open) {
 		struct bictcp *ca = inet_csk_ca(sk);
 
-		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
-		ca->delayed_ack += cnt;
+		ca->delayed_ack += sample->pkts_acked - 
+			(ca->delayed_ack >> ACK_RATIO_SHIFT);
 	}
 }
 
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 167b6a3..9fbdfa5 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -294,12 +294,12 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 	ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
 }
 
-static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+static void tcp_cdg_acked(struct sock *sk, struct ack_sample *sample)
 {
 	struct cdg *ca = inet_csk_ca(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (rtt_us <= 0)
+	if (sample->rtt_us <= 0)
 		return;
 
 	/* A heuristic for filtering delayed ACKs, adapted from:
@@ -307,20 +307,20 @@ static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
 	 * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
 	 */
 	if (tp->sacked_out == 0) {
-		if (num_acked == 1 && ca->delack) {
+		if (sample->pkts_acked == 1 && ca->delack) {
 			/* A delayed ACK is only used for the minimum if it is
 			 * provenly lower than an existing non-zero minimum.
 			 */
-			ca->rtt.min = min(ca->rtt.min, rtt_us);
+			ca->rtt.min = min(ca->rtt.min, sample->rtt_us);
 			ca->delack--;
 			return;
-		} else if (num_acked > 1 && ca->delack < 5) {
+		} else if (sample->pkts_acked > 1 && ca->delack < 5) {
 			ca->delack++;
 		}
 	}
 
-	ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us);
-	ca->rtt.max = max(ca->rtt.max, rtt_us);
+	ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us);
+	ca->rtt.max = max(ca->rtt.max, sample->rtt_us);
 }
 
 static u32 tcp_cdg_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 28011fb..9817a8f 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -416,21 +416,21 @@ static void hystart_update(struct sock *sk, u32 delay)
 /* Track delayed acknowledgment ratio using sliding window
  * ratio = (15*ratio + sample) / 16
  */
-static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+static void bictcp_acked(struct sock *sk, struct ack_sample *sample)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
 	u32 delay;
 
 	/* Some calls are for duplicates without timetamps */
-	if (rtt_us < 0)
+	if (sample->rtt_us < 0)
 		return;
 
 	/* Discard delay samples right after fast recovery */
 	if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
 		return;
 
-	delay = (rtt_us << 3) / USEC_PER_MSEC;
+	delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
 	if (delay == 0)
 		delay = 1;
 
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 82f0d9e..53e21fb8 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -99,7 +99,7 @@ static inline void measure_rtt(struct sock *sk, u32 srtt)
 }
 
 static void measure_achieved_throughput(struct sock *sk,
-					u32 pkts_acked, s32 rtt)
+					struct ack_sample *sample)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -107,10 +107,10 @@ static void measure_achieved_throughput(struct sock *sk,
 	u32 now = tcp_time_stamp;
 
 	if (icsk->icsk_ca_state == TCP_CA_Open)
-		ca->pkts_acked = pkts_acked;
+		ca->pkts_acked = sample->pkts_acked;
 
-	if (rtt > 0)
-		measure_rtt(sk, usecs_to_jiffies(rtt));
+	if (sample->rtt_us > 0)
+		measure_rtt(sk, usecs_to_jiffies(sample->rtt_us));
 
 	if (!use_bandwidth_switch)
 		return;
@@ -122,7 +122,7 @@ static void measure_achieved_throughput(struct sock *sk,
 		return;
 	}
 
-	ca->packetcount += pkts_acked;
+	ca->packetcount += sample->pkts_acked;
 
 	if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
 	    now - ca->lasttime >= ca->minRTT &&
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 2ab9bbb..daf0992 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -82,30 +82,30 @@ static void tcp_illinois_init(struct sock *sk)
 }
 
 /* Measure RTT for each ack. */
-static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt)
+static void tcp_illinois_acked(struct sock *sk, struct ack_sample *sample)
 {
 	struct illinois *ca = inet_csk_ca(sk);
 
-	ca->acked = pkts_acked;
+	ca->acked = sample->pkts_acked;
 
 	/* dup ack, no rtt sample */
-	if (rtt < 0)
+	if (sample->rtt_us < 0)
 		return;
 
 	/* ignore bogus values, this prevents wraparound in alpha math */
-	if (rtt > RTT_MAX)
-		rtt = RTT_MAX;
+	if (sample->rtt_us > RTT_MAX)
+		sample->rtt_us = RTT_MAX;
 
 	/* keep track of minimum RTT seen so far */
-	if (ca->base_rtt > rtt)
-		ca->base_rtt = rtt;
+	if (ca->base_rtt > sample->rtt_us)
+		ca->base_rtt = sample->rtt_us;
 
 	/* and max */
-	if (ca->max_rtt < rtt)
-		ca->max_rtt = rtt;
+	if (ca->max_rtt < sample->rtt_us)
+		ca->max_rtt = sample->rtt_us;
 
 	++ca->cnt_rtt;
-	ca->sum_rtt += rtt;
+	ca->sum_rtt += sample->rtt_us;
 }
 
 /* Maximum queuing delay */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1578fc2..423d3af 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3195,8 +3195,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		tcp_rearm_rto(sk);
 	}
 
-	if (icsk->icsk_ca_ops->pkts_acked)
-		icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
+	if (icsk->icsk_ca_ops->pkts_acked) {
+		struct ack_sample sample = {pkts_acked, ca_rtt_us};
+
+		icsk->icsk_ca_ops->pkts_acked(sk, &sample);
+	}
 
 #if FASTRETRANS_DEBUG > 0
 	WARN_ON((int)tp->sacked_out < 0);
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 1e70fa8..8989c41 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -260,13 +260,13 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
  * newReno in increase case.
  * We work it out by following the idea from TCP-LP's paper directly
  */
-static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+static void tcp_lp_pkts_acked(struct sock *sk, struct ack_sample *sample)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct lp *lp = inet_csk_ca(sk);
 
-	if (rtt_us > 0)
-		tcp_lp_rtt_sample(sk, rtt_us);
+	if (sample->rtt_us > 0)
+		tcp_lp_rtt_sample(sk, sample->rtt_us);
 
 	/* calc inference */
 	if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 13951c4..d8c9cb3 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -107,16 +107,16 @@ EXPORT_SYMBOL_GPL(tcp_vegas_init);
  *   o min-filter RTT samples from a much longer window (forever for now)
  *     to find the propagation delay (baseRTT)
  */
-void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+void tcp_vegas_pkts_acked(struct sock *sk, struct ack_sample *sample)
 {
 	struct vegas *vegas = inet_csk_ca(sk);
 	u32 vrtt;
 
-	if (rtt_us < 0)
+	if (sample->rtt_us < 0)
 		return;
 
 	/* Never allow zero rtt or baseRTT */
-	vrtt = rtt_us + 1;
+	vrtt = sample->rtt_us + 1;
 
 	/* Filter to find propagation delay: */
 	if (vrtt < vegas->baseRTT)
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index ef9da53..9fa2eea 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -17,7 +17,7 @@ struct vegas {
 
 void tcp_vegas_init(struct sock *sk);
 void tcp_vegas_state(struct sock *sk, u8 ca_state);
-void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
+void tcp_vegas_pkts_acked(struct sock *sk, struct ack_sample *sample);
 void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
 size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
 			  union tcp_cc_info *info);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 0d094b9..9ca3175 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -69,16 +69,16 @@ static void tcp_veno_init(struct sock *sk)
 }
 
 /* Do rtt sampling needed for Veno. */
-static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+static void tcp_veno_pkts_acked(struct sock *sk, struct ack_sample *sample)
 {
 	struct veno *veno = inet_csk_ca(sk);
 	u32 vrtt;
 
-	if (rtt_us < 0)
+	if (sample->rtt_us < 0)
 		return;
 
 	/* Never allow zero rtt or baseRTT */
-	vrtt = rtt_us + 1;
+	vrtt = sample->rtt_us + 1;
 
 	/* Filter to find propagation delay: */
 	if (vrtt < veno->basertt)
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index c10732e..99c151a 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -99,12 +99,12 @@ static void westwood_filter(struct westwood *w, u32 delta)
  * Called after processing group of packets.
  * but all westwood needs is the last sample of srtt.
  */
-static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt)
+static void tcp_westwood_pkts_acked(struct sock *sk, struct ack_sample *sample)
 {
 	struct westwood *w = inet_csk_ca(sk);
 
-	if (rtt > 0)
-		w->rtt = usecs_to_jiffies(rtt);
+	if (sample->rtt_us > 0)
+		w->rtt = usecs_to_jiffies(sample->rtt_us);
 }
 
 /*
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 17d3566..3a507c5 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -56,15 +56,15 @@ static void tcp_yeah_init(struct sock *sk)
 	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
 }
 
-static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
+static void tcp_yeah_pkts_acked(struct sock *sk, struct ack_sample *sample)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct yeah *yeah = inet_csk_ca(sk);
 
 	if (icsk->icsk_ca_state == TCP_CA_Open)
-		yeah->pkts_acked = pkts_acked;
+		yeah->pkts_acked = sample->pkts_acked;
 
-	tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
+	tcp_vegas_pkts_acked(sk, sample);
 }
 
 static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
-- 
1.8.1


* [RFC PATCH v4 net-next 2/4] tcp:  refactor struct tcp_skb_cb
  2015-07-25  2:47 [RFC PATCH v4 net-next 0/4] tcp: add NV congestion control Lawrence Brakmo
  2015-07-25  2:47 ` [RFC PATCH v4 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked() Lawrence Brakmo
@ 2015-07-25  2:47 ` Lawrence Brakmo
  2015-07-25  2:47 ` [RFC PATCH v4 net-next 3/4] tcp: add in_flight to tcp_skb_cb Lawrence Brakmo
  2015-07-25  2:47 ` [RFC PATCH v4 net-next 4/4] tcp: add NV congestion control Lawrence Brakmo
  3 siblings, 0 replies; 9+ messages in thread
From: Lawrence Brakmo @ 2015-07-25  2:47 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng

Refactor tcp_skb_cb to create two overlapping areas that store
state for incoming or outgoing skbs, based on comments by
Neal Cardwell on the tcp_nv patch:

   AFAICT this patch would not require an increase in the size of
   sk_buff cb[] if it were to take advantage of the fact that the
   tcp_skb_cb header.h4 and header.h6 fields are only used in the packet
   reception code path, and this in_flight field is only used on the
   transmit side.
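
As a rough sketch of how the two overlapping areas end up being used
(the in_flight field only appears in the next patch and is shown here
purely as an illustration):

/* transmit path: the skb is outgoing, so the header area is unused and
 * tx state can live in the same bytes (field added by the next patch).
 */
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una;

/* receive path: the skb is incoming, so the tx area is unused and the
 * IP control block is reachable exactly as before.
 */
struct inet_skb_parm *opt = &TCP_SKB_CB(skb)->header.h4;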

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/net/tcp.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1e6c5b04..7c510ed 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -755,11 +755,16 @@ struct tcp_skb_cb {
 	/* 1 byte hole */
 	__u32		ack_seq;	/* Sequence number ACK'd	*/
 	union {
-		struct inet_skb_parm	h4;
+		struct {
+			/* There is space for up to 20 bytes */
+		} tx;   /* only used for outgoing skbs */
+		union {
+			struct inet_skb_parm	h4;
 #if IS_ENABLED(CONFIG_IPV6)
-		struct inet6_skb_parm	h6;
+			struct inet6_skb_parm	h6;
 #endif
-	} header;	/* For incoming frames		*/
+		} header;	/* For incoming skbs */
+	};
 };
 
 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
-- 
1.8.1


* [RFC PATCH v4 net-next 3/4] tcp: add in_flight to tcp_skb_cb
  2015-07-25  2:47 [RFC PATCH v4 net-next 0/4] tcp: add NV congestion control Lawrence Brakmo
  2015-07-25  2:47 ` [RFC PATCH v4 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked() Lawrence Brakmo
  2015-07-25  2:47 ` [RFC PATCH v4 net-next 2/4] tcp: refactor struct tcp_skb_cb Lawrence Brakmo
@ 2015-07-25  2:47 ` Lawrence Brakmo
  2015-07-25  2:47 ` [RFC PATCH v4 net-next 4/4] tcp: add NV congestion control Lawrence Brakmo
  3 siblings, 0 replies; 9+ messages in thread
From: Lawrence Brakmo @ 2015-07-25  2:47 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng

Add an in_flight field (bytes in flight when the packet was sent)
to the tx component of tcp_skb_cb and make it available to
congestion control modules' pkts_acked() function through the
ack_sample argument.
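
For illustration (a hypothetical consumer, not part of this patch), a
congestion control module could use the new field to estimate the
throughput achieved by the data in flight when the acked packet was sent:

/* Hypothetical sketch; assumes linux/math64.h for div64_u64(). */
static void demo_pkts_acked(struct sock *sk, struct ack_sample *sample)
{
	u64 rate_bps;

	if (sample->rtt_us <= 0 || sample->in_flight == 0)
		return;

	/* bytes in flight over one RTT, converted to bits per second */
	rate_bps = div64_u64((u64)sample->in_flight * 8 * USEC_PER_SEC,
			     (u64)sample->rtt_us);
	pr_debug("achieved rate: ~%llu bit/s\n",
		 (unsigned long long)rate_bps);
}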

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/net/tcp.h     | 2 ++
 net/ipv4/tcp_input.c  | 5 ++++-
 net/ipv4/tcp_output.c | 4 +++-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7c510ed..f850404 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -757,6 +757,7 @@ struct tcp_skb_cb {
 	union {
 		struct {
 			/* There is space for up to 20 bytes */
+			__u32 in_flight;/* Bytes in flight when packet sent */
 		} tx;   /* only used for outgoing skbs */
 		union {
 			struct inet_skb_parm	h4;
@@ -842,6 +843,7 @@ union tcp_cc_info;
 struct ack_sample {
 	u32 pkts_acked;
 	s32 rtt_us;
+	u32 in_flight;
 };
 
 struct tcp_congestion_ops {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 423d3af..3ab4178 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3068,6 +3068,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	long ca_rtt_us = -1L;
 	struct sk_buff *skb;
 	u32 pkts_acked = 0;
+	u32 last_in_flight = 0;
 	bool rtt_update;
 	int flag = 0;
 
@@ -3107,6 +3108,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			if (!first_ackt.v64)
 				first_ackt = last_ackt;
 
+			last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
 			reord = min(pkts_acked, reord);
 			if (!after(scb->end_seq, tp->high_seq))
 				flag |= FLAG_ORIG_SACK_ACKED;
@@ -3196,7 +3198,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	}
 
 	if (icsk->icsk_ca_ops->pkts_acked) {
-		struct ack_sample sample = {pkts_acked, ca_rtt_us};
+		struct ack_sample sample = {pkts_acked, ca_rtt_us,
+					    last_in_flight};
 
 		icsk->icsk_ca_ops->pkts_acked(sk, &sample);
 	}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7105784..e9deab5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -920,9 +920,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	int err;
 
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
+	tp = tcp_sk(sk);
 
 	if (clone_it) {
 		skb_mstamp_get(&skb->skb_mstamp);
+		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
+			- tp->snd_una;
 
 		if (unlikely(skb_cloned(skb)))
 			skb = pskb_copy(skb, gfp_mask);
@@ -933,7 +936,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	}
 
 	inet = inet_sk(sk);
-	tp = tcp_sk(sk);
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
-- 
1.8.1


* [RFC PATCH v4 net-next 4/4] tcp: add NV congestion control
  2015-07-25  2:47 [RFC PATCH v4 net-next 0/4] tcp: add NV congestion control Lawrence Brakmo
                   ` (2 preceding siblings ...)
  2015-07-25  2:47 ` [RFC PATCH v4 net-next 3/4] tcp: add in_flight to tcp_skb_cb Lawrence Brakmo
@ 2015-07-25  2:47 ` Lawrence Brakmo
  3 siblings, 0 replies; 9+ messages in thread
From: Lawrence Brakmo @ 2015-07-25  2:47 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng

This is a request for comments.

TCP-NV (New Vegas) is a major update to TCP-Vegas.
An earlier version of NV was presented at 2010's LPC.
It is a delay-based congestion avoidance algorithm for the
data center. This version has been tested within a
10G rack where the HW RTTs are 20-50us.

A description of TCP-NV, including implementation
details as well as experimental results, can be found at:
http://www.brakmo.org/networking/tcp-nv/TCPNV.html

The current version exposes many module parameters to support
experimentation with its settings.
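
To make the congestion-detection math concrete, here is a worked example
with made-up numbers (mss, RTT and rate below are illustrative, not
measurements); it mirrors the cwnd_by_slope computation in tcpnv_acked():

/* Hypothetical numbers: mss = 1448 bytes, min RTT = 40 usec, and a
 * maximum observed rate of ~5 Gbit/s. tcp_nv keeps rates in units of
 * 100 bit/s, so 5 Gbit/s -> 50,000,000.
 */
static u32 nv_example_max_win(void)
{
	u32 mss = 1448, min_rtt_us = 40, rtt_max_rate = 50000000;
	u32 cwnd_by_slope;

	cwnd_by_slope = (u32)div64_u64((u64)rtt_max_rate * min_rtt_us,
				       (u64)(80000 * mss));	/* ~17 */

	/* cwnd above cwnd_by_slope + nv_pad is treated as congestion;
	 * cwnd at or below that minus nv_pad_buffer may keep growing.
	 */
	return cwnd_by_slope + 10;	/* nv_pad default of 10 -> ~27 */
}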

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 net/ipv4/Kconfig  |  16 ++
 net/ipv4/Makefile |   1 +
 net/ipv4/tcp_nv.c | 479 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 496 insertions(+)
 create mode 100644 net/ipv4/tcp_nv.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6fb3c90..f11f2f8 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -539,6 +539,22 @@ config TCP_CONG_VEGAS
 	window. TCP Vegas should provide less packet loss, but it is
 	not as aggressive as TCP Reno.
 
+config TCP_CONG_NV
+       tristate "TCP NV"
+       default n
+       ---help---
+       TCP NV is a follow-up to TCP Vegas. It has been modified to deal with
+       10G networks and the measurement noise introduced by LRO, GRO and
+       interrupt coalescence. In addition, it will decrease its cwnd
+       multiplicatively instead of linearly.
+
+       Note that in general congestion avoidance (cwnd decreased when # packets
+       queued grows) cannot coexist with congestion control (cwnd decreased only
+       when there is packet loss) due to fairness issues. One scenario when they
+       can coexist safely is when the CA flows have RTTs << CC flows RTTs.
+
+       For further details see http://www.brakmo.org/networking/tcp-nv/
+
 config TCP_CONG_SCALABLE
 	tristate "Scalable TCP"
 	default n
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index efc43f3..06f335f 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
 obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
 obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o
 obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
 obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
new file mode 100644
index 0000000..c4379b8
--- /dev/null
+++ b/net/ipv4/tcp_nv.c
@@ -0,0 +1,479 @@
+/*
+ * TCP NV: TCP with Congestion Avoidance
+ *
+ * TCP-NV is a successor of TCP-Vegas that has been developed to
+ * deal with the issues that occur in modern networks. 
+ * Like TCP-Vegas, TCP-NV supports true congestion avoidance,
+ * the ability to detect congestion before packet losses occur.
+ * When congestion (queue buildup) starts to occur, TCP-NV
+ * predicts what the cwnd size should be for the current
+ * throughput and it reduces the cwnd proportionally to
+ * the difference between the current cwnd and the predicted cwnd.
+ * TCP-NV behaves like Reno when no congestion is detected, or when
+ * recovering from packet losses.
+ *
+ * TODO:
+ * 1) Add option to not decrease cwnd on losses below certain level
+ * 2) Add mechanism to deal with reverse congestion.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/math64.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+
+/* TCP NV parameters */
+static int nv_enable __read_mostly = 1;
+static int nv_pad __read_mostly = 10;
+static int nv_pad_buffer __read_mostly = 2;
+static int nv_reset_period __read_mostly = 5;
+static int nv_min_cwnd = 10;
+static int nv_dec_eval_min_calls = 100;
+static int nv_ssthresh_eval_min_calls = 30;
+static int nv_rtt_min_cnt = 2;
+static int nv_cong_decrease_mult = 30*128/100;
+static int nv_ssthresh_factor = 8;
+static int nv_rtt_factor = 128;
+static int nv_rtt_cnt_dec_delta = 20; /* dec cwnd by this many RTTs */
+static int nv_dec_factor = 5;  /* actual value is factor/8 */
+static int nv_loss_dec_factor = 820; /* on loss reduce cwnd by 20% */
+static int nv_cwnd_growth_factor = 2; /* larger => cwnd grows slower */
+
+module_param(nv_pad, int, 0644);
+MODULE_PARM_DESC(nv_pad, "extra packets above congestion level");
+module_param(nv_pad_buffer, int, 0644);
+MODULE_PARM_DESC(nv_pad_buffer, "no growth buffer zone");
+module_param(nv_reset_period, int, 0644);
+MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)");
+module_param(nv_min_cwnd, int, 0644);
+MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value"
+		 " without losses");
+module_param(nv_dec_eval_min_calls, int, 0644);
+MODULE_PARM_DESC(nv_dec_eval_min_calls, "Wait for this many data points "
+		 "before declaring congestion (< 256)");
+module_param(nv_ssthresh_eval_min_calls, int, 0644);
+MODULE_PARM_DESC(nv_ssthresh_eval_min_calls, "Wait for this many data points "
+		 "before declaring congestion during initial slow-start");
+module_param(nv_rtt_min_cnt, int, 0644);
+MODULE_PARM_DESC(nv_rtt_min_cnt, "Wait for this many RTTs before declaring"
+		 " congestion (<64)");
+module_param(nv_cong_decrease_mult, int, 0644);
+MODULE_PARM_DESC(nv_cong_decrease_mult, "Congestion decrease factor");
+module_param(nv_ssthresh_factor, int, 0644);
+MODULE_PARM_DESC(nv_ssthresh_factor, "ssthresh factor");
+module_param(nv_rtt_factor, int, 0644);
+MODULE_PARM_DESC(nv_rtt_factor, "rtt averaging factor (0-256)");
+module_param(nv_rtt_cnt_dec_delta, int, 0644);
+MODULE_PARM_DESC(nv_rtt_cnt_dec_delta, "decrease cwnd for this many RTTs "
+		 "every 100 RTTs");
+module_param(nv_dec_factor, int, 0644);
+MODULE_PARM_DESC(nv_dec_factor, "decrease cwnd every ~192 RTTS by factor/8");
+module_param(nv_loss_dec_factor, int, 0644);
+MODULE_PARM_DESC(nv_loss_dec_factor, "on loss new cwnd = cwnd * this / 1024");
+module_param(nv_cwnd_growth_factor, int, 0644);
+MODULE_PARM_DESC(nv_cwnd_growth_factor, "larger => cwnd grows slower");
+
+/* TCP NV Parameters */
+struct tcpnv {
+	unsigned long nv_min_rtt_reset_jiffies;  /* when to switch to
+						  * nv_min_rtt_new */
+	u32 cnt;		/* increase cwnd by 1 after ACKs */
+	u32 loss_cwnd;	/* cwnd at last loss */
+	u8  nv_enable:1,
+		nv_allow_cwnd_growth:1, 	/* whether cwnd can grow */
+		nv_rtt_cnt:6;	/* RTTs without making ca decision */
+	u8  nv_eval_call_cnt;/* call count since last eval */
+	u8  nv_min_cwnd;	/* nv won't make a ca decision if cwnd is
+				 * smaller than this. It may grow to handle
+				 * TSO, LRO and interrupt coalescence because
+				 * with these a small cwnd cannot saturate
+				 * the link. Note that this is different from
+				 * sysctl_tcp_nv_min_cwnd */
+	u8  available;
+	u32 nv_last_rtt;	/* last rtt */
+	u32 nv_min_rtt;		/* active min rtt. Used to determine slope */
+	u32 nv_min_rtt_new;	/* min rtt for future use */
+	u32 nv_rtt_max_rate;  	/* max rate seen during current RTT */
+	u32 nv_rtt_start_seq;	/* current RTT ends when packet arrives
+				 * acking beyond nv_rtt_start_seq */
+	u32 nv_last_snd_una;	/* Previous value of tp->snd_una. It is
+				 * used to determine bytes acked since last
+				 * call to bictcp_acked */
+	u32 nv_no_cong_cnt;	/* Consecutive no congestion decisions */
+	u32 nv_rtt_cnt_dec;	/* RTTs since last temporary cwnd decrease */
+};
+
+#define NV_INIT_RTT	  0xffffffff
+#define NV_MIN_CWND	  4
+#define NV_MIN_CWND_GROW  2
+#define NV_TSO_CWND_BOUND 80
+
+static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	ca->loss_cwnd = 0;
+	ca->nv_no_cong_cnt = 0;
+	ca->cnt = 0;
+	ca->nv_rtt_cnt = 0;
+	ca->nv_rtt_cnt_dec = 0;
+	ca->nv_allow_cwnd_growth = 1;
+	ca->nv_last_rtt = 0;
+	ca->nv_rtt_max_rate = 0;
+	ca->nv_rtt_start_seq = tp->snd_una;
+	ca->nv_eval_call_cnt = 0;
+	ca->nv_last_snd_una = tp->snd_una;
+}
+
+static void tcpnv_init(struct sock *sk)
+{
+	struct tcpnv *ca = inet_csk_ca(sk);
+
+	tcpnv_reset(ca, sk);
+
+	ca->nv_min_rtt_reset_jiffies = jiffies + 2*HZ;
+	ca->nv_min_rtt = NV_INIT_RTT;
+	ca->nv_min_rtt_new = NV_INIT_RTT;
+	ca->nv_enable = nv_enable;
+	ca->nv_min_cwnd = NV_MIN_CWND;
+	if (nv_dec_eval_min_calls > 255)
+		nv_dec_eval_min_calls = 255;
+	if (nv_rtt_min_cnt > 63)
+		nv_rtt_min_cnt = 63;
+}
+
+static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcpnv *ca = inet_csk_ca(sk);
+
+	if (!tcp_is_cwnd_limited(sk))
+		return;
+
+	/* Only grow cwnd if NV has not detected congestion */
+	if (nv_enable && ca->nv_enable && !ca->nv_allow_cwnd_growth)
+		return;
+
+	if (tcp_in_slow_start(tp)) {
+		acked = tcp_slow_start(tp, acked);
+		if (!acked)
+			return;
+	}
+	if (ca->cnt == 0 || !(nv_enable || ca->nv_enable))
+		ca->cnt = tp->snd_cwnd;
+
+	tcp_cong_avoid_ai(tp, ca->cnt, acked);
+}
+
+static u32 tcpnv_recalc_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcpnv *ca = inet_csk_ca(sk);
+
+	ca->loss_cwnd = tp->snd_cwnd;
+	return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U); 
+}
+
+static u32 tcpnv_undo_cwnd(struct sock *sk)
+{
+	struct tcpnv *ca = inet_csk_ca(sk);
+
+	return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
+static void tcpnv_state(struct sock *sk, u8 new_state)
+{
+	struct tcpnv *ca = inet_csk_ca(sk);
+
+	if (new_state == TCP_CA_Open) {
+		ca->nv_enable = 1;
+		tcpnv_reset(ca, sk);
+	} else if (new_state == TCP_CA_Loss) {
+		ca->nv_enable = 0;
+	}
+}
+
+/* Do congestion avoidance calculations for TCP-NV
+ */
+static void tcpnv_acked(struct sock *sk, struct ack_sample *sample)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcpnv *ca = inet_csk_ca(sk);
+	unsigned long now = jiffies;
+	s64 rate64 = 0;
+	u32 rate, max_win, cwnd_by_slope;
+	u32 avg_rtt;
+	u32 bytes_acked = 0;
+
+	/* Some calls are for duplicates without timestamps */
+	if (sample->rtt_us < 0)
+		return;
+
+	/* If not in TCP_CA_Open state, skip. */
+	if (icsk->icsk_ca_state != TCP_CA_Open)
+		return;
+
+	/* If NV mode is not enabled, behave like Reno */
+	if (!nv_enable  ||  !ca->nv_enable) {
+		ca->nv_allow_cwnd_growth = 1;
+		return;
+	}
+
+	bytes_acked = tp->snd_una - ca->nv_last_snd_una;
+	ca->nv_last_snd_una = tp->snd_una;
+
+	if (sample->in_flight == 0)
+		return;
+
+	/* Calculate moving average of RTT */
+	if (nv_rtt_factor > 0) {
+		if (ca->nv_last_rtt > 0) {
+			avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor +
+				   ((u64)ca->nv_last_rtt)
+				   * (256 - nv_rtt_factor)) >> 8;
+		} else {
+			avg_rtt = sample->rtt_us;
+			ca->nv_min_rtt = avg_rtt << 1;
+		}
+		ca->nv_last_rtt = avg_rtt;
+	} else {
+		avg_rtt = sample->rtt_us;
+	}
+
+	/* rate in 100's bits per second */
+	rate64 = ((u64)sample->in_flight) * 8000000;
+	rate = (u32)div64_u64(rate64, (u64)(avg_rtt*100));
+
+	/* Remember the maximum rate seen during this RTT
+	 * Note: It may be more than one RTT. This function should be
+	 *       called at least nv_dec_eval_min_calls times.
+	 */
+	if (ca->nv_rtt_max_rate < rate)
+		ca->nv_rtt_max_rate = rate;
+
+	/* We have valid information, increment counter */
+	if (ca->nv_eval_call_cnt < 255)
+		ca->nv_eval_call_cnt++;
+
+	/* update min rtt if necessary */
+	if (avg_rtt < ca->nv_min_rtt)
+		ca->nv_min_rtt = avg_rtt;
+
+	/* update future min_rtt if necessary */
+	if (avg_rtt < ca->nv_min_rtt_new)
+		ca->nv_min_rtt_new = avg_rtt;
+
+	/* nv_min_rtt is updated with the minimum (possibly averaged) rtt
+	 * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a
+	 * warm reset). This new nv_min_rtt will continue to be updated
+	 * and used for another sysctl_tcp_nv_reset_period seconds,
+	 * when it will be updated again.
+	 * In practice we introduce some randomness, so the actual period used
+	 * is chosen randomly from the range:
+	 *   [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4)
+	 */
+	if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) {
+		unsigned char rand;
+		ca->nv_min_rtt = ca->nv_min_rtt_new;
+		ca->nv_min_rtt_new = NV_INIT_RTT;
+		get_random_bytes(&rand, 1);
+		ca->nv_min_rtt_reset_jiffies =
+			now + ((nv_reset_period*(384 + rand)*HZ)>>9);
+		/* Every so often we decrease nv_min_cwnd in case previous
+		 *  value is no longer accurate.
+		 */
+		ca->nv_min_cwnd = max(ca->nv_min_cwnd/2, NV_MIN_CWND);
+	}
+
+	/* Once per RTT check if we need to do congestion avoidance */
+	if (before(ca->nv_rtt_start_seq, tp->snd_una)) {
+		ca->nv_rtt_start_seq = tp->snd_nxt;
+		if (ca->nv_rtt_cnt < 63)
+			/* Increase counter for RTTs without CA decision */
+			ca->nv_rtt_cnt++;
+		if (ca->nv_rtt_cnt_dec < 255)
+			/* Increase counter for temporary cwnd decrease */
+			ca->nv_rtt_cnt_dec++;
+
+		/* If this function is only called once within an RTT
+		 * the cwnd is probably too small (in some cases due to
+		 * tso, lro or interrupt coalescence), so we increase
+		 * nv_min_cwnd.
+		 */
+		if (ca->nv_eval_call_cnt == 1
+		    && bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache
+		    && ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)
+		    && ca->nv_rtt_cnt_dec < 192) {
+			ca->nv_min_cwnd = min(ca->nv_min_cwnd
+					      + NV_MIN_CWND_GROW,
+					      NV_TSO_CWND_BOUND + 1);
+			ca->nv_rtt_start_seq = tp->snd_nxt +
+				ca->nv_min_cwnd*tp->mss_cache;
+			ca->nv_eval_call_cnt = 0;
+			ca->nv_allow_cwnd_growth = 1;
+			return;
+		}
+
+		/* Every 192 to 320 RTTs decrease cwnd to get better min RTT
+		 * measurement. In practice we accomplish this by initializing
+		 * nv_rtt_cnt_dec randomly from the range [0, 128) and
+		 * stopping at 320.
+		 * We keep the value low for nv_rtt_cnt_dec_delta RTTs and then
+		 * we restore cwnd to its previous value (by setting
+		 * ssthresh to the previous value).
+		 */
+		if (ca->nv_rtt_cnt_dec == 320) {
+			/* decrease cwnd and ssthresh */
+			tp->snd_cwnd =
+				max((unsigned int)nv_min_cwnd,
+				    ((tp->snd_cwnd * nv_dec_factor) >> 3));
+			tp->snd_ssthresh =
+				max(tp->snd_cwnd,
+				    ((tp->snd_ssthresh * nv_dec_factor) >> 3));
+			ca->nv_allow_cwnd_growth = 0;
+			return;
+		} else if (ca->nv_rtt_cnt_dec > 320) {
+			if (ca->nv_rtt_cnt_dec - 320 >= nv_rtt_cnt_dec_delta) {
+				/* Restore ssthresh to restore cwnd */
+				unsigned char rand;
+				get_random_bytes(&rand, 1);
+				ca->nv_rtt_cnt_dec = rand >> 1;
+				tp->snd_ssthresh = (tp->snd_ssthresh << 3)
+					/ nv_dec_factor;
+				ca->nv_allow_cwnd_growth = 1;
+				ca->nv_no_cong_cnt = 0;
+			}
+			return;
+		}
+
+		/* Find the ideal cwnd for current rate from slope
+		 * slope = 80000.0 * mss / nv_min_rtt
+		 * cwnd_by_slope = nv_rtt_max_rate / slope
+		 */
+		cwnd_by_slope = (u32)
+			div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt,
+				  (u64)(80000 * tp->mss_cache));
+		max_win = cwnd_by_slope + nv_pad;
+
+		/* If cwnd > max_win, decrease cwnd
+		 * if cwnd < max_win, grow cwnd
+		 * else leave the same
+		 */
+		if (tp->snd_cwnd > max_win) {
+			/* there is congestion, check that it is ok
+			 * to make a CA decision
+			 * 1. We should have at least nv_dec_eval_min_calls
+			 *    data points before making a CA decision
+			 *    2. We only make a congestion decision after
+			 *    nv_rtt_min_cnt RTTs
+			 */
+			if (ca->nv_rtt_cnt < nv_rtt_min_cnt)
+				return;
+			else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
+				if (ca->nv_eval_call_cnt <
+				    nv_ssthresh_eval_min_calls)
+					return;
+			} else if (ca->nv_eval_call_cnt <
+				   nv_dec_eval_min_calls) {
+				return;
+			}
+
+			/* We have enough data to determine we are congested */
+			ca->nv_allow_cwnd_growth = 0;
+			tp->snd_ssthresh =
+				(nv_ssthresh_factor * max_win) >> 3;
+			if (tp->snd_cwnd - max_win > 2) {
+				/* gap > 2, we do exponential cwnd decrease */
+				int dec;
+				dec = max(2U, ((tp->snd_cwnd - max_win) *
+					       nv_cong_decrease_mult) >> 7);
+				tp->snd_cwnd -= dec;
+			} else if (nv_cong_decrease_mult > 0) {
+				tp->snd_cwnd = max_win;
+			}
+			ca->cnt = tp->snd_cwnd;
+			ca->nv_no_cong_cnt = 0;
+		} else if (tp->snd_cwnd <=  max_win - nv_pad_buffer) {
+			/* We allow growth of cwnd every RTT since we would
+			 * have grown even if we waited (just slower)
+			 */
+			ca->nv_allow_cwnd_growth = 1;
+			ca->nv_no_cong_cnt++;
+			if (nv_cwnd_growth_factor > 0 &&
+			    ca->nv_no_cong_cnt > nv_cwnd_growth_factor) {
+				ca->cnt = max(ca->cnt >> 1, (u32) 4);
+				ca->nv_no_cong_cnt = 0;
+			}
+		} else {
+			ca->nv_allow_cwnd_growth = 0;
+		}
+
+		/* update state */
+		ca->nv_eval_call_cnt = 0;
+		ca->nv_rtt_cnt = 0;
+		ca->nv_rtt_max_rate = 0;
+
+		/* Don't want to make cwnd < nv_min_cwnd
+		 * (it wasn't before, if it is now is because nv
+		 *  decreased it).
+		 */
+		if (tp->snd_cwnd < nv_min_cwnd)
+			tp->snd_cwnd = nv_min_cwnd;
+
+  }
+}
+
+/* Extract info for Tcp socket info provided via netlink */
+size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr,
+		       union tcp_cc_info *info)
+{
+	const struct tcpnv *ca = inet_csk_ca(sk);
+
+	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+		info->vegas.tcpv_enabled = ca->nv_enable
+			&& nv_enable;
+		info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt;
+		info->vegas.tcpv_rtt = ca->nv_last_rtt;
+		info->vegas.tcpv_minrtt = ca->nv_min_rtt;
+
+		*attr = INET_DIAG_VEGASINFO;
+		return sizeof(struct tcpvegas_info);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tcpnv_get_info);
+
+static struct tcp_congestion_ops tcpnv __read_mostly = {
+	.init		= tcpnv_init,
+	.ssthresh	= tcpnv_recalc_ssthresh,
+	.cong_avoid	= tcpnv_cong_avoid,
+	.set_state	= tcpnv_state,
+	.undo_cwnd	= tcpnv_undo_cwnd,
+	.pkts_acked     = tcpnv_acked,
+	.get_info	= tcpnv_get_info,
+
+	.owner		= THIS_MODULE,
+	.name		= "nv",
+};
+
+static int __init tcpnv_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE);
+
+	return tcp_register_congestion_control(&tcpnv);
+}
+
+static void __exit tcpnv_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcpnv);
+}
+
+module_init(tcpnv_register);
+module_exit(tcpnv_unregister);
+
+MODULE_AUTHOR("Lawrence Brakmo");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP NV");
+MODULE_VERSION("1.0");
-- 
1.8.1


* Re: [RFC PATCH v4 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked()
  2015-07-25  2:47 ` [RFC PATCH v4 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked() Lawrence Brakmo
@ 2015-07-25  6:22   ` Eric Dumazet
  2015-07-27 18:46   ` Stephen Hemminger
  1 sibling, 0 replies; 9+ messages in thread
From: Eric Dumazet @ 2015-07-25  6:22 UTC (permalink / raw)
  To: Lawrence Brakmo; +Cc: netdev, Kernel Team, Neal Cardwell, Yuchung Cheng

On Fri, 2015-07-24 at 19:47 -0700, Lawrence Brakmo wrote:
> Replace 2 arguments (cnt and rtt) in the congestion control modules'
> pkts_acked() function with a struct. This will allow adding more
> information without having to modify existing congestion control
> modules (tcp_nv in particular needs bytes in flight when packet
> was sent).
> 
>  
> +struct ack_sample {
> +	u32 pkts_acked;
> +	s32 rtt_us;
> +};
> +
>  struct tcp_congestion_ops {
>  	struct list_head	list;
>  	u32 key;
> @@ -857,7 +862,7 @@ struct tcp_congestion_ops {
>  	/* new value of cwnd after loss (optional) */
>  	u32  (*undo_cwnd)(struct sock *sk);
>  	/* hook for packet ack accounting (optional) */
> -	void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us);
> +	void (*pkts_acked)(struct sock *sk, struct ack_sample *sample);

This probably should be a const struct ack_sample *sample ?


* Re: [RFC PATCH v4 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked()
  2015-07-25  2:47 ` [RFC PATCH v4 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked() Lawrence Brakmo
  2015-07-25  6:22   ` Eric Dumazet
@ 2015-07-27 18:46   ` Stephen Hemminger
  2015-07-27 19:30     ` Lawrence Brakmo
  1 sibling, 1 reply; 9+ messages in thread
From: Stephen Hemminger @ 2015-07-27 18:46 UTC (permalink / raw)
  To: Lawrence Brakmo
  Cc: netdev, Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng

On Fri, 24 Jul 2015 19:47:03 -0700
Lawrence Brakmo <brakmo@fb.com> wrote:

> Replace 2 arguments (cnt and rtt) in the congestion control modules'
> pkts_acked() function with a struct. This will allow adding more
> information without having to modify existing congestion control
> modules (tcp_nv in particular needs bytes in flight when packet
> was sent).
> 
> As proposed by Neal Cardwell in his comments to the tcp_nv patch.

Adding a layer of indirection makes code changes easier, but makes
the code slower. Arguments are passed in registers, and putting an
additional level of indirection only matters if you can't change
all the CC modules. Since this is the kernel and API compatibility
doesn't matter, just pass more arguments.


* Re: [RFC PATCH v4 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked()
  2015-07-27 18:46   ` Stephen Hemminger
@ 2015-07-27 19:30     ` Lawrence Brakmo
  2015-07-28  3:42       ` Alexei Starovoitov
  0 siblings, 1 reply; 9+ messages in thread
From: Lawrence Brakmo @ 2015-07-27 19:30 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, Kernel Team, Neal Cardwell, Eric Dumazet, Yuchung Cheng



On 7/27/15, 11:46 AM, "Stephen Hemminger" <stephen@networkplumber.org>
wrote:

>On Fri, 24 Jul 2015 19:47:03 -0700
>Lawrence Brakmo <brakmo@fb.com> wrote:
>
>> Replace 2 arguments (cnt and rtt) in the congestion control modules'
>> pkts_acked() function with a struct. This will allow adding more
>> information without having to modify existing congestion control
>> modules (tcp_nv in particular needs bytes in flight when packet
>> was sent).
>> 
>> As proposed by Neal Cardwell in his comments to the tcp_nv patch.
>
>Adding a layer of indirection makes code changes easier, but makes
>the code slower. Arguments are passed in registers, and putting an
>additional level of indirection only matters if you can't change
>all the CC modules. Since this is the kernel and API compatability
>doesn't matter, just pass more arguments.

I prefer the cleanliness of passing a structure and don't think the
overhead will be significant enough to worry about it.
Will the compiler pass struct values in registers if the struct is
passed by value?

I will be happy to do it either way (I did it like Stephen proposes
originally). What does everyone else think?



* Re: [RFC PATCH v4 net-next 1/4] tcp: replace cnt & rtt with struct in pkts_acked()
  2015-07-27 19:30     ` Lawrence Brakmo
@ 2015-07-28  3:42       ` Alexei Starovoitov
  0 siblings, 0 replies; 9+ messages in thread
From: Alexei Starovoitov @ 2015-07-28  3:42 UTC (permalink / raw)
  To: Lawrence Brakmo
  Cc: Stephen Hemminger, netdev, Kernel Team, Neal Cardwell,
	Eric Dumazet, Yuchung Cheng

On Mon, Jul 27, 2015 at 07:30:39PM +0000, Lawrence Brakmo wrote:
> 
> 
> I prefer the cleanliness of passing a structure and don't think the
> overhead will be significant enough to worry about it.
> Will the compiler pass struct values in registers if the struct is
> passed by value?

Passing a struct by value is not a matter of optimization but of the C ABI.
On sparc even 4-byte structs are passed by reference, which creates
an extra copy of the struct on the stack.
Struct returns are even worse. I would always pass structs by reference
in C. C++ is a different matter.
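
As a rough illustration of the two options being discussed (hypothetical
declarations, not from the thread); the const form is what Eric suggested
earlier in the thread:

/* by value: on some ABIs (e.g. sparc) the caller builds a stack copy
 * and passes a hidden pointer to it.
 */
void cc_pkts_acked_byval(struct sock *sk, struct ack_sample sample);

/* by reference: what the patch does; no copy is made, and const
 * documents that the callee only reads the sample.
 */
void cc_pkts_acked_byref(struct sock *sk, const struct ack_sample *sample);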


