1b87d8561SStephen Hemminger /* 2b87d8561SStephen Hemminger * TCP Vegas congestion control 3b87d8561SStephen Hemminger * 4b87d8561SStephen Hemminger * This is based on the congestion detection/avoidance scheme described in 5b87d8561SStephen Hemminger * Lawrence S. Brakmo and Larry L. Peterson. 6b87d8561SStephen Hemminger * "TCP Vegas: End to end congestion avoidance on a global internet." 7b87d8561SStephen Hemminger * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, 8b87d8561SStephen Hemminger * October 1995. Available from: 9b87d8561SStephen Hemminger * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps 10b87d8561SStephen Hemminger * 11b87d8561SStephen Hemminger * See http://www.cs.arizona.edu/xkernel/ for their implementation. 12b87d8561SStephen Hemminger * The main aspects that distinguish this implementation from the 13b87d8561SStephen Hemminger * Arizona Vegas implementation are: 14b87d8561SStephen Hemminger * o We do not change the loss detection or recovery mechanisms of 15b87d8561SStephen Hemminger * Linux in any way. Linux already recovers from losses quite well, 16b87d8561SStephen Hemminger * using fine-grained timers, NewReno, and FACK. 17b87d8561SStephen Hemminger * o To avoid the performance penalty imposed by increasing cwnd 18b87d8561SStephen Hemminger * only every-other RTT during slow start, we increase during 19b87d8561SStephen Hemminger * every RTT during slow start, just like Reno. 20b87d8561SStephen Hemminger * o Largely to allow continuous cwnd growth during slow start, 21b87d8561SStephen Hemminger * we use the rate at which ACKs come back as the "actual" 22b87d8561SStephen Hemminger * rate, rather than the rate at which data is sent. 23b87d8561SStephen Hemminger * o To speed convergence to the right rate, we set the cwnd 24b87d8561SStephen Hemminger * to achieve the right ("actual") rate when we exit slow start. 25b87d8561SStephen Hemminger * o To filter out the noise caused by delayed ACKs, we use the 26b87d8561SStephen Hemminger * minimum RTT sample observed during the last RTT to calculate 27b87d8561SStephen Hemminger * the actual rate. 28b87d8561SStephen Hemminger * o When the sender re-starts from idle, it waits until it has 29b87d8561SStephen Hemminger * received ACKs for an entire flight of new data before making 30b87d8561SStephen Hemminger * a cwnd adjustment decision. The original Vegas implementation 31b87d8561SStephen Hemminger * assumed senders never went idle. 32b87d8561SStephen Hemminger */ 33b87d8561SStephen Hemminger 34b87d8561SStephen Hemminger #include <linux/mm.h> 35b87d8561SStephen Hemminger #include <linux/module.h> 36b87d8561SStephen Hemminger #include <linux/skbuff.h> 37a8c2190eSArnaldo Carvalho de Melo #include <linux/inet_diag.h> 38b87d8561SStephen Hemminger 39b87d8561SStephen Hemminger #include <net/tcp.h> 40b87d8561SStephen Hemminger 417752237eSStephen Hemminger #include "tcp_vegas.h" 427752237eSStephen Hemminger 438d3a564dSDoug Leith static int alpha = 2; 448d3a564dSDoug Leith static int beta = 4; 458d3a564dSDoug Leith static int gamma = 1; 46b87d8561SStephen Hemminger 47b87d8561SStephen Hemminger module_param(alpha, int, 0644); 488d3a564dSDoug Leith MODULE_PARM_DESC(alpha, "lower bound of packets in network"); 49b87d8561SStephen Hemminger module_param(beta, int, 0644); 508d3a564dSDoug Leith MODULE_PARM_DESC(beta, "upper bound of packets in network"); 51b87d8561SStephen Hemminger module_param(gamma, int, 0644); 52b87d8561SStephen Hemminger MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); 53b87d8561SStephen Hemminger 54b87d8561SStephen Hemminger 55b87d8561SStephen Hemminger /* There are several situations when we must "re-start" Vegas: 56b87d8561SStephen Hemminger * 57b87d8561SStephen Hemminger * o when a connection is established 58b87d8561SStephen Hemminger * o after an RTO 59b87d8561SStephen Hemminger * o after fast recovery 60b87d8561SStephen Hemminger * o when we send a packet and there is no outstanding 61b87d8561SStephen Hemminger * unacknowledged data (restarting an idle connection) 62b87d8561SStephen Hemminger * 63b87d8561SStephen Hemminger * In these circumstances we cannot do a Vegas calculation at the 64b87d8561SStephen Hemminger * end of the first RTT, because any calculation we do is using 65b87d8561SStephen Hemminger * stale info -- both the saved cwnd and congestion feedback are 66b87d8561SStephen Hemminger * stale. 67b87d8561SStephen Hemminger * 68b87d8561SStephen Hemminger * Instead we must wait until the completion of an RTT during 69b87d8561SStephen Hemminger * which we actually receive ACKs. 70b87d8561SStephen Hemminger */ 717752237eSStephen Hemminger static void vegas_enable(struct sock *sk) 72b87d8561SStephen Hemminger { 736687e988SArnaldo Carvalho de Melo const struct tcp_sock *tp = tcp_sk(sk); 746687e988SArnaldo Carvalho de Melo struct vegas *vegas = inet_csk_ca(sk); 75b87d8561SStephen Hemminger 76b87d8561SStephen Hemminger /* Begin taking Vegas samples next time we send something. */ 77b87d8561SStephen Hemminger vegas->doing_vegas_now = 1; 78b87d8561SStephen Hemminger 79b87d8561SStephen Hemminger /* Set the beginning of the next send window. */ 80b87d8561SStephen Hemminger vegas->beg_snd_nxt = tp->snd_nxt; 81b87d8561SStephen Hemminger 82b87d8561SStephen Hemminger vegas->cntRTT = 0; 83b87d8561SStephen Hemminger vegas->minRTT = 0x7fffffff; 84b87d8561SStephen Hemminger } 85b87d8561SStephen Hemminger 86b87d8561SStephen Hemminger /* Stop taking Vegas samples for now. */ 876687e988SArnaldo Carvalho de Melo static inline void vegas_disable(struct sock *sk) 88b87d8561SStephen Hemminger { 896687e988SArnaldo Carvalho de Melo struct vegas *vegas = inet_csk_ca(sk); 90b87d8561SStephen Hemminger 91b87d8561SStephen Hemminger vegas->doing_vegas_now = 0; 92b87d8561SStephen Hemminger } 93b87d8561SStephen Hemminger 947752237eSStephen Hemminger void tcp_vegas_init(struct sock *sk) 95b87d8561SStephen Hemminger { 966687e988SArnaldo Carvalho de Melo struct vegas *vegas = inet_csk_ca(sk); 97b87d8561SStephen Hemminger 98b87d8561SStephen Hemminger vegas->baseRTT = 0x7fffffff; 996687e988SArnaldo Carvalho de Melo vegas_enable(sk); 100b87d8561SStephen Hemminger } 1017752237eSStephen Hemminger EXPORT_SYMBOL_GPL(tcp_vegas_init); 102b87d8561SStephen Hemminger 103b87d8561SStephen Hemminger /* Do RTT sampling needed for Vegas. 104b87d8561SStephen Hemminger * Basically we: 105b87d8561SStephen Hemminger * o min-filter RTT samples from within an RTT to get the current 106b87d8561SStephen Hemminger * propagation delay + queuing delay (we are min-filtering to try to 107b87d8561SStephen Hemminger * avoid the effects of delayed ACKs) 108b87d8561SStephen Hemminger * o min-filter RTT samples from a much longer window (forever for now) 109b87d8561SStephen Hemminger * to find the propagation delay (baseRTT) 110b87d8561SStephen Hemminger */ 11130cfd0baSStephen Hemminger void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us) 112b87d8561SStephen Hemminger { 1136687e988SArnaldo Carvalho de Melo struct vegas *vegas = inet_csk_ca(sk); 114164891aaSStephen Hemminger u32 vrtt; 115164891aaSStephen Hemminger 11630cfd0baSStephen Hemminger if (rtt_us < 0) 117b9ce204fSIlpo Järvinen return; 118b9ce204fSIlpo Järvinen 119164891aaSStephen Hemminger /* Never allow zero rtt or baseRTT */ 12030cfd0baSStephen Hemminger vrtt = rtt_us + 1; 121b87d8561SStephen Hemminger 122b87d8561SStephen Hemminger /* Filter to find propagation delay: */ 123b87d8561SStephen Hemminger if (vrtt < vegas->baseRTT) 124b87d8561SStephen Hemminger vegas->baseRTT = vrtt; 125b87d8561SStephen Hemminger 126b87d8561SStephen Hemminger /* Find the min RTT during the last RTT to find 127b87d8561SStephen Hemminger * the current prop. delay + queuing delay: 128b87d8561SStephen Hemminger */ 129b87d8561SStephen Hemminger vegas->minRTT = min(vegas->minRTT, vrtt); 130b87d8561SStephen Hemminger vegas->cntRTT++; 131b87d8561SStephen Hemminger } 1327752237eSStephen Hemminger EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked); 133b87d8561SStephen Hemminger 1347752237eSStephen Hemminger void tcp_vegas_state(struct sock *sk, u8 ca_state) 135b87d8561SStephen Hemminger { 136b87d8561SStephen Hemminger 137b87d8561SStephen Hemminger if (ca_state == TCP_CA_Open) 1386687e988SArnaldo Carvalho de Melo vegas_enable(sk); 139b87d8561SStephen Hemminger else 1406687e988SArnaldo Carvalho de Melo vegas_disable(sk); 141b87d8561SStephen Hemminger } 1427752237eSStephen Hemminger EXPORT_SYMBOL_GPL(tcp_vegas_state); 143b87d8561SStephen Hemminger 144b87d8561SStephen Hemminger /* 145b87d8561SStephen Hemminger * If the connection is idle and we are restarting, 146b87d8561SStephen Hemminger * then we don't want to do any Vegas calculations 147b87d8561SStephen Hemminger * until we get fresh RTT samples. So when we 148b87d8561SStephen Hemminger * restart, we reset our Vegas state to a clean 149b87d8561SStephen Hemminger * slate. After we get acks for this flight of 150b87d8561SStephen Hemminger * packets, _then_ we can make Vegas calculations 151b87d8561SStephen Hemminger * again. 152b87d8561SStephen Hemminger */ 1537752237eSStephen Hemminger void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) 154b87d8561SStephen Hemminger { 155b87d8561SStephen Hemminger if (event == CA_EVENT_CWND_RESTART || 156b87d8561SStephen Hemminger event == CA_EVENT_TX_START) 1576687e988SArnaldo Carvalho de Melo tcp_vegas_init(sk); 158b87d8561SStephen Hemminger } 1597752237eSStephen Hemminger EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); 160b87d8561SStephen Hemminger 161c80a5cdfSDoug Leith static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) 162c80a5cdfSDoug Leith { 163c80a5cdfSDoug Leith return min(tp->snd_ssthresh, tp->snd_cwnd-1); 164c80a5cdfSDoug Leith } 165c80a5cdfSDoug Leith 16624901551SEric Dumazet static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) 167b87d8561SStephen Hemminger { 1686687e988SArnaldo Carvalho de Melo struct tcp_sock *tp = tcp_sk(sk); 1696687e988SArnaldo Carvalho de Melo struct vegas *vegas = inet_csk_ca(sk); 170b87d8561SStephen Hemminger 171ab59859dSHarvey Harrison if (!vegas->doing_vegas_now) { 17224901551SEric Dumazet tcp_reno_cong_avoid(sk, ack, acked); 173ab59859dSHarvey Harrison return; 174ab59859dSHarvey Harrison } 175b87d8561SStephen Hemminger 176b87d8561SStephen Hemminger if (after(ack, vegas->beg_snd_nxt)) { 177b87d8561SStephen Hemminger /* Do the Vegas once-per-RTT cwnd adjustment. */ 178b87d8561SStephen Hemminger 179b87d8561SStephen Hemminger /* Save the extent of the current window so we can use this 180b87d8561SStephen Hemminger * at the end of the next RTT. 181b87d8561SStephen Hemminger */ 182b87d8561SStephen Hemminger vegas->beg_snd_nxt = tp->snd_nxt; 183b87d8561SStephen Hemminger 184b87d8561SStephen Hemminger /* We do the Vegas calculations only if we got enough RTT 185b87d8561SStephen Hemminger * samples that we can be reasonably sure that we got 186b87d8561SStephen Hemminger * at least one RTT sample that wasn't from a delayed ACK. 187b87d8561SStephen Hemminger * If we only had 2 samples total, 188b87d8561SStephen Hemminger * then that means we're getting only 1 ACK per RTT, which 189b87d8561SStephen Hemminger * means they're almost certainly delayed ACKs. 190b87d8561SStephen Hemminger * If we have 3 samples, we should be OK. 191b87d8561SStephen Hemminger */ 192b87d8561SStephen Hemminger 193b87d8561SStephen Hemminger if (vegas->cntRTT <= 2) { 194b87d8561SStephen Hemminger /* We don't have enough RTT samples to do the Vegas 195b87d8561SStephen Hemminger * calculation, so we'll behave like Reno. 196b87d8561SStephen Hemminger */ 19724901551SEric Dumazet tcp_reno_cong_avoid(sk, ack, acked); 198b87d8561SStephen Hemminger } else { 19915913114SLachlan Andrew u32 rtt, diff; 20015913114SLachlan Andrew u64 target_cwnd; 201b87d8561SStephen Hemminger 202b87d8561SStephen Hemminger /* We have enough RTT samples, so, using the Vegas 203b87d8561SStephen Hemminger * algorithm, we determine if we should increase or 204b87d8561SStephen Hemminger * decrease cwnd, and by how much. 205b87d8561SStephen Hemminger */ 206b87d8561SStephen Hemminger 207b87d8561SStephen Hemminger /* Pluck out the RTT we are using for the Vegas 208b87d8561SStephen Hemminger * calculations. This is the min RTT seen during the 209b87d8561SStephen Hemminger * last RTT. Taking the min filters out the effects 210b87d8561SStephen Hemminger * of delayed ACKs, at the cost of noticing congestion 211b87d8561SStephen Hemminger * a bit later. 212b87d8561SStephen Hemminger */ 213b87d8561SStephen Hemminger rtt = vegas->minRTT; 214b87d8561SStephen Hemminger 215b87d8561SStephen Hemminger /* Calculate the cwnd we should have, if we weren't 216b87d8561SStephen Hemminger * going too fast. 217b87d8561SStephen Hemminger * 218b87d8561SStephen Hemminger * This is: 219b87d8561SStephen Hemminger * (actual rate in segments) * baseRTT 220b87d8561SStephen Hemminger */ 221*1f74e613SChristoph Paasch target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT; 222*1f74e613SChristoph Paasch do_div(target_cwnd, rtt); 223b87d8561SStephen Hemminger 224b87d8561SStephen Hemminger /* Calculate the difference between the window we had, 225b87d8561SStephen Hemminger * and the window we would like to have. This quantity 226b87d8561SStephen Hemminger * is the "Diff" from the Arizona Vegas papers. 227b87d8561SStephen Hemminger */ 2288d3a564dSDoug Leith diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT; 229b87d8561SStephen Hemminger 230c80a5cdfSDoug Leith if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) { 231b87d8561SStephen Hemminger /* Going too fast. Time to slow down 232b87d8561SStephen Hemminger * and switch to congestion avoidance. 233b87d8561SStephen Hemminger */ 234b87d8561SStephen Hemminger 235b87d8561SStephen Hemminger /* Set cwnd to match the actual rate 236b87d8561SStephen Hemminger * exactly: 237b87d8561SStephen Hemminger * cwnd = (actual rate) * baseRTT 238b87d8561SStephen Hemminger * Then we add 1 because the integer 239b87d8561SStephen Hemminger * truncation robs us of full link 240b87d8561SStephen Hemminger * utilization. 241b87d8561SStephen Hemminger */ 2428d3a564dSDoug Leith tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); 243c80a5cdfSDoug Leith tp->snd_ssthresh = tcp_vegas_ssthresh(tp); 244b87d8561SStephen Hemminger 245c940587bSXiaoliang (David) Wei } else if (tp->snd_cwnd <= tp->snd_ssthresh) { 246c940587bSXiaoliang (David) Wei /* Slow start. */ 2479f9843a7SYuchung Cheng tcp_slow_start(tp, acked); 248b87d8561SStephen Hemminger } else { 249b87d8561SStephen Hemminger /* Congestion avoidance. */ 250b87d8561SStephen Hemminger 251b87d8561SStephen Hemminger /* Figure out where we would like cwnd 252b87d8561SStephen Hemminger * to be. 253b87d8561SStephen Hemminger */ 254b87d8561SStephen Hemminger if (diff > beta) { 255b87d8561SStephen Hemminger /* The old window was too fast, so 256b87d8561SStephen Hemminger * we slow down. 257b87d8561SStephen Hemminger */ 2588d3a564dSDoug Leith tp->snd_cwnd--; 259c80a5cdfSDoug Leith tp->snd_ssthresh 260c80a5cdfSDoug Leith = tcp_vegas_ssthresh(tp); 261b87d8561SStephen Hemminger } else if (diff < alpha) { 262b87d8561SStephen Hemminger /* We don't have enough extra packets 263b87d8561SStephen Hemminger * in the network, so speed up. 264b87d8561SStephen Hemminger */ 2658d3a564dSDoug Leith tp->snd_cwnd++; 266b87d8561SStephen Hemminger } else { 267b87d8561SStephen Hemminger /* Sending just as fast as we 268b87d8561SStephen Hemminger * should be. 269b87d8561SStephen Hemminger */ 270b87d8561SStephen Hemminger } 271b87d8561SStephen Hemminger } 2727faffa1cSStephen Hemminger 2737faffa1cSStephen Hemminger if (tp->snd_cwnd < 2) 2747faffa1cSStephen Hemminger tp->snd_cwnd = 2; 2757faffa1cSStephen Hemminger else if (tp->snd_cwnd > tp->snd_cwnd_clamp) 2767faffa1cSStephen Hemminger tp->snd_cwnd = tp->snd_cwnd_clamp; 277a6af2d6bSDoug Leith 278a6af2d6bSDoug Leith tp->snd_ssthresh = tcp_current_ssthresh(sk); 2797faffa1cSStephen Hemminger } 280b87d8561SStephen Hemminger 281b87d8561SStephen Hemminger /* Wipe the slate clean for the next RTT. */ 282b87d8561SStephen Hemminger vegas->cntRTT = 0; 283b87d8561SStephen Hemminger vegas->minRTT = 0x7fffffff; 284b87d8561SStephen Hemminger } 28574cb8798SThomas Young /* Use normal slow start */ 28674cb8798SThomas Young else if (tp->snd_cwnd <= tp->snd_ssthresh) 2879f9843a7SYuchung Cheng tcp_slow_start(tp, acked); 28874cb8798SThomas Young 2895b495613SThomas Young } 290b87d8561SStephen Hemminger 291b87d8561SStephen Hemminger /* Extract info for Tcp socket info provided via netlink. */ 2927752237eSStephen Hemminger void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) 293b87d8561SStephen Hemminger { 2946687e988SArnaldo Carvalho de Melo const struct vegas *ca = inet_csk_ca(sk); 29573c1f4a0SArnaldo Carvalho de Melo if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 296e9195d67SThomas Graf struct tcpvegas_info info = { 297e9195d67SThomas Graf .tcpv_enabled = ca->doing_vegas_now, 298e9195d67SThomas Graf .tcpv_rttcnt = ca->cntRTT, 299e9195d67SThomas Graf .tcpv_rtt = ca->baseRTT, 300e9195d67SThomas Graf .tcpv_minrtt = ca->minRTT, 301e9195d67SThomas Graf }; 302b87d8561SStephen Hemminger 303e9195d67SThomas Graf nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); 304b87d8561SStephen Hemminger } 305b87d8561SStephen Hemminger } 3067752237eSStephen Hemminger EXPORT_SYMBOL_GPL(tcp_vegas_get_info); 307b87d8561SStephen Hemminger 308a252bebeSStephen Hemminger static struct tcp_congestion_ops tcp_vegas __read_mostly = { 309b87d8561SStephen Hemminger .init = tcp_vegas_init, 310b87d8561SStephen Hemminger .ssthresh = tcp_reno_ssthresh, 311b87d8561SStephen Hemminger .cong_avoid = tcp_vegas_cong_avoid, 312164891aaSStephen Hemminger .pkts_acked = tcp_vegas_pkts_acked, 313b87d8561SStephen Hemminger .set_state = tcp_vegas_state, 314b87d8561SStephen Hemminger .cwnd_event = tcp_vegas_cwnd_event, 315b87d8561SStephen Hemminger .get_info = tcp_vegas_get_info, 316b87d8561SStephen Hemminger 317b87d8561SStephen Hemminger .owner = THIS_MODULE, 318b87d8561SStephen Hemminger .name = "vegas", 319b87d8561SStephen Hemminger }; 320b87d8561SStephen Hemminger 321b87d8561SStephen Hemminger static int __init tcp_vegas_register(void) 322b87d8561SStephen Hemminger { 32374975d40SAlexey Dobriyan BUILD_BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE); 324b87d8561SStephen Hemminger tcp_register_congestion_control(&tcp_vegas); 325b87d8561SStephen Hemminger return 0; 326b87d8561SStephen Hemminger } 327b87d8561SStephen Hemminger 328b87d8561SStephen Hemminger static void __exit tcp_vegas_unregister(void) 329b87d8561SStephen Hemminger { 330b87d8561SStephen Hemminger tcp_unregister_congestion_control(&tcp_vegas); 331b87d8561SStephen Hemminger } 332b87d8561SStephen Hemminger 333b87d8561SStephen Hemminger module_init(tcp_vegas_register); 334b87d8561SStephen Hemminger module_exit(tcp_vegas_unregister); 335b87d8561SStephen Hemminger 336b87d8561SStephen Hemminger MODULE_AUTHOR("Stephen Hemminger"); 337b87d8561SStephen Hemminger MODULE_LICENSE("GPL"); 338b87d8561SStephen Hemminger MODULE_DESCRIPTION("TCP Vegas"); 339