/*-
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley. More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */
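
/*
 * For reference, a sketch of the I-D's window equations as used below. The
 * fixed-point implementations live in <netinet/cc/cc_cubic.h> as
 * cubic_cwnd(), tf_cwnd() and cubic_k(); roughly, in the I-D's notation:
 *
 *   W_cubic(t) = C * (t - K)^3 + wmax
 *   K          = cbrt(wmax * beta / C)
 *   W_tf(t)    = wmax * (1 - beta) + 3 * (beta / (2 - beta)) * (t / RTT)
 *
 * where wmax is the congestion window at the last congestion event, t is
 * the time since that event, beta (~0.2) is the fraction of the window
 * shed at a congestion event and C (~0.4) scales the cubic growth. W_tf()
 * estimates the window a standard AIMD flow would have rebuilt over the
 * same interval and serves as the "TCP-friendly" lower bound on growth.
 * The kernel helpers work in bytes and fixed point rather than segments
 * and floating point, but implement the same curves.
 */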

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/vnet.h>

#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_cubic.h>
#include <netinet/cc/cc_module.h>

static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void	cubic_cb_destroy(struct cc_var *ccv);
static int	cubic_cb_init(struct cc_var *ccv);
static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void	cubic_conn_init(struct cc_var *ccv);
static int	cubic_mod_init(void);
static void	cubic_post_recovery(struct cc_var *ccv);
static void	cubic_record_rtt(struct cc_var *ccv);
static void	cubic_ssthresh_update(struct cc_var *ccv);

struct cubic {
	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in ticks. */
	int64_t		sum_rtt_ticks;
	/* cwnd at the most recent congestion event. */
	unsigned long	max_cwnd;
	/* cwnd at the previous congestion event. */
	unsigned long	prev_max_cwnd;
	/* Number of congestion events. */
	uint32_t	num_cong_events;
	/* Minimum observed rtt in ticks. */
	int		min_rtt_ticks;
	/* Mean observed rtt between congestion epochs. */
	int		mean_rtt_ticks;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Time of last congestion event in ticks. */
	int		t_last_cong;
};

static MALLOC_DEFINE(M_CUBIC, "cubic data",
    "Per connection data required for the CUBIC congestion control algorithm");

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.mod_init = cubic_mod_init,
	.post_recovery = cubic_post_recovery,
};

static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	unsigned long w_tf, w_cubic_next;
	int ticks_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	/*
	 * Regular ACK and we're not in cong/fast recovery and we're cwnd
	 * limited and we're either not doing ABC or are slow starting or are
	 * doing ABC and we've sent a cwnd's worth of bytes.
	 */
	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!V_tcp_do_rfc3465 ||
	    CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
	    (V_tcp_do_rfc3465 && ccv->flags & CCF_ABC_SENTAWND))) {
		/* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE)
			newreno_cc_algo.ack_received(ccv, type);
		else {
			ticks_since_cong = ticks - cubic_data->t_last_cong;

			/*
			 * The mean RTT is used to best reflect the equations in
			 * the I-D. Using min_rtt in the tf_cwnd calculation
			 * causes w_tf to grow much faster than it should if the
			 * RTT is dominated by network buffering rather than
			 * propagation delay.
			 */
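			/*
			 * A sketch of what the two estimates below give us
			 * (fixed-point implementations in cc_cubic.h):
			 * w_tf approximates the cwnd a NewReno-style flow
			 * would have rebuilt in the ticks_since_cong since
			 * the last congestion event, while w_cubic_next
			 * evaluates C * (t - K)^3 + max_cwnd at one mean RTT
			 * into the future, i.e. roughly the window we want
			 * to have built by the time ACKs for data sent now
			 * return.
			 */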
			w_tf = tf_cwnd(ticks_since_cong,
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg));

			w_cubic_next = cubic_cwnd(ticks_since_cong +
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

			if (w_cubic_next < w_tf)
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				CCV(ccv, snd_cwnd) = w_tf;
			else if (CCV(ccv, snd_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 */
				if (V_tcp_do_rfc3465)
					CCV(ccv, snd_cwnd) = w_cubic_next;
				else
					CCV(ccv, snd_cwnd) += ((w_cubic_next -
					    CCV(ccv, snd_cwnd)) *
					    CCV(ccv, t_maxseg)) /
					    CCV(ccv, snd_cwnd);
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when hostcache has a relevant entry),
			 * keep updating our current estimate of the
			 * max_cwnd.
			 */
			if (cubic_data->num_cong_events == 0 &&
			    cubic_data->max_cwnd < CCV(ccv, snd_cwnd))
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
		}
	}
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{

	if (ccv->cc_data != NULL)
		free(ccv->cc_data, M_CUBIC);
}

static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = malloc(sizeof(struct cubic), M_CUBIC, M_NOWAIT|M_ZERO);

	if (cubic_data == NULL)
		return (ENOMEM);

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = ticks;
	cubic_data->min_rtt_ticks = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_ticks = 1;

	ccv->cc_data = cubic_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;
	uint32_t cwin;
	u_int mss;

	cubic_data = ccv->cc_data;
	cwin = CCV(ccv, snd_cwnd);
	mss = CCV(ccv, t_maxseg);

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
				cubic_ssthresh_update(ccv);
				cubic_data->num_cong_events++;
				cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
				cubic_data->max_cwnd = cwin;
				CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
			}
			ENTER_RECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
			cubic_ssthresh_update(ccv);
			cubic_data->num_cong_events++;
			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
			cubic_data->max_cwnd = cwin;
			cubic_data->t_last_cong = ticks;
			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_RTO:
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was. Only record it when the
		 * timeout has fired more than once, as there is a reasonable
		 * chance the first one is a false alarm and may not indicate
		 * congestion.
		 */
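		/*
		 * Note on the check below: t_rxtshift counts consecutive
		 * retransmit timeouts for the connection (it is reset on a
		 * valid ACK), so a value >= 2 means the RTO timer has fired
		 * at least twice since the last valid ACK.
		 */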
		if (CCV(ccv, t_rxtshift) >= 2) {
			cubic_data->num_cong_events++;
			cubic_data->t_last_cong = ticks;
			cubic_ssthresh_update(ccv);
			cubic_data->max_cwnd = cwin;
			CCV(ccv, snd_cwnd) = mss;
		}
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded. Without
	 * it, bad things happen when entries from the TCP hostcache are
	 * used.
	 */
	cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
}

static int
cubic_mod_init(void)
{

	cubic_cc_algo.after_idle = newreno_cc_algo.after_idle;

	return (0);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int pipe;

	cubic_data = ccv->cc_data;
	pipe = 0;

	/* Fast convergence heuristic. */
	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd)
		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
		    >> CUBIC_SHIFT;

	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in
		 * the NewReno RFC. Otherwise, use the CUBIC method.
		 *
		 * XXXLAS: Find a way to do this without needing curack
		 */
		if (V_tcp_do_rfc6675_pipe)
			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
		else
			pipe = CCV(ccv, snd_max) - ccv->curack;

		if (pipe < CCV(ccv, snd_ssthresh))
			CCV(ccv, snd_cwnd) = pipe + CCV(ccv, t_maxseg);
		else
			/* Update cwnd based on beta and adjusted max_cwnd. */
			CCV(ccv, snd_cwnd) = max(1, ((CUBIC_BETA *
			    cubic_data->max_cwnd) >> CUBIC_SHIFT));
	}
	cubic_data->t_last_cong = ticks;

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_ticks >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_ticks = (int)(cubic_data->sum_rtt_ticks /
		    cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_ticks = 0;
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg));
}
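
/*
 * A worked example of the cubic_k() call above, assuming the I-D's constants
 * (C ~= 0.4, beta ~= 0.2) that cc_cubic.h encodes in fixed point: for a
 * max_cwnd of 100 segments, K = cbrt(100 * 0.2 / 0.4) ~= 3.7, i.e. the cubic
 * function W_cubic(t) plateaus back at the old max_cwnd roughly 3.7 seconds
 * after the congestion event, independent of the path RTT.
 */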

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_ticks;

	/* Ignore srtt until a min number of samples have been taken. */
	if (CCV(ccv, t_rttupdated) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		t_srtt_ticks = CCV(ccv, t_srtt) / TCP_RTT_SCALE;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialised
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_ticks < cubic_data->min_rtt_ticks ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE)) {
			cubic_data->min_rtt_ticks = max(1, t_srtt_ticks);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_ticks with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_ticks >
			    cubic_data->mean_rtt_ticks)
				cubic_data->mean_rtt_ticks =
				    cubic_data->min_rtt_ticks;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_ticks += t_srtt_ticks;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5; on
	 * subsequent congestion events, set it to cwnd * beta.
	 */
	if (cubic_data->num_cong_events == 0)
		CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd) >> 1;
	else
		CCV(ccv, snd_ssthresh) = ((u_long)CCV(ccv, snd_cwnd) *
		    CUBIC_BETA) >> CUBIC_SHIFT;
}

DECLARE_CC_MODULE(cubic, &cubic_cc_algo);
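
/*
 * Usage sketch: once this module is loaded (e.g. via "kldload cc_cubic"),
 * the cc(4) framework lists it in net.inet.tcp.cc.available; it can then be
 * made the system default with "sysctl net.inet.tcp.cc.algorithm=cubic" or
 * selected per socket via the TCP_CONGESTION socket option.
 */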