/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley.
 * More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/vnet.h>

#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_cubic.h>
#include <netinet/cc/cc_module.h>

static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void	cubic_cb_destroy(struct cc_var *ccv);
static int	cubic_cb_init(struct cc_var *ccv);
static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void	cubic_conn_init(struct cc_var *ccv);
static int	cubic_mod_init(void);
static void	cubic_post_recovery(struct cc_var *ccv);
static void	cubic_record_rtt(struct cc_var *ccv);
static void	cubic_ssthresh_update(struct cc_var *ccv);
static void	cubic_after_idle(struct cc_var *ccv);

struct cubic {
	/* CUBIC K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in ticks. */
	int64_t		sum_rtt_ticks;
	/* cwnd at the most recent congestion event. */
	unsigned long	max_cwnd;
	/* cwnd at the previous congestion event. */
	unsigned long	prev_max_cwnd;
	/* Various flags. */
	uint32_t	flags;
#define	CUBICFLAG_CONG_EVENT	0x00000001	/* congestion experienced */
#define	CUBICFLAG_IN_SLOWSTART	0x00000002	/* in slow start */
#define	CUBICFLAG_IN_APPLIMIT	0x00000004	/* application limited */
	/* Minimum observed rtt in ticks. */
	int		min_rtt_ticks;
	/* Mean observed rtt between congestion epochs. */
	int		mean_rtt_ticks;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Time of last congestion event in ticks. */
	int		t_last_cong;
};

static MALLOC_DEFINE(M_CUBIC, "cubic data",
    "Per connection data required for the CUBIC congestion control algorithm");

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.mod_init = cubic_mod_init,
	.post_recovery = cubic_post_recovery,
	.after_idle = cubic_after_idle,
};
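
/*
 * For reference when reading the functions below: cubic_k(), cubic_cwnd()
 * and tf_cwnd() (from cc_cubic.h) are fixed point implementations of the
 * I-D's window equations. In floating point, with wmax in segments and t in
 * seconds since the last congestion event, they correspond to:
 *
 *	K          = cbrt(wmax * (1 - beta) / C)
 *	W_cubic(t) = C * (t - K)^3 + wmax
 *
 * A minimal floating point sketch of K, illustrative only and not part of
 * this module (assumes the I-D's C = 0.4):
 *
 *	#include <math.h>
 *
 *	static double
 *	cubic_k_flt(double wmax, double beta)
 *	{
 *		return (cbrt(wmax * (1.0 - beta) / 0.4));
 *	}
 */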
static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	unsigned long w_tf, w_cubic_next;
	int ticks_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	/*
	 * Grow cwnd on a regular ACK when we're not in congestion or fast
	 * recovery, we're cwnd limited, and we're either not doing ABC,
	 * still in slow start, or doing ABC and have sent a cwnd's worth
	 * of bytes.
	 */
	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!V_tcp_do_rfc3465 ||
	    CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
	    (V_tcp_do_rfc3465 && ccv->flags & CCF_ABC_SENTAWND))) {
		/* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE) {
			cubic_data->flags |= CUBICFLAG_IN_SLOWSTART;
			newreno_cc_algo.ack_received(ccv, type);
		} else {
			if ((ticks_since_cong =
			    ticks - cubic_data->t_last_cong) < 0) {
				/*
				 * The ticks counter wrapped; clamp the
				 * elapsed time and drag t_last_cong along
				 * so later calculations stay positive.
				 */
				ticks_since_cong = INT_MAX;
				cubic_data->t_last_cong = ticks - INT_MAX;
			}

			if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART |
			    CUBICFLAG_IN_APPLIMIT)) {
				cubic_data->flags &= ~(CUBICFLAG_IN_SLOWSTART |
				    CUBICFLAG_IN_APPLIMIT);
				cubic_data->t_last_cong = ticks;
				cubic_data->K = 0;
			}
			/*
			 * The mean RTT is used to best reflect the equations
			 * in the I-D. Using min_rtt in the tf_cwnd
			 * calculation causes w_tf to grow much faster than it
			 * should if the RTT is dominated by network buffering
			 * rather than propagation delay.
			 */
			w_tf = tf_cwnd(ticks_since_cong,
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg));

			w_cubic_next = cubic_cwnd(ticks_since_cong +
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

			if (w_cubic_next < w_tf) {
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				if (CCV(ccv, snd_cwnd) < w_tf)
					CCV(ccv, snd_cwnd) = ulmin(w_tf,
					    INT_MAX);
			} else if (CCV(ccv, snd_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 * Only update snd_cwnd if it doesn't shrink.
				 */
				if (V_tcp_do_rfc3465)
					CCV(ccv, snd_cwnd) = ulmin(w_cubic_next,
					    INT_MAX);
				else
					CCV(ccv, snd_cwnd) += ulmax(1,
					    ((ulmin(w_cubic_next, INT_MAX) -
					    CCV(ccv, snd_cwnd)) *
					    CCV(ccv, t_maxseg)) /
					    CCV(ccv, snd_cwnd));
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when the hostcache has a relevant entry),
			 * keep updating our current estimate of the
			 * max_cwnd.
			 */
			if (((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) &&
			    cubic_data->max_cwnd < CCV(ccv, snd_cwnd)) {
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
				cubic_data->K = cubic_k(cubic_data->max_cwnd /
				    CCV(ccv, t_maxseg));
			}
		}
	} else if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
	    !(ccv->flags & CCF_CWND_LIMITED)) {
		cubic_data->flags |= CUBICFLAG_IN_APPLIMIT;
	}
}
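
/*
 * For reference: w_tf above estimates the window a NewReno-style AIMD flow
 * would have reached over the same interval, per the TCP-friendly equation
 * in the I-D. In segments, with t the time since the last congestion event:
 *
 *	W_tcp(t) = wmax * beta + (3 * (1 - beta) / (1 + beta)) * t / RTT
 *
 * Following w_tf whenever w_cubic_next falls below it keeps CUBIC at least
 * as aggressive as standard TCP on short-RTT paths.
 */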
/*
 * This is a CUBIC specific implementation of after_idle.
 *   - Reset cwnd by calling the NewReno implementation of after_idle.
 *   - Reset t_last_cong.
 */
static void
cubic_after_idle(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	cubic_data->max_cwnd = ulmax(cubic_data->max_cwnd, CCV(ccv, snd_cwnd));
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg));

	newreno_cc_algo.after_idle(ccv);
	cubic_data->t_last_cong = ticks;
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{
	free(ccv->cc_data, M_CUBIC);
}

static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = malloc(sizeof(struct cubic), M_CUBIC, M_NOWAIT|M_ZERO);

	if (cubic_data == NULL)
		return (ENOMEM);

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = ticks;
	cubic_data->min_rtt_ticks = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_ticks = 1;

	ccv->cc_data = cubic_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
				cubic_ssthresh_update(ccv);
				cubic_data->flags |= CUBICFLAG_CONG_EVENT;
				cubic_data->prev_max_cwnd =
				    cubic_data->max_cwnd;
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
				cubic_data->K = cubic_k(cubic_data->max_cwnd /
				    CCV(ccv, t_maxseg));
			}
			ENTER_RECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
			cubic_ssthresh_update(ccv);
			cubic_data->flags |= CUBICFLAG_CONG_EVENT;
			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
			cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
			cubic_data->t_last_cong = ticks;
			cubic_data->K = cubic_k(cubic_data->max_cwnd /
			    CCV(ccv, t_maxseg));
			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_RTO:
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was. Only record it when the
		 * timeout has fired more than once, as there is a reasonable
		 * chance the first one is a false alarm and may not indicate
		 * congestion.
		 * This will put CUBIC firmly into the concave / TCP-friendly
		 * region, for a slower ramp-up after two consecutive RTOs.
		 */
		if (CCV(ccv, t_rxtshift) >= 2) {
			cubic_data->flags |= CUBICFLAG_CONG_EVENT;
			cubic_data->t_last_cong = ticks;
			cubic_data->max_cwnd = CCV(ccv, snd_cwnd_prev);
			cubic_data->K = cubic_k(cubic_data->max_cwnd /
			    CCV(ccv, t_maxseg));
		}
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded. Without
	 * it, bad things happen when entries from the TCP hostcache get used.
	 */
	cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
}

static int
cubic_mod_init(void)
{
	return (0);
}
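
/*
 * For reference when reading cubic_post_recovery() and
 * cubic_ssthresh_update() below: after a congestion event the window is
 * deflated multiplicatively, and the fast convergence heuristic further
 * reduces the recorded max_cwnd when the congestion point appears to be
 * moving down (max_cwnd < prev_max_cwnd). Roughly:
 *
 *	cwnd     = max(max_cwnd * beta, 2 * MSS)
 *	max_cwnd = max_cwnd * (1 + beta) / 2	(fast convergence)
 *
 * where beta and (1 + beta) / 2 correspond to the fixed point constants
 * CUBIC_BETA and CUBIC_FC_FACTOR from cc_cubic.h.
 */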
/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int pipe;

	cubic_data = ccv->cc_data;
	pipe = 0;

	/* Fast convergence heuristic. */
	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd)
		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
		    >> CUBIC_SHIFT;

	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in
		 * the NewReno RFC. Otherwise, use the CUBIC method.
		 *
		 * XXXLAS: Find a way to do this without needing curack
		 */
		if (V_tcp_do_rfc6675_pipe)
			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
		else
			pipe = CCV(ccv, snd_max) - ccv->curack;

		if (pipe < CCV(ccv, snd_ssthresh))
			/*
			 * Ensure that cwnd does not collapse to 1 MSS under
			 * adverse conditions. Implements the cwnd deflation
			 * from RFC 6582 (NewReno).
			 */
			CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
			    CCV(ccv, t_maxseg);
		else
			/* Update cwnd based on beta and adjusted max_cwnd. */
			CCV(ccv, snd_cwnd) = max(((uint64_t)cubic_data->max_cwnd *
			    CUBIC_BETA) >> CUBIC_SHIFT,
			    2 * CCV(ccv, t_maxseg));
	}
	cubic_data->t_last_cong = ticks;

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_ticks >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_ticks = (int)(cubic_data->sum_rtt_ticks /
		    cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_ticks = 0;
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg));
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_ticks;

	/* Ignore srtt until a min number of samples have been taken. */
	if (CCV(ccv, t_rttupdated) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		t_srtt_ticks = CCV(ccv, t_srtt) / TCP_RTT_SCALE;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialised
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_ticks < cubic_data->min_rtt_ticks ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE)) {
			cubic_data->min_rtt_ticks = max(1, t_srtt_ticks);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_ticks with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_ticks >
			    cubic_data->mean_rtt_ticks)
				cubic_data->mean_rtt_ticks =
				    cubic_data->min_rtt_ticks;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_ticks += t_srtt_ticks;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	uint32_t ssthresh;

	cubic_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5; on
	 * subsequent congestion events, set it to cwnd * beta.
	 */
	if ((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0)
		ssthresh = CCV(ccv, snd_cwnd) >> 1;
	else
		ssthresh = ((uint64_t)CCV(ccv, snd_cwnd) *
		    CUBIC_BETA) >> CUBIC_SHIFT;
	CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * CCV(ccv, t_maxseg));
}

DECLARE_CC_MODULE(cubic, &cubic_cc_algo);
MODULE_VERSION(cubic, 1);
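
/*
 * Usage sketch: with this module loaded (e.g. "kldload cc_cubic"), CUBIC
 * can be selected as the default congestion control algorithm via
 * "sysctl net.inet.tcp.cc.algorithm=cubic"; the algorithms currently
 * available are listed by "sysctl net.inet.tcp.cc.available".
 */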