/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley.
 * More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/vnet.h>

#include <net/route.h>
#include <net/route/nhop.h>

#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_cubic.h>
#include <netinet/cc/cc_module.h>

static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void	cubic_cb_destroy(struct cc_var *ccv);
static int	cubic_cb_init(struct cc_var *ccv, void *ptr);
static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void	cubic_conn_init(struct cc_var *ccv);
static int	cubic_mod_init(void);
static void	cubic_post_recovery(struct cc_var *ccv);
static void	cubic_record_rtt(struct cc_var *ccv);
static void	cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg);
static void	cubic_after_idle(struct cc_var *ccv);
static size_t	cubic_data_sz(void);

struct cubic {
	/* CUBIC K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in ticks. */
	int64_t		sum_rtt_ticks;
	/* cwnd at the most recent congestion event. */
	unsigned long	max_cwnd;
	/* cwnd at the previous congestion event. */
	unsigned long	prev_max_cwnd;
	/* A copy of prev_max_cwnd. Used for CC_RTO_ERR. */
	unsigned long	prev_max_cwnd_cp;
	/* Various flags. */
	uint32_t	flags;
#define	CUBICFLAG_CONG_EVENT	0x00000001	/* congestion experienced */
#define	CUBICFLAG_IN_SLOWSTART	0x00000002	/* in slow start */
#define	CUBICFLAG_IN_APPLIMIT	0x00000004	/* application limited */
#define	CUBICFLAG_RTO_EVENT	0x00000008	/* RTO experienced */
	/* Minimum observed rtt in ticks. */
	int		min_rtt_ticks;
	/* Mean observed rtt between congestion epochs. */
	int		mean_rtt_ticks;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/*
	 * Timestamp (in ticks) of arriving in congestion avoidance from the
	 * last congestion event.
	 */
	int		t_last_cong;
	/*
	 * Timestamp (in ticks) of a previous congestion event. Used for
	 * CC_RTO_ERR.
	 */
	int		t_last_cong_prev;
};

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.mod_init = cubic_mod_init,
	.post_recovery = cubic_post_recovery,
	.after_idle = cubic_after_idle,
	.cc_data_sz = cubic_data_sz
};

static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	unsigned long w_tf, w_cubic_next;
	int ticks_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	/*
	 * For a regular ACK, when we're not in congestion/fast recovery and
	 * we're cwnd limited, always recalculate cwnd.
	 */
	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
	    (ccv->flags & CCF_CWND_LIMITED)) {
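		/*
		 * For reference, the window growth functions used below
		 * follow the I-D (and later RFC 8312) in roughly this form:
		 *   W_cubic(t) = C * (t - K)^3 + W_max
		 *   W_est(t)   = W_max * beta +
		 *                [3 * (1 - beta) / (1 + beta)] * (t / RTT)
		 * The fixed-point constants for C (~0.4) and beta (~0.7) are
		 * defined in cc_cubic.h.
		 */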
		/*
		 * Use the logic in NewReno ack_received() for slow start.
		 */
		if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE) {
			cubic_data->flags |= CUBICFLAG_IN_SLOWSTART;
			newreno_cc_ack_received(ccv, type);
		} else {
			if ((cubic_data->flags & CUBICFLAG_RTO_EVENT) &&
			    (cubic_data->flags & CUBICFLAG_IN_SLOWSTART)) {
				/* RFC8312 Section 4.7 */
				cubic_data->flags &= ~(CUBICFLAG_RTO_EVENT |
				    CUBICFLAG_IN_SLOWSTART);
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
				cubic_data->K = 0;
			} else if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART |
			    CUBICFLAG_IN_APPLIMIT)) {
				cubic_data->flags &= ~(CUBICFLAG_IN_SLOWSTART |
				    CUBICFLAG_IN_APPLIMIT);
				cubic_data->t_last_cong = ticks;
				cubic_data->K = cubic_k(cubic_data->max_cwnd /
				    CCV(ccv, t_maxseg));
			}
			if ((ticks_since_cong =
			    ticks - cubic_data->t_last_cong) < 0) {
				/*
				 * The ticks counter wrapped, making the time
				 * since the last congestion event negative.
				 * Clamp it and drag t_last_cong along.
				 */
				ticks_since_cong = INT_MAX;
				cubic_data->t_last_cong = ticks - INT_MAX;
			}
			/*
			 * The mean RTT is used to best reflect the equations in
			 * the I-D. Using min_rtt in the tf_cwnd calculation
			 * causes w_tf to grow much faster than it should if the
			 * RTT is dominated by network buffering rather than
			 * propagation delay.
			 */
			w_tf = tf_cwnd(ticks_since_cong,
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg));

			w_cubic_next = cubic_cwnd(ticks_since_cong +
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

			if (w_cubic_next < w_tf) {
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				if (CCV(ccv, snd_cwnd) < w_tf)
					CCV(ccv, snd_cwnd) = ulmin(w_tf, INT_MAX);
			} else if (CCV(ccv, snd_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 * Only update snd_cwnd if it doesn't shrink.
				 */
				CCV(ccv, snd_cwnd) = ulmin(w_cubic_next,
				    INT_MAX);
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when hostcache has a relevant entry),
			 * keep updating our current estimate of the
			 * max_cwnd.
			 */
			if (((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) &&
			    cubic_data->max_cwnd < CCV(ccv, snd_cwnd)) {
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
				cubic_data->K = cubic_k(cubic_data->max_cwnd /
				    CCV(ccv, t_maxseg));
			}
		}
	} else if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
	    !(ccv->flags & CCF_CWND_LIMITED)) {
		cubic_data->flags |= CUBICFLAG_IN_APPLIMIT;
	}
}
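/*
 * For reference, cubic_k() (defined in cc_cubic.h) derives K from equation 2
 * of the I-D, roughly K = cubic_root(W_max * (1 - beta) / C) with W_max
 * expressed in segments, and returns it in fixed point with CUBIC_SHIFT
 * worth of precision, matching the K field in struct cubic above.
 */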
/*
 * This is a Cubic specific implementation of after_idle.
 *   - Reset cwnd by calling New Reno implementation of after_idle.
 *   - Reset t_last_cong.
 */
static void
cubic_after_idle(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	cubic_data->max_cwnd = ulmax(cubic_data->max_cwnd, CCV(ccv, snd_cwnd));
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg));

	newreno_cc_after_idle(ccv);
	cubic_data->t_last_cong = ticks;
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{
	free(ccv->cc_data, M_CC_MEM);
}

static size_t
cubic_data_sz(void)
{
	return (sizeof(struct cubic));
}

static int
cubic_cb_init(struct cc_var *ccv, void *ptr)
{
	struct cubic *cubic_data;

	INP_WLOCK_ASSERT(ccv->ccvc.tcp->t_inpcb);
	if (ptr == NULL) {
		cubic_data = malloc(sizeof(struct cubic), M_CC_MEM, M_NOWAIT|M_ZERO);
		if (cubic_data == NULL)
			return (ENOMEM);
	} else
		cubic_data = ptr;

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = ticks;
	cubic_data->min_rtt_ticks = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_ticks = 1;

	ccv->cc_data = cubic_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;
	u_int mss;

	cubic_data = ccv->cc_data;
	mss = tcp_maxseg(ccv->ccvc.tcp);

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
				cubic_ssthresh_update(ccv, mss);
				cubic_data->flags |= CUBICFLAG_CONG_EVENT;
				cubic_data->t_last_cong = ticks;
				cubic_data->K = cubic_k(cubic_data->max_cwnd / mss);
			}
			ENTER_RECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
			cubic_ssthresh_update(ccv, mss);
			cubic_data->flags |= CUBICFLAG_CONG_EVENT;
			cubic_data->t_last_cong = ticks;
			cubic_data->K = cubic_k(cubic_data->max_cwnd / mss);
			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_RTO:
		/* RFC8312 Section 4.7 */
		if (CCV(ccv, t_rxtshift) == 1) {
			cubic_data->t_last_cong_prev = cubic_data->t_last_cong;
			cubic_data->prev_max_cwnd_cp = cubic_data->prev_max_cwnd;
		}
		cubic_data->flags |= CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT;
		cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
		CCV(ccv, snd_ssthresh) = ((uint64_t)CCV(ccv, snd_cwnd) *
		    CUBIC_BETA) >> CUBIC_SHIFT;
		CCV(ccv, snd_cwnd) = mss;
		break;

	case CC_RTO_ERR:
		cubic_data->flags &= ~(CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT);
		cubic_data->max_cwnd = cubic_data->prev_max_cwnd;
		cubic_data->prev_max_cwnd = cubic_data->prev_max_cwnd_cp;
		cubic_data->t_last_cong = cubic_data->t_last_cong_prev;
		cubic_data->K = cubic_k(cubic_data->max_cwnd / mss);
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded. Without
	 * this here bad things happen when entries from the TCP hostcache
	 * get used.
	 */
	cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
}

static int
cubic_mod_init(void)
{
	return (0);
}
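/*
 * For reference, on leaving fast recovery cubic_post_recovery() below sets,
 * roughly:
 *   cwnd = max(pipe, maxseg) + maxseg       if pipe < ssthresh
 *   cwnd = max(W_max * beta, 2 * maxseg)    otherwise
 * i.e. RFC 6582 style deflation when little data is in flight, and CUBIC's
 * multiplicative decrease otherwise.
 */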
/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int pipe;

	cubic_data = ccv->cc_data;
	pipe = 0;

	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in
		 * the NewReno RFC. Otherwise, use the CUBIC method.
		 *
		 * XXXLAS: Find a way to do this without needing curack
		 */
		if (V_tcp_do_newsack)
			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
		else
			pipe = CCV(ccv, snd_max) - ccv->curack;

		if (pipe < CCV(ccv, snd_ssthresh))
			/*
			 * Ensure that cwnd does not collapse to 1 MSS under
			 * adverse conditions. Implements RFC 6582.
			 */
			CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
			    CCV(ccv, t_maxseg);
		else
			/* Update cwnd based on beta and adjusted max_cwnd. */
			CCV(ccv, snd_cwnd) = max(((uint64_t)cubic_data->max_cwnd *
			    CUBIC_BETA) >> CUBIC_SHIFT,
			    2 * CCV(ccv, t_maxseg));
	}

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_ticks >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_ticks = (int)(cubic_data->sum_rtt_ticks /
		    cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_ticks = 0;
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_ticks;

	/* Ignore srtt until a min number of samples have been taken. */
	if (CCV(ccv, t_rttupdated) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		t_srtt_ticks = CCV(ccv, t_srtt) / TCP_RTT_SCALE;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialised
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_ticks < cubic_data->min_rtt_ticks ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE)) {
			cubic_data->min_rtt_ticks = max(1, t_srtt_ticks);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_ticks with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_ticks >
			    cubic_data->mean_rtt_ticks)
				cubic_data->mean_rtt_ticks =
				    cubic_data->min_rtt_ticks;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_ticks += t_srtt_ticks;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg)
{
	struct cubic *cubic_data;
	uint32_t ssthresh;
	uint32_t cwnd;

	cubic_data = ccv->cc_data;
	cwnd = CCV(ccv, snd_cwnd);

	/* Fast convergence heuristic. */
	if (cwnd < cubic_data->max_cwnd) {
		cwnd = ((uint64_t)cwnd * CUBIC_FC_FACTOR) >> CUBIC_SHIFT;
	}
	cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
	cubic_data->max_cwnd = cwnd;
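	/*
	 * The scaling above is CUBIC's fast convergence heuristic (Section
	 * 4.6 of RFC 8312): if this congestion event occurs before cwnd has
	 * regained the previous max_cwnd, remember roughly
	 * cwnd * (1 + beta) / 2 rather than cwnd, so bandwidth is released
	 * to newer flows sooner. CUBIC_FC_FACTOR in cc_cubic.h approximates
	 * (1 + beta) / 2 (~0.85 for beta = 0.7) in fixed point.
	 */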
	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5
	 * and reduce max_cwnd to cwnd * beta. This aligns the cubic concave
	 * region appropriately. On subsequent congestion events, set
	 * ssthresh to cwnd * beta.
	 */
	if ((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) {
		ssthresh = cwnd >> 1;
		cubic_data->max_cwnd = ((uint64_t)cwnd *
		    CUBIC_BETA) >> CUBIC_SHIFT;
	} else {
		ssthresh = ((uint64_t)cwnd *
		    CUBIC_BETA) >> CUBIC_SHIFT;
	}
	CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * maxseg);
}

DECLARE_CC_MODULE(cubic, &cubic_cc_algo);
MODULE_VERSION(cubic, 2);
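/*
 * Usage note: once this module is available (e.g. loaded with
 * "kldload cc_cubic"), CUBIC can be made the system-wide default via the
 * net.inet.tcp.cc.algorithm sysctl, or selected per connection with the
 * TCP_CONGESTION socket option.
 */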