/*
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley. More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/time.h>

#include <inet/tcp_impl.h>
#include <inet/cc.h>
#include <inet/cc/cc_cubic.h>
#include <inet/cc/cc_module.h>

static struct modlmisc cc_cubic_modlmisc = {
	&mod_miscops,
	"Cubic Congestion Control"
};

static struct modlinkage cc_cubic_modlinkage = {
	MODREV_1,
	&cc_cubic_modlmisc,
	NULL
};
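/*
 * For reference, a sketch of the window equations from the I-D that the
 * fixed-point helpers cubic_k() and cubic_cwnd() in cc_cubic.h implement.
 * With t the time since the last congestion event, W_max the cwnd at that
 * event, C = 0.4 and multiplicative decrease factor beta = 0.2:
 *
 *	W_cubic(t) = C * (t - K)^3 + W_max
 *	K = cbrt(W_max * beta / C)
 *
 * K is the interval the concave region of the curve takes to climb back to
 * W_max; growth is slowest around W_max and accelerates again in the convex
 * region beyond it while probing for additional bandwidth.
 */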
/*
 * CUBIC uses the NewReno implementation of after_idle and uses NewReno's
 * ack_received callback during slow start.
 */
static struct cc_algo *newreno_cc_algo;

static void cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void cubic_cb_destroy(struct cc_var *ccv);
static int cubic_cb_init(struct cc_var *ccv);
static void cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void cubic_conn_init(struct cc_var *ccv);
static void cubic_post_recovery(struct cc_var *ccv);
static void cubic_record_rtt(struct cc_var *ccv);
static void cubic_ssthresh_update(struct cc_var *ccv);
static void cubic_after_idle(struct cc_var *ccv);

struct cubic {
	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in nanoseconds. */
	hrtime_t	sum_rtt_nsecs;
	/* cwnd at the most recent congestion event. */
	uint32_t	max_cwnd;
	/* cwnd at the previous congestion event. */
	uint32_t	prev_max_cwnd;
	/* Number of congestion events. */
	uint32_t	num_cong_events;
	/* Minimum observed rtt in nanoseconds. */
	hrtime_t	min_rtt_nsecs;
	/* Mean observed rtt between congestion epochs. */
	hrtime_t	mean_rtt_nsecs;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Time of last congestion event in nanoseconds. */
	hrtime_t	t_last_cong;
};

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.post_recovery = cubic_post_recovery,
	.after_idle = cubic_after_idle,
};

int
_init(void)
{
	int err;

	if ((newreno_cc_algo = cc_load_algo("newreno")) == NULL)
		return (EINVAL);

	if ((err = cc_register_algo(&cubic_cc_algo)) == 0) {
		if ((err = mod_install(&cc_cubic_modlinkage)) != 0)
			(void) cc_deregister_algo(&cubic_cc_algo);
	}

	return (err);
}

int
_fini(void)
{
	/* XXX Not unloadable for now */
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&cc_cubic_modlinkage, modinfop));
}

static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	uint32_t w_tf, w_cubic_next;
	hrtime_t nsecs_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	/*
	 * Grow cwnd only for a regular ACK while we are neither in congestion
	 * nor fast recovery and are cwnd limited, and additionally one of the
	 * following holds: ABC is disabled, we are still in slow start, or
	 * ABC is enabled and a cwnd's worth of bytes has been sent.
	 */
	if (type == CC_ACK && !IN_RECOVERY(ccv->flags) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!CC_ABC(ccv) ||
	    CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
	    (CC_ABC(ccv) && (ccv->flags & CCF_ABC_SENTAWND)))) {
		/* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)
			newreno_cc_algo->ack_received(ccv, type);
		else {
			nsecs_since_cong = gethrtime() -
			    cubic_data->t_last_cong;

			/*
			 * The mean RTT is used to best reflect the equations
			 * in the I-D. Using min_rtt in the tf_cwnd calculation
			 * causes w_tf to grow much faster than it should if
			 * the RTT is dominated by network buffering rather
			 * than propagation delay.
			 */
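			/*
			 * As a sketch, tf_cwnd() computes (in fixed point)
			 * the window an AIMD (TCP-friendly) flow would have
			 * reached in the same interval, per the I-D:
			 *
			 *	W_tcp(t) = W_max * (1 - beta) +
			 *	    3 * (beta / (2 - beta)) * (t / RTT)
			 *
			 * with beta = 0.2 and t the time since the last
			 * congestion event.
			 */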
			w_tf = tf_cwnd(nsecs_since_cong,
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss));

			w_cubic_next = cubic_cwnd(nsecs_since_cong +
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

			if (w_cubic_next < w_tf) {
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				if (CCV(ccv, tcp_cwnd) < w_tf)
					CCV(ccv, tcp_cwnd) = w_tf;
			} else if (CCV(ccv, tcp_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 */
				if (CC_ABC(ccv))
					CCV(ccv, tcp_cwnd) = MIN(w_cubic_next,
					    INT_MAX);
				else
					CCV(ccv, tcp_cwnd) += MAX(1,
					    ((MIN(w_cubic_next, INT_MAX) -
					    CCV(ccv, tcp_cwnd)) *
					    CCV(ccv, tcp_mss)) /
					    CCV(ccv, tcp_cwnd));
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when the hostcache has a relevant entry),
			 * keep updating our current estimate of the max_cwnd.
			 */
			if (cubic_data->num_cong_events == 0 &&
			    cubic_data->max_cwnd < CCV(ccv, tcp_cwnd)) {
				cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
				cubic_data->K = cubic_k(cubic_data->max_cwnd /
				    CCV(ccv, tcp_mss));
			}
		}
	}
}

/*
 * This is a CUBIC-specific implementation of after_idle:
 * - Reset cwnd by calling the NewReno implementation of after_idle.
 * - Reset t_last_cong.
 */
static void
cubic_after_idle(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	cubic_data->max_cwnd = max(cubic_data->max_cwnd, CCV(ccv, tcp_cwnd));
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss));

	newreno_cc_algo->after_idle(ccv);
	cubic_data->t_last_cong = gethrtime();
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{

	if (ccv->cc_data != NULL)
		kmem_free(ccv->cc_data, sizeof (struct cubic));
}

static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = kmem_zalloc(sizeof (struct cubic), KM_NOSLEEP);

	if (cubic_data == NULL)
		return (ENOMEM);

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = gethrtime();
	cubic_data->min_rtt_nsecs = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_nsecs = 1;

	ccv->cc_data = cubic_data;

	return (0);
}
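/*
 * A descriptive summary of how cubic_cong_signal() below reacts to each
 * congestion signal:
 *
 *	CC_NDUPACK:	on first entry, update ssthresh, record the current
 *			cwnd as max_cwnd, set cwnd to ssthresh and enter
 *			fast recovery.
 *	CC_ECN:		as for CC_NDUPACK, but additionally restart the
 *			congestion epoch (reset t_last_cong) and enter
 *			congestion recovery.
 *	CC_RTO:		update ssthresh, restart the epoch and collapse
 *			cwnd to a single MSS.
 */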
/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;
	uint32_t cwin;
	uint32_t mss;

	cubic_data = ccv->cc_data;
	cwin = CCV(ccv, tcp_cwnd);
	mss = CCV(ccv, tcp_mss);

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(ccv->flags)) {
			if (!IN_CONGRECOVERY(ccv->flags)) {
				cubic_ssthresh_update(ccv);
				cubic_data->num_cong_events++;
				cubic_data->prev_max_cwnd =
				    cubic_data->max_cwnd;
				cubic_data->max_cwnd = cwin;
				CCV(ccv, tcp_cwnd) =
				    CCV(ccv, tcp_cwnd_ssthresh);
			}
			ENTER_RECOVERY(ccv->flags);
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(ccv->flags)) {
			cubic_ssthresh_update(ccv);
			cubic_data->num_cong_events++;
			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
			cubic_data->max_cwnd = cwin;
			cubic_data->t_last_cong = gethrtime();
			CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
			ENTER_CONGRECOVERY(ccv->flags);
		}
		break;

	case CC_RTO:
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was.
		 */
		cubic_data->num_cong_events++;
		cubic_data->t_last_cong = gethrtime();
		cubic_ssthresh_update(ccv);
		cubic_data->max_cwnd = cwin;
		CCV(ccv, tcp_cwnd) = mss;
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded. Without
	 * it, bad things happen when entries from the TCP hostcache get used.
	 */
	cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	uint32_t mss, pipe;

	cubic_data = ccv->cc_data;
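	/*
	 * Per the I-D's fast convergence heuristic: if this loss event left
	 * max_cwnd below the previous epoch's maximum, scale max_cwnd down
	 * by an additional factor of (2 - beta) / 2 = 0.9 (encoded in fixed
	 * point as CUBIC_FC_FACTOR), releasing bandwidth so that competing
	 * flows converge to a fair share more quickly.
	 */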
	/* Fast convergence heuristic. */
	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) {
		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
		    >> CUBIC_SHIFT;
	}

	/*
	 * There is a risk that if the cwnd becomes less than mss, and
	 * we do not get enough acks to drive it back up beyond mss,
	 * we will stop transmitting data altogether.
	 *
	 * The Cubic RFC defines values in terms of units of mss. Therefore
	 * we must make sure we have at least 1 mss to make progress, since
	 * the algorithm is written that way.
	 */
	mss = CCV(ccv, tcp_mss);

	if (IN_FASTRECOVERY(ccv->flags)) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in
		 * the NewReno RFC. Otherwise, use the CUBIC method.
		 */
		pipe = CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna);
		if (pipe < CCV(ccv, tcp_cwnd_ssthresh)) {
			/*
			 * Ensure that cwnd does not collapse to 1 MSS under
			 * adverse conditions. Implements RFC 6582.
			 */
			CCV(ccv, tcp_cwnd) = MAX(pipe, mss) + mss;
		} else {
			/* Update cwnd based on beta and adjusted max_cwnd. */
			CCV(ccv, tcp_cwnd) = max(mss, ((CUBIC_BETA *
			    cubic_data->max_cwnd) >> CUBIC_SHIFT));
		}
	} else {
		CCV(ccv, tcp_cwnd) = max(mss, CCV(ccv, tcp_cwnd));
	}

	cubic_data->t_last_cong = gethrtime();

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_nsecs >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_nsecs =
		    (cubic_data->sum_rtt_nsecs / cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_nsecs = 0;
	cubic_data->K = cubic_k(cubic_data->max_cwnd / mss);
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_nsecs;

	/* Ignore srtt until a min number of samples have been taken. */
	if (CCV(ccv, tcp_rtt_update) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		/* tcp_rtt_sa is 8 * smoothed RTT in nanoseconds. */
		t_srtt_nsecs = CCV(ccv, tcp_rtt_sa) >> 3;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialized
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_nsecs < cubic_data->min_rtt_nsecs ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)) {
			cubic_data->min_rtt_nsecs = max(1, t_srtt_nsecs);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_nsecs with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_nsecs >
			    cubic_data->mean_rtt_nsecs)
				cubic_data->mean_rtt_nsecs =
				    cubic_data->min_rtt_nsecs;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_nsecs += t_srtt_nsecs;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5; on
	 * subsequent congestion events, set it to cwnd * beta.
	 */
	if (cubic_data->num_cong_events == 0)
		CCV(ccv, tcp_cwnd_ssthresh) = CCV(ccv, tcp_cwnd) >> 1;
	else
		CCV(ccv, tcp_cwnd_ssthresh) =
		    (CCV(ccv, tcp_cwnd) * CUBIC_BETA) >> CUBIC_SHIFT;
}