1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2007-2008 5 * Swinburne University of Technology, Melbourne, Australia. 6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 7 * Copyright (c) 2010 The FreeBSD Foundation 8 * All rights reserved. 9 * 10 * This software was developed at the Centre for Advanced Internet 11 * Architectures, Swinburne University of Technology, by Lawrence Stewart and 12 * James Healy, made possible in part by a grant from the Cisco University 13 * Research Program Fund at Community Foundation Silicon Valley. 14 * 15 * Portions of this software were developed at the Centre for Advanced 16 * Internet Architectures, Swinburne University of Technology, Melbourne, 17 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 18 * 19 * Redistribution and use in source and binary forms, with or without 20 * modification, are permitted provided that the following conditions 21 * are met: 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 */ 40 41 /* 42 * This software was first released in 2007 by James Healy and Lawrence Stewart 43 * whilst working on the NewTCP research project at Swinburne University of 44 * Technology's Centre for Advanced Internet Architectures, Melbourne, 45 * Australia, which was made possible in part by a grant from the Cisco 46 * University Research Program Fund at Community Foundation Silicon Valley. 47 * More details are available at: 48 * http://caia.swin.edu.au/urp/newtcp/ 49 */ 50 51 #include <sys/cdefs.h> 52 __FBSDID("$FreeBSD$"); 53 #include <opt_cc.h> 54 #include <sys/param.h> 55 #include <sys/kernel.h> 56 #include <sys/libkern.h> 57 #include <sys/lock.h> 58 #include <sys/malloc.h> 59 #include <sys/module.h> 60 #include <sys/mutex.h> 61 #include <sys/queue.h> 62 #include <sys/rwlock.h> 63 #include <sys/sbuf.h> 64 #include <sys/socket.h> 65 #include <sys/socketvar.h> 66 #include <sys/sysctl.h> 67 68 #include <net/vnet.h> 69 70 #include <netinet/in.h> 71 #include <netinet/in_pcb.h> 72 #include <netinet/tcp.h> 73 #include <netinet/tcp_seq.h> 74 #include <netinet/tcp_var.h> 75 #include <netinet/tcp_log_buf.h> 76 #include <netinet/tcp_hpts.h> 77 #include <netinet/cc/cc.h> 78 #include <netinet/cc/cc_module.h> 79 80 /* 81 * Have a sane default if no CC_DEFAULT is specified in the kernel config file. 82 */ 83 #ifndef CC_DEFAULT 84 #define CC_DEFAULT "newreno" 85 #endif 86 87 MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory"); 88 89 /* 90 * List of available cc algorithms on the current system. First element 91 * is used as the system default CC algorithm. 92 */ 93 struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); 94 95 /* Protects the cc_list TAILQ. */ 96 struct rwlock cc_list_lock; 97 98 VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL; 99 100 VNET_DEFINE(uint32_t, newreno_beta) = 50; 101 #define V_newreno_beta VNET(newreno_beta) 102 103 /* 104 * Sysctl handler to show and change the default CC algorithm. 105 */ 106 static int 107 cc_default_algo(SYSCTL_HANDLER_ARGS) 108 { 109 char default_cc[TCP_CA_NAME_MAX]; 110 struct cc_algo *funcs; 111 int error; 112 113 /* Get the current default: */ 114 CC_LIST_RLOCK(); 115 if (CC_DEFAULT_ALGO() != NULL) 116 strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc)); 117 else 118 memset(default_cc, 0, TCP_CA_NAME_MAX); 119 CC_LIST_RUNLOCK(); 120 121 error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req); 122 123 /* Check for error or no change */ 124 if (error != 0 || req->newptr == NULL) 125 goto done; 126 127 error = ESRCH; 128 /* Find algo with specified name and set it to default. */ 129 CC_LIST_RLOCK(); 130 STAILQ_FOREACH(funcs, &cc_list, entries) { 131 if (strncmp(default_cc, funcs->name, sizeof(default_cc))) 132 continue; 133 V_default_cc_ptr = funcs; 134 error = 0; 135 break; 136 } 137 CC_LIST_RUNLOCK(); 138 done: 139 return (error); 140 } 141 142 /* 143 * Sysctl handler to display the list of available CC algorithms. 144 */ 145 static int 146 cc_list_available(SYSCTL_HANDLER_ARGS) 147 { 148 struct cc_algo *algo; 149 struct sbuf *s; 150 int err, first, nalgos; 151 152 err = nalgos = 0; 153 first = 1; 154 155 CC_LIST_RLOCK(); 156 STAILQ_FOREACH(algo, &cc_list, entries) { 157 nalgos++; 158 } 159 CC_LIST_RUNLOCK(); 160 if (nalgos == 0) { 161 return (ENOENT); 162 } 163 s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN); 164 165 if (s == NULL) 166 return (ENOMEM); 167 168 /* 169 * It is theoretically possible for the CC list to have grown in size 170 * since the call to sbuf_new() and therefore for the sbuf to be too 171 * small. If this were to happen (incredibly unlikely), the sbuf will 172 * reach an overflow condition, sbuf_printf() will return an error and 173 * the sysctl will fail gracefully. 174 */ 175 CC_LIST_RLOCK(); 176 STAILQ_FOREACH(algo, &cc_list, entries) { 177 err = sbuf_printf(s, first ? "%s" : ", %s", algo->name); 178 if (err) { 179 /* Sbuf overflow condition. */ 180 err = EOVERFLOW; 181 break; 182 } 183 first = 0; 184 } 185 CC_LIST_RUNLOCK(); 186 187 if (!err) { 188 sbuf_finish(s); 189 err = sysctl_handle_string(oidp, sbuf_data(s), 0, req); 190 } 191 192 sbuf_delete(s); 193 return (err); 194 } 195 196 /* 197 * Return the number of times a proposed removal_cc is 198 * being used as the default. 199 */ 200 static int 201 cc_check_default(struct cc_algo *remove_cc) 202 { 203 int cnt = 0; 204 VNET_ITERATOR_DECL(vnet_iter); 205 206 CC_LIST_LOCK_ASSERT(); 207 208 VNET_LIST_RLOCK_NOSLEEP(); 209 VNET_FOREACH(vnet_iter) { 210 CURVNET_SET(vnet_iter); 211 if ((CC_DEFAULT_ALGO() != NULL) && 212 strncmp(CC_DEFAULT_ALGO()->name, 213 remove_cc->name, 214 TCP_CA_NAME_MAX) == 0) { 215 cnt++; 216 } 217 CURVNET_RESTORE(); 218 } 219 VNET_LIST_RUNLOCK_NOSLEEP(); 220 return (cnt); 221 } 222 223 /* 224 * Initialise CC subsystem on system boot. 225 */ 226 static void 227 cc_init(void) 228 { 229 CC_LIST_LOCK_INIT(); 230 STAILQ_INIT(&cc_list); 231 } 232 233 /* 234 * Returns non-zero on success, 0 on failure. 235 */ 236 int 237 cc_deregister_algo(struct cc_algo *remove_cc) 238 { 239 struct cc_algo *funcs, *tmpfuncs; 240 int err; 241 242 err = ENOENT; 243 244 /* Remove algo from cc_list so that new connections can't use it. */ 245 CC_LIST_WLOCK(); 246 STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { 247 if (funcs == remove_cc) { 248 if (cc_check_default(remove_cc)) { 249 CC_LIST_WUNLOCK(); 250 return(EBUSY); 251 } 252 break; 253 } 254 } 255 remove_cc->flags |= CC_MODULE_BEING_REMOVED; 256 CC_LIST_WUNLOCK(); 257 err = tcp_ccalgounload(remove_cc); 258 /* 259 * Now back through and we either remove the temp flag 260 * or pull the registration. 261 */ 262 CC_LIST_WLOCK(); 263 STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { 264 if (funcs == remove_cc) { 265 if (err == 0) 266 STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); 267 else 268 funcs->flags &= ~CC_MODULE_BEING_REMOVED; 269 break; 270 } 271 } 272 CC_LIST_WUNLOCK(); 273 return (err); 274 } 275 276 /* 277 * Returns 0 on success, non-zero on failure. 278 */ 279 int 280 cc_register_algo(struct cc_algo *add_cc) 281 { 282 struct cc_algo *funcs; 283 int err; 284 285 err = 0; 286 287 /* 288 * Iterate over list of registered CC algorithms and make sure 289 * we're not trying to add a duplicate. 290 */ 291 CC_LIST_WLOCK(); 292 STAILQ_FOREACH(funcs, &cc_list, entries) { 293 if (funcs == add_cc || 294 strncmp(funcs->name, add_cc->name, 295 TCP_CA_NAME_MAX) == 0) { 296 err = EEXIST; 297 break; 298 } 299 } 300 /* 301 * The first loaded congestion control module will become 302 * the default until we find the "CC_DEFAULT" defined in 303 * the config (if we do). 304 */ 305 if (!err) { 306 STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); 307 if (strcmp(add_cc->name, CC_DEFAULT) == 0) { 308 V_default_cc_ptr = add_cc; 309 } else if (V_default_cc_ptr == NULL) { 310 V_default_cc_ptr = add_cc; 311 } 312 } 313 CC_LIST_WUNLOCK(); 314 315 return (err); 316 } 317 318 static void 319 vnet_cc_sysinit(void *arg) 320 { 321 struct cc_algo *cc; 322 323 if (IS_DEFAULT_VNET(curvnet)) 324 return; 325 326 CURVNET_SET(vnet0); 327 cc = V_default_cc_ptr; 328 CURVNET_RESTORE(); 329 330 V_default_cc_ptr = cc; 331 } 332 VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, 333 vnet_cc_sysinit, NULL); 334 335 /* 336 * Perform any necessary tasks before we exit congestion recovery. 337 */ 338 void 339 newreno_cc_post_recovery(struct cc_var *ccv) 340 { 341 int pipe; 342 343 if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { 344 /* 345 * Fast recovery will conclude after returning from this 346 * function. Window inflation should have left us with 347 * approximately snd_ssthresh outstanding data. But in case we 348 * would be inclined to send a burst, better to do it via the 349 * slow start mechanism. 350 * 351 * XXXLAS: Find a way to do this without needing curack 352 */ 353 if (V_tcp_do_newsack) 354 pipe = tcp_compute_pipe(ccv->ccvc.tcp); 355 else 356 pipe = CCV(ccv, snd_max) - ccv->curack; 357 if (pipe < CCV(ccv, snd_ssthresh)) 358 /* 359 * Ensure that cwnd does not collapse to 1 MSS under 360 * adverse conditions. Implements RFC6582 361 */ 362 CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + 363 CCV(ccv, t_maxseg); 364 else 365 CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); 366 } 367 } 368 369 void 370 newreno_cc_after_idle(struct cc_var *ccv) 371 { 372 uint32_t rw; 373 /* 374 * If we've been idle for more than one retransmit timeout the old 375 * congestion window is no longer current and we have to reduce it to 376 * the restart window before we can transmit again. 377 * 378 * The restart window is the initial window or the last CWND, whichever 379 * is smaller. 380 * 381 * This is done to prevent us from flooding the path with a full CWND at 382 * wirespeed, overloading router and switch buffers along the way. 383 * 384 * See RFC5681 Section 4.1. "Restarting Idle Connections". 385 * 386 * In addition, per RFC2861 Section 2, the ssthresh is set to the 387 * maximum of the former ssthresh or 3/4 of the old cwnd, to 388 * not exit slow-start prematurely. 389 */ 390 rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp)); 391 392 CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), 393 CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2)); 394 395 CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); 396 } 397 398 /* 399 * Perform any necessary tasks before we enter congestion recovery. 400 */ 401 void 402 newreno_cc_cong_signal(struct cc_var *ccv, uint32_t type) 403 { 404 uint32_t cwin, factor; 405 u_int mss; 406 407 cwin = CCV(ccv, snd_cwnd); 408 mss = tcp_fixed_maxseg(ccv->ccvc.tcp); 409 /* 410 * Other TCP congestion controls use newreno_cong_signal(), but 411 * with their own private cc_data. Make sure the cc_data is used 412 * correctly. 413 */ 414 factor = V_newreno_beta; 415 416 /* Catch algos which mistakenly leak private signal types. */ 417 KASSERT((type & CC_SIGPRIVMASK) == 0, 418 ("%s: congestion signal type 0x%08x is private\n", __func__, type)); 419 420 cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 421 2) * mss; 422 423 switch (type) { 424 case CC_NDUPACK: 425 if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { 426 if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) 427 CCV(ccv, snd_ssthresh) = cwin; 428 ENTER_RECOVERY(CCV(ccv, t_flags)); 429 } 430 break; 431 case CC_ECN: 432 if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { 433 CCV(ccv, snd_ssthresh) = cwin; 434 CCV(ccv, snd_cwnd) = cwin; 435 ENTER_CONGRECOVERY(CCV(ccv, t_flags)); 436 } 437 break; 438 case CC_RTO: 439 CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd), 440 CCV(ccv, snd_cwnd)) / 2 / mss, 441 2) * mss; 442 CCV(ccv, snd_cwnd) = mss; 443 break; 444 } 445 } 446 447 void 448 newreno_cc_ack_received(struct cc_var *ccv, uint16_t type) 449 { 450 if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && 451 (ccv->flags & CCF_CWND_LIMITED)) { 452 u_int cw = CCV(ccv, snd_cwnd); 453 u_int incr = CCV(ccv, t_maxseg); 454 455 /* 456 * Regular in-order ACK, open the congestion window. 457 * Method depends on which congestion control state we're 458 * in (slow start or cong avoid) and if ABC (RFC 3465) is 459 * enabled. 460 * 461 * slow start: cwnd <= ssthresh 462 * cong avoid: cwnd > ssthresh 463 * 464 * slow start and ABC (RFC 3465): 465 * Grow cwnd exponentially by the amount of data 466 * ACKed capping the max increment per ACK to 467 * (abc_l_var * maxseg) bytes. 468 * 469 * slow start without ABC (RFC 5681): 470 * Grow cwnd exponentially by maxseg per ACK. 471 * 472 * cong avoid and ABC (RFC 3465): 473 * Grow cwnd linearly by maxseg per RTT for each 474 * cwnd worth of ACKed data. 475 * 476 * cong avoid without ABC (RFC 5681): 477 * Grow cwnd linearly by approximately maxseg per RTT using 478 * maxseg^2 / cwnd per ACK as the increment. 479 * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to 480 * avoid capping cwnd. 481 */ 482 if (cw > CCV(ccv, snd_ssthresh)) { 483 if (V_tcp_do_rfc3465) { 484 if (ccv->flags & CCF_ABC_SENTAWND) 485 ccv->flags &= ~CCF_ABC_SENTAWND; 486 else 487 incr = 0; 488 } else 489 incr = max((incr * incr / cw), 1); 490 } else if (V_tcp_do_rfc3465) { 491 /* 492 * In slow-start with ABC enabled and no RTO in sight? 493 * (Must not use abc_l_var > 1 if slow starting after 494 * an RTO. On RTO, snd_nxt = snd_una, so the 495 * snd_nxt == snd_max check is sufficient to 496 * handle this). 497 * 498 * XXXLAS: Find a way to signal SS after RTO that 499 * doesn't rely on tcpcb vars. 500 */ 501 uint16_t abc_val; 502 503 if (ccv->flags & CCF_USE_LOCAL_ABC) 504 abc_val = ccv->labc; 505 else 506 abc_val = V_tcp_abc_l_var; 507 if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) 508 incr = min(ccv->bytes_this_ack, 509 ccv->nsegs * abc_val * 510 CCV(ccv, t_maxseg)); 511 else 512 incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); 513 514 } 515 /* ABC is on by default, so incr equals 0 frequently. */ 516 if (incr > 0) 517 CCV(ccv, snd_cwnd) = min(cw + incr, 518 TCP_MAXWIN << CCV(ccv, snd_scale)); 519 } 520 } 521 522 /* 523 * Handles kld related events. Returns 0 on success, non-zero on failure. 524 */ 525 int 526 cc_modevent(module_t mod, int event_type, void *data) 527 { 528 struct cc_algo *algo; 529 int err; 530 531 err = 0; 532 algo = (struct cc_algo *)data; 533 534 switch(event_type) { 535 case MOD_LOAD: 536 if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) { 537 /* 538 * A module must have a cc_data_sz function 539 * even if it has no data it should return 0. 540 */ 541 printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n"); 542 err = EINVAL; 543 break; 544 } 545 if (algo->mod_init != NULL) 546 err = algo->mod_init(); 547 if (!err) 548 err = cc_register_algo(algo); 549 break; 550 551 case MOD_QUIESCE: 552 case MOD_SHUTDOWN: 553 case MOD_UNLOAD: 554 err = cc_deregister_algo(algo); 555 if (!err && algo->mod_destroy != NULL) 556 algo->mod_destroy(); 557 if (err == ENOENT) 558 err = 0; 559 break; 560 561 default: 562 err = EINVAL; 563 break; 564 } 565 566 return (err); 567 } 568 569 SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); 570 571 /* Declare sysctl tree and populate it. */ 572 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 573 "Congestion control related settings"); 574 575 SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, 576 CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 577 NULL, 0, cc_default_algo, "A", 578 "Default congestion control algorithm"); 579 580 SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, 581 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 582 NULL, 0, cc_list_available, "A", 583 "List available congestion control algorithms"); 584 585 VNET_DEFINE(int, cc_do_abe) = 0; 586 SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW, 587 &VNET_NAME(cc_do_abe), 0, 588 "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)"); 589 590 VNET_DEFINE(int, cc_abe_frlossreduce) = 0; 591 SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW, 592 &VNET_NAME(cc_abe_frlossreduce), 0, 593 "Apply standard beta instead of ABE-beta during ECN-signalled congestion " 594 "recovery episodes if loss also needs to be repaired"); 595