cc.c (75dfc66c1b2b44609e5a7c3e1d6a751be4922689) | cc.c (b8d60729deefa0bd13e6a395fcab4928e6e10445) |
---|---|
1/*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2007-2008 5 * Swinburne University of Technology, Melbourne, Australia. 6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 7 * Copyright (c) 2010 The FreeBSD Foundation 8 * All rights reserved. --- 36 unchanged lines hidden (view full) --- 45 * Australia, which was made possible in part by a grant from the Cisco 46 * University Research Program Fund at Community Foundation Silicon Valley. 47 * More details are available at: 48 * http://caia.swin.edu.au/urp/newtcp/ 49 */ 50 51#include <sys/cdefs.h> 52__FBSDID("$FreeBSD$"); | 1/*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2007-2008 5 * Swinburne University of Technology, Melbourne, Australia. 6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 7 * Copyright (c) 2010 The FreeBSD Foundation 8 * All rights reserved. --- 36 unchanged lines hidden (view full) --- 45 * Australia, which was made possible in part by a grant from the Cisco 46 * University Research Program Fund at Community Foundation Silicon Valley. 47 * More details are available at: 48 * http://caia.swin.edu.au/urp/newtcp/ 49 */ 50 51#include <sys/cdefs.h> 52__FBSDID("$FreeBSD$"); |
53 | 53#include <opt_cc.h> |
54#include <sys/param.h> 55#include <sys/kernel.h> 56#include <sys/libkern.h> 57#include <sys/lock.h> 58#include <sys/malloc.h> 59#include <sys/module.h> 60#include <sys/mutex.h> 61#include <sys/queue.h> 62#include <sys/rwlock.h> 63#include <sys/sbuf.h> 64#include <sys/socket.h> 65#include <sys/socketvar.h> 66#include <sys/sysctl.h> 67 68#include <net/vnet.h> 69 70#include <netinet/in.h> 71#include <netinet/in_pcb.h> 72#include <netinet/tcp.h> | 54#include <sys/param.h> 55#include <sys/kernel.h> 56#include <sys/libkern.h> 57#include <sys/lock.h> 58#include <sys/malloc.h> 59#include <sys/module.h> 60#include <sys/mutex.h> 61#include <sys/queue.h> 62#include <sys/rwlock.h> 63#include <sys/sbuf.h> 64#include <sys/socket.h> 65#include <sys/socketvar.h> 66#include <sys/sysctl.h> 67 68#include <net/vnet.h> 69 70#include <netinet/in.h> 71#include <netinet/in_pcb.h> 72#include <netinet/tcp.h> |
73#include <netinet/tcp_seq.h> |
|
73#include <netinet/tcp_var.h> | 74#include <netinet/tcp_var.h> |
75#include <netinet/tcp_log_buf.h> 76#include <netinet/tcp_hpts.h> |
|
74#include <netinet/cc/cc.h> | 77#include <netinet/cc/cc.h> |
75 | |
76#include <netinet/cc/cc_module.h> 77 | 78#include <netinet/cc/cc_module.h> 79 |
80MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory"); 81 |
|
78/* 79 * List of available cc algorithms on the current system. First element 80 * is used as the system default CC algorithm. 81 */ 82struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); 83 84/* Protects the cc_list STAILQ. */ 85struct rwlock cc_list_lock; 86 | 82/* 83 * List of available cc algorithms on the current system. First element 84 * is used as the system default CC algorithm. 85 */ 86struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); 87 88/* Protects the cc_list STAILQ. */ 89struct rwlock cc_list_lock; 90 |
87VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo; | 91VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL; |
88 | 92 |
93VNET_DEFINE(uint32_t, newreno_beta) = 50; 94#define V_newreno_beta VNET(newreno_beta) 95 |
|
89/* 90 * Sysctl handler to show and change the default CC algorithm. 91 */ 92static int 93cc_default_algo(SYSCTL_HANDLER_ARGS) 94{ 95 char default_cc[TCP_CA_NAME_MAX]; 96 struct cc_algo *funcs; 97 int error; 98 99 /* Get the current default: */ 100 CC_LIST_RLOCK(); | 96/* 97 * Sysctl handler to show and change the default CC algorithm. 98 */ 99static int 100cc_default_algo(SYSCTL_HANDLER_ARGS) 101{ 102 char default_cc[TCP_CA_NAME_MAX]; 103 struct cc_algo *funcs; 104 int error; 105 106 /* Get the current default: */ 107 CC_LIST_RLOCK(); |
101 strlcpy(default_cc, CC_DEFAULT()->name, sizeof(default_cc)); | 108 if (CC_DEFAULT_ALGO() != NULL) 109 strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc)); 110 else 111 memset(default_cc, 0, TCP_CA_NAME_MAX); |
102 CC_LIST_RUNLOCK(); 103 104 error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req); 105 106 /* Check for error or no change */ 107 if (error != 0 || req->newptr == NULL) 108 goto done; 109 110 error = ESRCH; | 112 CC_LIST_RUNLOCK(); 113 114 error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req); 115 116 /* Check for error or no change */ 117 if (error != 0 || req->newptr == NULL) 118 goto done; 119 120 error = ESRCH; |
111 | |
112 /* Find algo with specified name and set it to default. */ 113 CC_LIST_RLOCK(); 114 STAILQ_FOREACH(funcs, &cc_list, entries) { 115 if (strncmp(default_cc, funcs->name, sizeof(default_cc))) 116 continue; 117 V_default_cc_ptr = funcs; 118 error = 0; 119 break; --- 16 unchanged lines hidden (view full) --- 136 err = nalgos = 0; 137 first = 1; 138 139 CC_LIST_RLOCK(); 140 STAILQ_FOREACH(algo, &cc_list, entries) { 141 nalgos++; 142 } 143 CC_LIST_RUNLOCK(); | 121 /* Find algo with specified name and set it to default. */ 122 CC_LIST_RLOCK(); 123 STAILQ_FOREACH(funcs, &cc_list, entries) { 124 if (strncmp(default_cc, funcs->name, sizeof(default_cc))) 125 continue; 126 V_default_cc_ptr = funcs; 127 error = 0; 128 break; --- 16 unchanged lines hidden (view full) --- 145 err = nalgos = 0; 146 first = 1; 147 148 CC_LIST_RLOCK(); 149 STAILQ_FOREACH(algo, &cc_list, entries) { 150 nalgos++; 151 } 152 CC_LIST_RUNLOCK(); |
144 | 153 if (nalgos == 0) { 154 return (ENOENT); 155 } |
145 s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN); 146 147 if (s == NULL) 148 return (ENOMEM); 149 150 /* 151 * It is theoretically possible for the CC list to have grown in size 152 * since the call to sbuf_new() and therefore for the sbuf to be too --- 18 unchanged lines hidden (view full) --- 171 err = sysctl_handle_string(oidp, sbuf_data(s), 0, req); 172 } 173 174 sbuf_delete(s); 175 return (err); 176} 177 178/* | 156 s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN); 157 158 if (s == NULL) 159 return (ENOMEM); 160 161 /* 162 * It is theoretically possible for the CC list to have grown in size 163 * since the call to sbuf_new() and therefore for the sbuf to be too --- 18 unchanged lines hidden (view full) --- 182 err = sysctl_handle_string(oidp, sbuf_data(s), 0, req); 183 } 184 185 sbuf_delete(s); 186 return (err); 187} 188 189/* |
179 * Reset the default CC algo to NewReno for any netstack which is using the algo 180 * that is about to go away as its default. | 190 * Return the number of vnets in which the proposed remove_cc is 191 * being used as the default. |
181 */ | 192 */ |
182static void 183cc_checkreset_default(struct cc_algo *remove_cc) | 193static int 194cc_check_default(struct cc_algo *remove_cc) |
184{ | 195{ |
196 int cnt = 0; |
|
185 VNET_ITERATOR_DECL(vnet_iter); 186 187 CC_LIST_LOCK_ASSERT(); 188 189 VNET_LIST_RLOCK_NOSLEEP(); 190 VNET_FOREACH(vnet_iter) { 191 CURVNET_SET(vnet_iter); | 197 VNET_ITERATOR_DECL(vnet_iter); 198 199 CC_LIST_LOCK_ASSERT(); 200 201 VNET_LIST_RLOCK_NOSLEEP(); 202 VNET_FOREACH(vnet_iter) { 203 CURVNET_SET(vnet_iter); |
192 if (strncmp(CC_DEFAULT()->name, remove_cc->name, 193 TCP_CA_NAME_MAX) == 0) 194 V_default_cc_ptr = &newreno_cc_algo; | 204 if ((CC_DEFAULT_ALGO() != NULL) && 205 strncmp(CC_DEFAULT_ALGO()->name, 206 remove_cc->name, 207 TCP_CA_NAME_MAX) == 0) { 208 cnt++; 209 } |
195 CURVNET_RESTORE(); 196 } 197 VNET_LIST_RUNLOCK_NOSLEEP(); | 210 CURVNET_RESTORE(); 211 } 212 VNET_LIST_RUNLOCK_NOSLEEP(); |
213 return (cnt); |
|
198} 199 200/* 201 * Initialise CC subsystem on system boot. 202 */ 203static void 204cc_init(void) 205{ --- 7 unchanged lines hidden (view full) --- 213int 214cc_deregister_algo(struct cc_algo *remove_cc) 215{ 216 struct cc_algo *funcs, *tmpfuncs; 217 int err; 218 219 err = ENOENT; 220 | 214} 215 216/* 217 * Initialise CC subsystem on system boot. 218 */ 219static void 220cc_init(void) 221{ --- 7 unchanged lines hidden (view full) --- 229int 230cc_deregister_algo(struct cc_algo *remove_cc) 231{ 232 struct cc_algo *funcs, *tmpfuncs; 233 int err; 234 235 err = ENOENT; 236 |
221 /* Never allow newreno to be deregistered. */ 222 if (&newreno_cc_algo == remove_cc) 223 return (EPERM); 224 | |
225 /* Remove algo from cc_list so that new connections can't use it. */ 226 CC_LIST_WLOCK(); 227 STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { 228 if (funcs == remove_cc) { | 237 /* Remove algo from cc_list so that new connections can't use it. */ 238 CC_LIST_WLOCK(); 239 STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { 240 if (funcs == remove_cc) { |
229 cc_checkreset_default(remove_cc); 230 STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); 231 err = 0; | 241 if (cc_check_default(remove_cc)) { 242 err = EBUSY; 243 break; 244 } 245 /* Add a temp flag to stop new adds to it */ 246 funcs->flags |= CC_MODULE_BEING_REMOVED; |
232 break; 233 } 234 } 235 CC_LIST_WUNLOCK(); | 247 break; 248 } 249 } 250 CC_LIST_WUNLOCK(); |
236 237 if (!err) 238 /* 239 * XXXLAS: 240 * - We may need to handle non-zero return values in future. 241 * - If we add CC framework support for protocols other than 242 * TCP, we may want a more generic way to handle this step. 243 */ 244 tcp_ccalgounload(remove_cc); 245 | 251 err = tcp_ccalgounload(remove_cc); 252 /* 253 * Now go back through the list and either remove the temp flag 254 * or pull the registration. NOTE(review): the assignment on line 251 is unconditional, so the ENOENT/EBUSY result of the first pass is discarded -- confirm this is intended. 255 */ 256 CC_LIST_WLOCK(); 257 STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { 258 if (funcs == remove_cc) { 259 if (err == 0) 260 STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); 261 else 262 funcs->flags &= ~CC_MODULE_BEING_REMOVED; 263 break; 264 } 265 } 266 CC_LIST_WUNLOCK(); |
246 return (err); 247} 248 249/* 250 * Returns 0 on success, non-zero on failure. 251 */ 252int 253cc_register_algo(struct cc_algo *add_cc) --- 4 unchanged lines hidden (view full) --- 258 err = 0; 259 260 /* 261 * Iterate over list of registered CC algorithms and make sure 262 * we're not trying to add a duplicate. 263 */ 264 CC_LIST_WLOCK(); 265 STAILQ_FOREACH(funcs, &cc_list, entries) { | 267 return (err); 268} 269 270/* 271 * Returns 0 on success, non-zero on failure. 272 */ 273int 274cc_register_algo(struct cc_algo *add_cc) --- 4 unchanged lines hidden (view full) --- 279 err = 0; 280 281 /* 282 * Iterate over list of registered CC algorithms and make sure 283 * we're not trying to add a duplicate. 284 */ 285 CC_LIST_WLOCK(); 286 STAILQ_FOREACH(funcs, &cc_list, entries) { |
266 if (funcs == add_cc || strncmp(funcs->name, add_cc->name, 267 TCP_CA_NAME_MAX) == 0) | 287 if (funcs == add_cc || 288 strncmp(funcs->name, add_cc->name, 289 TCP_CA_NAME_MAX) == 0) { |
268 err = EEXIST; | 290 err = EEXIST; |
291 break; 292 } |
|
269 } | 293 } |
270 271 if (!err) | 294 /* 295 * The first loaded congestion control module will become 296 * the default until we find the "CC_DEFAULT" defined in 297 * the config (if we do). 298 */ 299 if (!err) { |
272 STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); | 300 STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); |
273 | 301 if (strcmp(add_cc->name, CC_DEFAULT) == 0) { 302 V_default_cc_ptr = add_cc; 303 } else if (V_default_cc_ptr == NULL) { 304 V_default_cc_ptr = add_cc; 305 } 306 } |
274 CC_LIST_WUNLOCK(); 275 276 return (err); 277} 278 279/* | 307 CC_LIST_WUNLOCK(); 308 309 return (err); 310} 311 312/* |
313 * Perform any necessary tasks before we exit congestion recovery. 314 */ 315void 316newreno_cc_post_recovery(struct cc_var *ccv) 317{ 318 int pipe; 319 320 if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { 321 /* 322 * Fast recovery will conclude after returning from this 323 * function. Window inflation should have left us with 324 * approximately snd_ssthresh outstanding data. But in case we 325 * would be inclined to send a burst, better to do it via the 326 * slow start mechanism. 327 * 328 * XXXLAS: Find a way to do this without needing curack 329 */ 330 if (V_tcp_do_newsack) 331 pipe = tcp_compute_pipe(ccv->ccvc.tcp); 332 else 333 pipe = CCV(ccv, snd_max) - ccv->curack; 334 if (pipe < CCV(ccv, snd_ssthresh)) 335 /* 336 * Ensure that cwnd does not collapse to 1 MSS under 337 * adverse conditions. Implements RFC6582 338 */ 339 CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + 340 CCV(ccv, t_maxseg); 341 else 342 CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); 343 } 344} 345 346void 347newreno_cc_after_idle(struct cc_var *ccv) 348{ 349 uint32_t rw; 350 /* 351 * If we've been idle for more than one retransmit timeout the old 352 * congestion window is no longer current and we have to reduce it to 353 * the restart window before we can transmit again. 354 * 355 * The restart window is the initial window or the last CWND, whichever 356 * is smaller. 357 * 358 * This is done to prevent us from flooding the path with a full CWND at 359 * wirespeed, overloading router and switch buffers along the way. 360 * 361 * See RFC5681 Section 4.1. "Restarting Idle Connections". 362 * 363 * In addition, per RFC2861 Section 2, the ssthresh is set to the 364 * maximum of the former ssthresh or 3/4 of the old cwnd, to 365 * not exit slow-start prematurely. 
366 */ 367 rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp)); 368 369 CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), 370 CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2)); 371 372 CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); 373} 374 375/* 376 * Perform any necessary tasks before we enter congestion recovery. 377 */ 378void 379newreno_cc_cong_signal(struct cc_var *ccv, uint32_t type) 380{ 381 uint32_t cwin, factor; 382 u_int mss; 383 384 cwin = CCV(ccv, snd_cwnd); 385 mss = tcp_fixed_maxseg(ccv->ccvc.tcp); 386 /* 387 * Other TCP congestion controls use newreno_cong_signal(), but 388 * with their own private cc_data. Make sure the cc_data is used 389 * correctly. 390 */ 391 factor = V_newreno_beta; 392 393 /* Catch algos which mistakenly leak private signal types. */ 394 KASSERT((type & CC_SIGPRIVMASK) == 0, 395 ("%s: congestion signal type 0x%08x is private\n", __func__, type)); 396 397 cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 398 2) * mss; 399 400 switch (type) { 401 case CC_NDUPACK: 402 if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { 403 if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) 404 CCV(ccv, snd_ssthresh) = cwin; 405 ENTER_RECOVERY(CCV(ccv, t_flags)); 406 } 407 break; 408 case CC_ECN: 409 if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { 410 CCV(ccv, snd_ssthresh) = cwin; 411 CCV(ccv, snd_cwnd) = cwin; 412 ENTER_CONGRECOVERY(CCV(ccv, t_flags)); 413 } 414 break; 415 case CC_RTO: 416 CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd), 417 CCV(ccv, snd_cwnd)) / 2 / mss, 418 2) * mss; 419 CCV(ccv, snd_cwnd) = mss; 420 break; 421 } 422} 423 424void 425newreno_cc_ack_received(struct cc_var *ccv, uint16_t type) 426{ 427 if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && 428 (ccv->flags & CCF_CWND_LIMITED)) { 429 u_int cw = CCV(ccv, snd_cwnd); 430 u_int incr = CCV(ccv, t_maxseg); 431 432 /* 433 * Regular in-order ACK, open the congestion window. 
434 * Method depends on which congestion control state we're 435 * in (slow start or cong avoid) and if ABC (RFC 3465) is 436 * enabled. 437 * 438 * slow start: cwnd <= ssthresh 439 * cong avoid: cwnd > ssthresh 440 * 441 * slow start and ABC (RFC 3465): 442 * Grow cwnd exponentially by the amount of data 443 * ACKed capping the max increment per ACK to 444 * (abc_l_var * maxseg) bytes. 445 * 446 * slow start without ABC (RFC 5681): 447 * Grow cwnd exponentially by maxseg per ACK. 448 * 449 * cong avoid and ABC (RFC 3465): 450 * Grow cwnd linearly by maxseg per RTT for each 451 * cwnd worth of ACKed data. 452 * 453 * cong avoid without ABC (RFC 5681): 454 * Grow cwnd linearly by approximately maxseg per RTT using 455 * maxseg^2 / cwnd per ACK as the increment. 456 * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to 457 * avoid capping cwnd. 458 */ 459 if (cw > CCV(ccv, snd_ssthresh)) { 460 if (V_tcp_do_rfc3465) { 461 if (ccv->flags & CCF_ABC_SENTAWND) 462 ccv->flags &= ~CCF_ABC_SENTAWND; 463 else 464 incr = 0; 465 } else 466 incr = max((incr * incr / cw), 1); 467 } else if (V_tcp_do_rfc3465) { 468 /* 469 * In slow-start with ABC enabled and no RTO in sight? 470 * (Must not use abc_l_var > 1 if slow starting after 471 * an RTO. On RTO, snd_nxt = snd_una, so the 472 * snd_nxt == snd_max check is sufficient to 473 * handle this). 474 * 475 * XXXLAS: Find a way to signal SS after RTO that 476 * doesn't rely on tcpcb vars. 477 */ 478 uint16_t abc_val; 479 480 if (ccv->flags & CCF_USE_LOCAL_ABC) 481 abc_val = ccv->labc; 482 else 483 abc_val = V_tcp_abc_l_var; 484 if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) 485 incr = min(ccv->bytes_this_ack, 486 ccv->nsegs * abc_val * 487 CCV(ccv, t_maxseg)); 488 else 489 incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); 490 491 } 492 /* ABC is on by default, so incr equals 0 frequently. */ 493 if (incr > 0) 494 CCV(ccv, snd_cwnd) = min(cw + incr, 495 TCP_MAXWIN << CCV(ccv, snd_scale)); 496 } 497} 498 499/* |
|
280 * Handles kld related events. Returns 0 on success, non-zero on failure. 281 */ 282int 283cc_modevent(module_t mod, int event_type, void *data) 284{ 285 struct cc_algo *algo; 286 int err; 287 288 err = 0; 289 algo = (struct cc_algo *)data; 290 291 switch(event_type) { 292 case MOD_LOAD: | 500 * Handles kld related events. Returns 0 on success, non-zero on failure. 501 */ 502int 503cc_modevent(module_t mod, int event_type, void *data) 504{ 505 struct cc_algo *algo; 506 int err; 507 508 err = 0; 509 algo = (struct cc_algo *)data; 510 511 switch(event_type) { 512 case MOD_LOAD: |
513 if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) { 514 /* 515 * A module must have a cc_data_sz function; 516 * even if it has no data, it should return 0. 517 */ 518 printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n"); 519 err = EINVAL; 520 break; 521 } |
|
293 if (algo->mod_init != NULL) 294 err = algo->mod_init(); 295 if (!err) 296 err = cc_register_algo(algo); 297 break; 298 299 case MOD_QUIESCE: 300 case MOD_SHUTDOWN: --- 42 unchanged lines hidden --- | 522 if (algo->mod_init != NULL) 523 err = algo->mod_init(); 524 if (!err) 525 err = cc_register_algo(algo); 526 break; 527 528 case MOD_QUIESCE: 529 case MOD_SHUTDOWN: --- 42 unchanged lines hidden --- |