1dbc42409SLawrence Stewart /*- 2fe267a55SPedro F. Giffuni * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3fe267a55SPedro F. Giffuni * 4dbc42409SLawrence Stewart * Copyright (c) 2007-2008 5dbc42409SLawrence Stewart * Swinburne University of Technology, Melbourne, Australia. 6dbc42409SLawrence Stewart * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 7dbc42409SLawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation 8dbc42409SLawrence Stewart * All rights reserved. 9dbc42409SLawrence Stewart * 10dbc42409SLawrence Stewart * This software was developed at the Centre for Advanced Internet 11891b8ed4SLawrence Stewart * Architectures, Swinburne University of Technology, by Lawrence Stewart and 12891b8ed4SLawrence Stewart * James Healy, made possible in part by a grant from the Cisco University 13891b8ed4SLawrence Stewart * Research Program Fund at Community Foundation Silicon Valley. 14dbc42409SLawrence Stewart * 15dbc42409SLawrence Stewart * Portions of this software were developed at the Centre for Advanced 16dbc42409SLawrence Stewart * Internet Architectures, Swinburne University of Technology, Melbourne, 17dbc42409SLawrence Stewart * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 18dbc42409SLawrence Stewart * 19dbc42409SLawrence Stewart * Redistribution and use in source and binary forms, with or without 20dbc42409SLawrence Stewart * modification, are permitted provided that the following conditions 21dbc42409SLawrence Stewart * are met: 22dbc42409SLawrence Stewart * 1. Redistributions of source code must retain the above copyright 23dbc42409SLawrence Stewart * notice, this list of conditions and the following disclaimer. 24dbc42409SLawrence Stewart * 2. Redistributions in binary form must reproduce the above copyright 25dbc42409SLawrence Stewart * notice, this list of conditions and the following disclaimer in the 26dbc42409SLawrence Stewart * documentation and/or other materials provided with the distribution. 27dbc42409SLawrence Stewart * 28dbc42409SLawrence Stewart * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 29dbc42409SLawrence Stewart * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30dbc42409SLawrence Stewart * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31dbc42409SLawrence Stewart * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 32dbc42409SLawrence Stewart * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33dbc42409SLawrence Stewart * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34dbc42409SLawrence Stewart * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35dbc42409SLawrence Stewart * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36dbc42409SLawrence Stewart * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37dbc42409SLawrence Stewart * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38dbc42409SLawrence Stewart * SUCH DAMAGE. 39dbc42409SLawrence Stewart */ 40dbc42409SLawrence Stewart 41dbc42409SLawrence Stewart /* 42dbc42409SLawrence Stewart * This software was first released in 2007 by James Healy and Lawrence Stewart 43891b8ed4SLawrence Stewart * whilst working on the NewTCP research project at Swinburne University of 44891b8ed4SLawrence Stewart * Technology's Centre for Advanced Internet Architectures, Melbourne, 45891b8ed4SLawrence Stewart * Australia, which was made possible in part by a grant from the Cisco 46891b8ed4SLawrence Stewart * University Research Program Fund at Community Foundation Silicon Valley. 47891b8ed4SLawrence Stewart * More details are available at: 48dbc42409SLawrence Stewart * http://caia.swin.edu.au/urp/newtcp/ 49dbc42409SLawrence Stewart */ 50dbc42409SLawrence Stewart 51dbc42409SLawrence Stewart #include <sys/cdefs.h> 52dbc42409SLawrence Stewart __FBSDID("$FreeBSD$"); 53b8d60729SRandall Stewart #include <opt_cc.h> 54dbc42409SLawrence Stewart #include <sys/param.h> 55dbc42409SLawrence Stewart #include <sys/kernel.h> 56dbc42409SLawrence Stewart #include <sys/libkern.h> 57dbc42409SLawrence Stewart #include <sys/lock.h> 58dbc42409SLawrence Stewart #include <sys/malloc.h> 59dbc42409SLawrence Stewart #include <sys/module.h> 60dbc42409SLawrence Stewart #include <sys/mutex.h> 61dbc42409SLawrence Stewart #include <sys/queue.h> 62dbc42409SLawrence Stewart #include <sys/rwlock.h> 63dbc42409SLawrence Stewart #include <sys/sbuf.h> 64dbc42409SLawrence Stewart #include <sys/socket.h> 65dbc42409SLawrence Stewart #include <sys/socketvar.h> 66dbc42409SLawrence Stewart #include <sys/sysctl.h> 67dbc42409SLawrence Stewart 68b66d74c1SGleb Smirnoff #include <net/vnet.h> 69dbc42409SLawrence Stewart 70dbc42409SLawrence Stewart #include <netinet/in.h> 71dbc42409SLawrence Stewart #include <netinet/in_pcb.h> 722de3e790SGleb Smirnoff #include <netinet/tcp.h> 73b8d60729SRandall Stewart #include <netinet/tcp_seq.h> 74dbc42409SLawrence Stewart #include <netinet/tcp_var.h> 75b8d60729SRandall Stewart #include <netinet/tcp_log_buf.h> 76b8d60729SRandall Stewart #include <netinet/tcp_hpts.h> 774644fda3SGleb Smirnoff #include <netinet/cc/cc.h> 78dbc42409SLawrence Stewart #include <netinet/cc/cc_module.h> 79dbc42409SLawrence Stewart 807e3c9ec9SWarner Losh /* 817e3c9ec9SWarner Losh * Have a sane default if no CC_DEFAULT is specified in the kernel config file. 827e3c9ec9SWarner Losh */ 837e3c9ec9SWarner Losh #ifndef CC_DEFAULT 847e3c9ec9SWarner Losh #define CC_DEFAULT "newreno" 857e3c9ec9SWarner Losh #endif 867e3c9ec9SWarner Losh 87a9696510SRandall Stewart uint32_t hystart_minrtt_thresh = 4000; 88a9696510SRandall Stewart uint32_t hystart_maxrtt_thresh = 16000; 89a9696510SRandall Stewart uint32_t hystart_n_rttsamples = 8; 90a9696510SRandall Stewart uint32_t hystart_css_growth_div = 4; 91a9696510SRandall Stewart uint32_t hystart_css_rounds = 5; 92a9696510SRandall Stewart uint32_t hystart_bblogs = 0; 93a9696510SRandall Stewart 94b8d60729SRandall Stewart MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory"); 95b8d60729SRandall Stewart 96dbc42409SLawrence Stewart /* 97dbc42409SLawrence Stewart * List of available cc algorithms on the current system. First element 98dbc42409SLawrence Stewart * is used as the system default CC algorithm. 99dbc42409SLawrence Stewart */ 100dbc42409SLawrence Stewart struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); 101dbc42409SLawrence Stewart 102dbc42409SLawrence Stewart /* Protects the cc_list TAILQ. */ 103dbc42409SLawrence Stewart struct rwlock cc_list_lock; 104dbc42409SLawrence Stewart 105b8d60729SRandall Stewart VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL; 106b8d60729SRandall Stewart 107b8d60729SRandall Stewart VNET_DEFINE(uint32_t, newreno_beta) = 50; 108b8d60729SRandall Stewart #define V_newreno_beta VNET(newreno_beta) 109dbc42409SLawrence Stewart 110ea9017fbSRandall Stewart void 111ea9017fbSRandall Stewart cc_refer(struct cc_algo *algo) 112ea9017fbSRandall Stewart { 113ea9017fbSRandall Stewart CC_LIST_LOCK_ASSERT(); 114ea9017fbSRandall Stewart refcount_acquire(&algo->cc_refcount); 115ea9017fbSRandall Stewart } 116ea9017fbSRandall Stewart 117ea9017fbSRandall Stewart void 118ea9017fbSRandall Stewart cc_release(struct cc_algo *algo) 119ea9017fbSRandall Stewart { 120ea9017fbSRandall Stewart CC_LIST_LOCK_ASSERT(); 121ea9017fbSRandall Stewart refcount_release(&algo->cc_refcount); 122ea9017fbSRandall Stewart } 123ea9017fbSRandall Stewart 124ea9017fbSRandall Stewart 125ea9017fbSRandall Stewart void 126ea9017fbSRandall Stewart cc_attach(struct tcpcb *tp, struct cc_algo *algo) 127ea9017fbSRandall Stewart { 128ea9017fbSRandall Stewart /* 129ea9017fbSRandall Stewart * Attach the tcpcb to the algorithm. 130ea9017fbSRandall Stewart */ 131ea9017fbSRandall Stewart CC_LIST_RLOCK(); 132ea9017fbSRandall Stewart CC_ALGO(tp) = algo; 133ea9017fbSRandall Stewart cc_refer(algo); 134ea9017fbSRandall Stewart CC_LIST_RUNLOCK(); 135ea9017fbSRandall Stewart } 136ea9017fbSRandall Stewart 137ea9017fbSRandall Stewart void 138ea9017fbSRandall Stewart cc_detach(struct tcpcb *tp) 139ea9017fbSRandall Stewart { 140ea9017fbSRandall Stewart struct cc_algo *algo; 141ea9017fbSRandall Stewart 142ea9017fbSRandall Stewart CC_LIST_RLOCK(); 143ea9017fbSRandall Stewart algo = CC_ALGO(tp); 144ea9017fbSRandall Stewart CC_ALGO(tp) = NULL; 145ea9017fbSRandall Stewart cc_release(algo); 146ea9017fbSRandall Stewart CC_LIST_RUNLOCK(); 147ea9017fbSRandall Stewart } 148ea9017fbSRandall Stewart 149dbc42409SLawrence Stewart /* 150dbc42409SLawrence Stewart * Sysctl handler to show and change the default CC algorithm. 151dbc42409SLawrence Stewart */ 152dbc42409SLawrence Stewart static int 153dbc42409SLawrence Stewart cc_default_algo(SYSCTL_HANDLER_ARGS) 154dbc42409SLawrence Stewart { 155ebf92e86SLawrence Stewart char default_cc[TCP_CA_NAME_MAX]; 156dbc42409SLawrence Stewart struct cc_algo *funcs; 1570e1152fcSHans Petter Selasky int error; 158dbc42409SLawrence Stewart 1590e1152fcSHans Petter Selasky /* Get the current default: */ 160dbc42409SLawrence Stewart CC_LIST_RLOCK(); 161b8d60729SRandall Stewart if (CC_DEFAULT_ALGO() != NULL) 162b8d60729SRandall Stewart strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc)); 163b8d60729SRandall Stewart else 164b8d60729SRandall Stewart memset(default_cc, 0, TCP_CA_NAME_MAX); 165dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 1660e1152fcSHans Petter Selasky 1670e1152fcSHans Petter Selasky error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req); 1680e1152fcSHans Petter Selasky 1690e1152fcSHans Petter Selasky /* Check for error or no change */ 1700e1152fcSHans Petter Selasky if (error != 0 || req->newptr == NULL) 1710e1152fcSHans Petter Selasky goto done; 1720e1152fcSHans Petter Selasky 1730e1152fcSHans Petter Selasky error = ESRCH; 174dbc42409SLawrence Stewart /* Find algo with specified name and set it to default. */ 17578b01840SLawrence Stewart CC_LIST_RLOCK(); 176dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 1770e1152fcSHans Petter Selasky if (strncmp(default_cc, funcs->name, sizeof(default_cc))) 17860a945f9SHans Petter Selasky continue; 179ea9017fbSRandall Stewart if (funcs->flags & CC_MODULE_BEING_REMOVED) { 180ea9017fbSRandall Stewart /* Its being removed, its not eligible */ 181ea9017fbSRandall Stewart continue; 182ea9017fbSRandall Stewart } 18378b01840SLawrence Stewart V_default_cc_ptr = funcs; 1840e1152fcSHans Petter Selasky error = 0; 1850e1152fcSHans Petter Selasky break; 186dbc42409SLawrence Stewart } 18778b01840SLawrence Stewart CC_LIST_RUNLOCK(); 1880e1152fcSHans Petter Selasky done: 1890e1152fcSHans Petter Selasky return (error); 190dbc42409SLawrence Stewart } 191dbc42409SLawrence Stewart 192dbc42409SLawrence Stewart /* 193dbc42409SLawrence Stewart * Sysctl handler to display the list of available CC algorithms. 194dbc42409SLawrence Stewart */ 195dbc42409SLawrence Stewart static int 196dbc42409SLawrence Stewart cc_list_available(SYSCTL_HANDLER_ARGS) 197dbc42409SLawrence Stewart { 198dbc42409SLawrence Stewart struct cc_algo *algo; 199ea9017fbSRandall Stewart int error, nalgos; 200ea9017fbSRandall Stewart int linesz; 201ea9017fbSRandall Stewart char *buffer, *cp; 202ea9017fbSRandall Stewart size_t bufsz, outsz; 203dbc42409SLawrence Stewart 204ea9017fbSRandall Stewart error = nalgos = 0; 205a66ac850SLawrence Stewart CC_LIST_RLOCK(); 206a66ac850SLawrence Stewart STAILQ_FOREACH(algo, &cc_list, entries) { 207a66ac850SLawrence Stewart nalgos++; 208a66ac850SLawrence Stewart } 209a66ac850SLawrence Stewart CC_LIST_RUNLOCK(); 210b8d60729SRandall Stewart if (nalgos == 0) { 211b8d60729SRandall Stewart return (ENOENT); 212b8d60729SRandall Stewart } 213ea9017fbSRandall Stewart bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1); 214ea9017fbSRandall Stewart buffer = malloc(bufsz, M_TEMP, M_WAITOK); 215ea9017fbSRandall Stewart cp = buffer; 216dbc42409SLawrence Stewart 217ea9017fbSRandall Stewart linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D', 218ea9017fbSRandall Stewart "PCB count"); 219ea9017fbSRandall Stewart cp += linesz; 220ea9017fbSRandall Stewart bufsz -= linesz; 221ea9017fbSRandall Stewart outsz = linesz; 222dbc42409SLawrence Stewart CC_LIST_RLOCK(); 223dbc42409SLawrence Stewart STAILQ_FOREACH(algo, &cc_list, entries) { 224ea9017fbSRandall Stewart linesz = snprintf(cp, bufsz, "%-16s%c %u\n", 225ea9017fbSRandall Stewart algo->name, 226ea9017fbSRandall Stewart (algo == CC_DEFAULT_ALGO()) ? '*' : ' ', 227ea9017fbSRandall Stewart algo->cc_refcount); 228ea9017fbSRandall Stewart if (linesz >= bufsz) { 229ea9017fbSRandall Stewart error = EOVERFLOW; 230dbc42409SLawrence Stewart break; 231a66ac850SLawrence Stewart } 232ea9017fbSRandall Stewart cp += linesz; 233ea9017fbSRandall Stewart bufsz -= linesz; 234ea9017fbSRandall Stewart outsz += linesz; 235dbc42409SLawrence Stewart } 236dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 237ea9017fbSRandall Stewart if (error == 0) 238ea9017fbSRandall Stewart error = sysctl_handle_string(oidp, buffer, outsz + 1, req); 239ea9017fbSRandall Stewart free(buffer, M_TEMP); 240ea9017fbSRandall Stewart return (error); 241dbc42409SLawrence Stewart } 242dbc42409SLawrence Stewart 243dbc42409SLawrence Stewart /* 244b8d60729SRandall Stewart * Return the number of times a proposed removal_cc is 245b8d60729SRandall Stewart * being used as the default. 24678b01840SLawrence Stewart */ 247b8d60729SRandall Stewart static int 248b8d60729SRandall Stewart cc_check_default(struct cc_algo *remove_cc) 24978b01840SLawrence Stewart { 250b8d60729SRandall Stewart int cnt = 0; 25178b01840SLawrence Stewart VNET_ITERATOR_DECL(vnet_iter); 25278b01840SLawrence Stewart 25378b01840SLawrence Stewart CC_LIST_LOCK_ASSERT(); 25478b01840SLawrence Stewart 25578b01840SLawrence Stewart VNET_LIST_RLOCK_NOSLEEP(); 25678b01840SLawrence Stewart VNET_FOREACH(vnet_iter) { 25778b01840SLawrence Stewart CURVNET_SET(vnet_iter); 258b8d60729SRandall Stewart if ((CC_DEFAULT_ALGO() != NULL) && 259b8d60729SRandall Stewart strncmp(CC_DEFAULT_ALGO()->name, 260b8d60729SRandall Stewart remove_cc->name, 261b8d60729SRandall Stewart TCP_CA_NAME_MAX) == 0) { 262b8d60729SRandall Stewart cnt++; 263b8d60729SRandall Stewart } 26478b01840SLawrence Stewart CURVNET_RESTORE(); 26578b01840SLawrence Stewart } 26678b01840SLawrence Stewart VNET_LIST_RUNLOCK_NOSLEEP(); 267b8d60729SRandall Stewart return (cnt); 26878b01840SLawrence Stewart } 26978b01840SLawrence Stewart 27078b01840SLawrence Stewart /* 271dbc42409SLawrence Stewart * Initialise CC subsystem on system boot. 272dbc42409SLawrence Stewart */ 27314f57a8bSLawrence Stewart static void 27414f57a8bSLawrence Stewart cc_init(void) 275dbc42409SLawrence Stewart { 276dbc42409SLawrence Stewart CC_LIST_LOCK_INIT(); 277dbc42409SLawrence Stewart STAILQ_INIT(&cc_list); 278dbc42409SLawrence Stewart } 279dbc42409SLawrence Stewart 280dbc42409SLawrence Stewart /* 281dbc42409SLawrence Stewart * Returns non-zero on success, 0 on failure. 282dbc42409SLawrence Stewart */ 283*ccdfd621SMichael Tuexen static int 284*ccdfd621SMichael Tuexen cc_deregister_algo_locked(struct cc_algo *remove_cc) 285dbc42409SLawrence Stewart { 286ea9017fbSRandall Stewart struct cc_algo *funcs; 287ea9017fbSRandall Stewart int found = 0; 288dbc42409SLawrence Stewart 289ea9017fbSRandall Stewart /* This is unlikely to fail */ 290ea9017fbSRandall Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 291ea9017fbSRandall Stewart if (funcs == remove_cc) 292ea9017fbSRandall Stewart found = 1; 293ea9017fbSRandall Stewart } 294ea9017fbSRandall Stewart if (found == 0) { 295ea9017fbSRandall Stewart /* Nothing to remove? */ 296ea9017fbSRandall Stewart return (ENOENT); 297ea9017fbSRandall Stewart } 298ea9017fbSRandall Stewart /* We assert it should have been MOD_QUIESCE'd */ 299ea9017fbSRandall Stewart KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED), 300ea9017fbSRandall Stewart ("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc)); 301b8d60729SRandall Stewart if (cc_check_default(remove_cc)) { 302db0ac6deSCy Schubert return(EBUSY); 303b8d60729SRandall Stewart } 304ea9017fbSRandall Stewart if (remove_cc->cc_refcount != 0) { 305ea9017fbSRandall Stewart return (EBUSY); 306b8d60729SRandall Stewart } 307*ccdfd621SMichael Tuexen /* Remove algo from cc_list so that new connections can't use it. */ 308ea9017fbSRandall Stewart STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries); 309d4290f7eSMichael Tuexen return (0); 310b1fe92b2SMichael Tuexen } 311b1fe92b2SMichael Tuexen 312b1fe92b2SMichael Tuexen /* 313*ccdfd621SMichael Tuexen * Returns non-zero on success, 0 on failure. 314*ccdfd621SMichael Tuexen */ 315*ccdfd621SMichael Tuexen int 316*ccdfd621SMichael Tuexen cc_deregister_algo(struct cc_algo *remove_cc) 317*ccdfd621SMichael Tuexen { 318*ccdfd621SMichael Tuexen int ret; 319*ccdfd621SMichael Tuexen 320*ccdfd621SMichael Tuexen CC_LIST_WLOCK(); 321*ccdfd621SMichael Tuexen ret = cc_deregister_algo_locked(remove_cc); 322*ccdfd621SMichael Tuexen CC_LIST_WUNLOCK(); 323*ccdfd621SMichael Tuexen return (ret); 324*ccdfd621SMichael Tuexen } 325*ccdfd621SMichael Tuexen 326*ccdfd621SMichael Tuexen /* 327dbc42409SLawrence Stewart * Returns 0 on success, non-zero on failure. 328dbc42409SLawrence Stewart */ 329dbc42409SLawrence Stewart int 330dbc42409SLawrence Stewart cc_register_algo(struct cc_algo *add_cc) 331dbc42409SLawrence Stewart { 332dbc42409SLawrence Stewart struct cc_algo *funcs; 333dbc42409SLawrence Stewart int err; 334dbc42409SLawrence Stewart 335dbc42409SLawrence Stewart err = 0; 336dbc42409SLawrence Stewart 337dbc42409SLawrence Stewart /* 338dbc42409SLawrence Stewart * Iterate over list of registered CC algorithms and make sure 339dbc42409SLawrence Stewart * we're not trying to add a duplicate. 340dbc42409SLawrence Stewart */ 341dbc42409SLawrence Stewart CC_LIST_WLOCK(); 342dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 343b8d60729SRandall Stewart if (funcs == add_cc || 344b8d60729SRandall Stewart strncmp(funcs->name, add_cc->name, 345b8d60729SRandall Stewart TCP_CA_NAME_MAX) == 0) { 346dbc42409SLawrence Stewart err = EEXIST; 347b8d60729SRandall Stewart break; 348dbc42409SLawrence Stewart } 349b8d60729SRandall Stewart } 350ea9017fbSRandall Stewart /* Init its reference count */ 351ea9017fbSRandall Stewart if (err == 0) 352ea9017fbSRandall Stewart refcount_init(&add_cc->cc_refcount, 0); 353b8d60729SRandall Stewart /* 354b8d60729SRandall Stewart * The first loaded congestion control module will become 355b8d60729SRandall Stewart * the default until we find the "CC_DEFAULT" defined in 356b8d60729SRandall Stewart * the config (if we do). 357b8d60729SRandall Stewart */ 358b8d60729SRandall Stewart if (!err) { 359dbc42409SLawrence Stewart STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); 360b8d60729SRandall Stewart if (strcmp(add_cc->name, CC_DEFAULT) == 0) { 361b8d60729SRandall Stewart V_default_cc_ptr = add_cc; 362b8d60729SRandall Stewart } else if (V_default_cc_ptr == NULL) { 363b8d60729SRandall Stewart V_default_cc_ptr = add_cc; 364b8d60729SRandall Stewart } 365b8d60729SRandall Stewart } 366dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 367dbc42409SLawrence Stewart 368dbc42409SLawrence Stewart return (err); 369dbc42409SLawrence Stewart } 370dbc42409SLawrence Stewart 371034a9240SMark Johnston static void 372034a9240SMark Johnston vnet_cc_sysinit(void *arg) 373034a9240SMark Johnston { 374034a9240SMark Johnston struct cc_algo *cc; 375034a9240SMark Johnston 376034a9240SMark Johnston if (IS_DEFAULT_VNET(curvnet)) 377034a9240SMark Johnston return; 378034a9240SMark Johnston 379034a9240SMark Johnston CURVNET_SET(vnet0); 380034a9240SMark Johnston cc = V_default_cc_ptr; 381034a9240SMark Johnston CURVNET_RESTORE(); 382034a9240SMark Johnston 383034a9240SMark Johnston V_default_cc_ptr = cc; 384034a9240SMark Johnston } 385034a9240SMark Johnston VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, 386034a9240SMark Johnston vnet_cc_sysinit, NULL); 387034a9240SMark Johnston 388dbc42409SLawrence Stewart /* 389b8d60729SRandall Stewart * Perform any necessary tasks before we exit congestion recovery. 390b8d60729SRandall Stewart */ 391b8d60729SRandall Stewart void 392b8d60729SRandall Stewart newreno_cc_post_recovery(struct cc_var *ccv) 393b8d60729SRandall Stewart { 394b8d60729SRandall Stewart int pipe; 395b8d60729SRandall Stewart 396b8d60729SRandall Stewart if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { 397b8d60729SRandall Stewart /* 398b8d60729SRandall Stewart * Fast recovery will conclude after returning from this 399b8d60729SRandall Stewart * function. Window inflation should have left us with 400b8d60729SRandall Stewart * approximately snd_ssthresh outstanding data. But in case we 401b8d60729SRandall Stewart * would be inclined to send a burst, better to do it via the 402b8d60729SRandall Stewart * slow start mechanism. 403b8d60729SRandall Stewart * 404b8d60729SRandall Stewart * XXXLAS: Find a way to do this without needing curack 405b8d60729SRandall Stewart */ 406b8d60729SRandall Stewart if (V_tcp_do_newsack) 407b8d60729SRandall Stewart pipe = tcp_compute_pipe(ccv->ccvc.tcp); 408b8d60729SRandall Stewart else 409b8d60729SRandall Stewart pipe = CCV(ccv, snd_max) - ccv->curack; 410b8d60729SRandall Stewart if (pipe < CCV(ccv, snd_ssthresh)) 411b8d60729SRandall Stewart /* 412b8d60729SRandall Stewart * Ensure that cwnd does not collapse to 1 MSS under 413b4fbc855SGordon Bergling * adverse conditions. Implements RFC6582 414b8d60729SRandall Stewart */ 415b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + 416b8d60729SRandall Stewart CCV(ccv, t_maxseg); 417b8d60729SRandall Stewart else 418b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); 419b8d60729SRandall Stewart } 420b8d60729SRandall Stewart } 421b8d60729SRandall Stewart 422b8d60729SRandall Stewart void 423b8d60729SRandall Stewart newreno_cc_after_idle(struct cc_var *ccv) 424b8d60729SRandall Stewart { 425b8d60729SRandall Stewart uint32_t rw; 426b8d60729SRandall Stewart /* 427b8d60729SRandall Stewart * If we've been idle for more than one retransmit timeout the old 428b8d60729SRandall Stewart * congestion window is no longer current and we have to reduce it to 429b8d60729SRandall Stewart * the restart window before we can transmit again. 430b8d60729SRandall Stewart * 431b8d60729SRandall Stewart * The restart window is the initial window or the last CWND, whichever 432b8d60729SRandall Stewart * is smaller. 433b8d60729SRandall Stewart * 434b8d60729SRandall Stewart * This is done to prevent us from flooding the path with a full CWND at 435b8d60729SRandall Stewart * wirespeed, overloading router and switch buffers along the way. 436b8d60729SRandall Stewart * 437b8d60729SRandall Stewart * See RFC5681 Section 4.1. "Restarting Idle Connections". 438b8d60729SRandall Stewart * 439b8d60729SRandall Stewart * In addition, per RFC2861 Section 2, the ssthresh is set to the 440b8d60729SRandall Stewart * maximum of the former ssthresh or 3/4 of the old cwnd, to 441b8d60729SRandall Stewart * not exit slow-start prematurely. 442b8d60729SRandall Stewart */ 443b8d60729SRandall Stewart rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp)); 444b8d60729SRandall Stewart 445b8d60729SRandall Stewart CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), 446b8d60729SRandall Stewart CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2)); 447b8d60729SRandall Stewart 448b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); 449b8d60729SRandall Stewart } 450b8d60729SRandall Stewart 451b8d60729SRandall Stewart /* 452b8d60729SRandall Stewart * Perform any necessary tasks before we enter congestion recovery. 453b8d60729SRandall Stewart */ 454b8d60729SRandall Stewart void 455b8d60729SRandall Stewart newreno_cc_cong_signal(struct cc_var *ccv, uint32_t type) 456b8d60729SRandall Stewart { 457b8d60729SRandall Stewart uint32_t cwin, factor; 458b8d60729SRandall Stewart u_int mss; 459b8d60729SRandall Stewart 460b8d60729SRandall Stewart cwin = CCV(ccv, snd_cwnd); 461b8d60729SRandall Stewart mss = tcp_fixed_maxseg(ccv->ccvc.tcp); 462b8d60729SRandall Stewart /* 463b8d60729SRandall Stewart * Other TCP congestion controls use newreno_cong_signal(), but 464b8d60729SRandall Stewart * with their own private cc_data. Make sure the cc_data is used 465b8d60729SRandall Stewart * correctly. 466b8d60729SRandall Stewart */ 467b8d60729SRandall Stewart factor = V_newreno_beta; 468b8d60729SRandall Stewart 469b8d60729SRandall Stewart /* Catch algos which mistakenly leak private signal types. */ 470b8d60729SRandall Stewart KASSERT((type & CC_SIGPRIVMASK) == 0, 471b8d60729SRandall Stewart ("%s: congestion signal type 0x%08x is private\n", __func__, type)); 472b8d60729SRandall Stewart 473b8d60729SRandall Stewart cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 474b8d60729SRandall Stewart 2) * mss; 475b8d60729SRandall Stewart 476b8d60729SRandall Stewart switch (type) { 477b8d60729SRandall Stewart case CC_NDUPACK: 478b8d60729SRandall Stewart if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { 479b8d60729SRandall Stewart if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) 480b8d60729SRandall Stewart CCV(ccv, snd_ssthresh) = cwin; 481b8d60729SRandall Stewart ENTER_RECOVERY(CCV(ccv, t_flags)); 482b8d60729SRandall Stewart } 483b8d60729SRandall Stewart break; 484b8d60729SRandall Stewart case CC_ECN: 485b8d60729SRandall Stewart if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { 486b8d60729SRandall Stewart CCV(ccv, snd_ssthresh) = cwin; 487b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = cwin; 488b8d60729SRandall Stewart ENTER_CONGRECOVERY(CCV(ccv, t_flags)); 489b8d60729SRandall Stewart } 490b8d60729SRandall Stewart break; 491b8d60729SRandall Stewart case CC_RTO: 492b8d60729SRandall Stewart CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd), 493b8d60729SRandall Stewart CCV(ccv, snd_cwnd)) / 2 / mss, 494b8d60729SRandall Stewart 2) * mss; 495b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = mss; 496b8d60729SRandall Stewart break; 497b8d60729SRandall Stewart } 498b8d60729SRandall Stewart } 499b8d60729SRandall Stewart 500b8d60729SRandall Stewart void 501b8d60729SRandall Stewart newreno_cc_ack_received(struct cc_var *ccv, uint16_t type) 502b8d60729SRandall Stewart { 503b8d60729SRandall Stewart if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && 504b8d60729SRandall Stewart (ccv->flags & CCF_CWND_LIMITED)) { 505b8d60729SRandall Stewart u_int cw = CCV(ccv, snd_cwnd); 506b8d60729SRandall Stewart u_int incr = CCV(ccv, t_maxseg); 507b8d60729SRandall Stewart 508b8d60729SRandall Stewart /* 509b8d60729SRandall Stewart * Regular in-order ACK, open the congestion window. 510b8d60729SRandall Stewart * Method depends on which congestion control state we're 511b8d60729SRandall Stewart * in (slow start or cong avoid) and if ABC (RFC 3465) is 512b8d60729SRandall Stewart * enabled. 513b8d60729SRandall Stewart * 514b8d60729SRandall Stewart * slow start: cwnd <= ssthresh 515b8d60729SRandall Stewart * cong avoid: cwnd > ssthresh 516b8d60729SRandall Stewart * 517b8d60729SRandall Stewart * slow start and ABC (RFC 3465): 518b8d60729SRandall Stewart * Grow cwnd exponentially by the amount of data 519b8d60729SRandall Stewart * ACKed capping the max increment per ACK to 520b8d60729SRandall Stewart * (abc_l_var * maxseg) bytes. 521b8d60729SRandall Stewart * 522b8d60729SRandall Stewart * slow start without ABC (RFC 5681): 523b8d60729SRandall Stewart * Grow cwnd exponentially by maxseg per ACK. 524b8d60729SRandall Stewart * 525b8d60729SRandall Stewart * cong avoid and ABC (RFC 3465): 526b8d60729SRandall Stewart * Grow cwnd linearly by maxseg per RTT for each 527b8d60729SRandall Stewart * cwnd worth of ACKed data. 528b8d60729SRandall Stewart * 529b8d60729SRandall Stewart * cong avoid without ABC (RFC 5681): 530b8d60729SRandall Stewart * Grow cwnd linearly by approximately maxseg per RTT using 531b8d60729SRandall Stewart * maxseg^2 / cwnd per ACK as the increment. 532b8d60729SRandall Stewart * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to 533b8d60729SRandall Stewart * avoid capping cwnd. 534b8d60729SRandall Stewart */ 535b8d60729SRandall Stewart if (cw > CCV(ccv, snd_ssthresh)) { 536b8d60729SRandall Stewart if (V_tcp_do_rfc3465) { 537b8d60729SRandall Stewart if (ccv->flags & CCF_ABC_SENTAWND) 538b8d60729SRandall Stewart ccv->flags &= ~CCF_ABC_SENTAWND; 539b8d60729SRandall Stewart else 540b8d60729SRandall Stewart incr = 0; 541b8d60729SRandall Stewart } else 542b8d60729SRandall Stewart incr = max((incr * incr / cw), 1); 543b8d60729SRandall Stewart } else if (V_tcp_do_rfc3465) { 544b8d60729SRandall Stewart /* 545b8d60729SRandall Stewart * In slow-start with ABC enabled and no RTO in sight? 546b8d60729SRandall Stewart * (Must not use abc_l_var > 1 if slow starting after 547b8d60729SRandall Stewart * an RTO. On RTO, snd_nxt = snd_una, so the 548b8d60729SRandall Stewart * snd_nxt == snd_max check is sufficient to 549b8d60729SRandall Stewart * handle this). 550b8d60729SRandall Stewart * 551b8d60729SRandall Stewart * XXXLAS: Find a way to signal SS after RTO that 552b8d60729SRandall Stewart * doesn't rely on tcpcb vars. 553b8d60729SRandall Stewart */ 554b8d60729SRandall Stewart uint16_t abc_val; 555b8d60729SRandall Stewart 556b8d60729SRandall Stewart if (ccv->flags & CCF_USE_LOCAL_ABC) 557b8d60729SRandall Stewart abc_val = ccv->labc; 558b8d60729SRandall Stewart else 559b8d60729SRandall Stewart abc_val = V_tcp_abc_l_var; 560b8d60729SRandall Stewart if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) 561b8d60729SRandall Stewart incr = min(ccv->bytes_this_ack, 562b8d60729SRandall Stewart ccv->nsegs * abc_val * 563b8d60729SRandall Stewart CCV(ccv, t_maxseg)); 564b8d60729SRandall Stewart else 565b8d60729SRandall Stewart incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); 566b8d60729SRandall Stewart 567b8d60729SRandall Stewart } 568b8d60729SRandall Stewart /* ABC is on by default, so incr equals 0 frequently. */ 569b8d60729SRandall Stewart if (incr > 0) 570b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = min(cw + incr, 571b8d60729SRandall Stewart TCP_MAXWIN << CCV(ccv, snd_scale)); 572b8d60729SRandall Stewart } 573b8d60729SRandall Stewart } 574b8d60729SRandall Stewart 575ea9017fbSRandall Stewart static int 576ea9017fbSRandall Stewart cc_stop_new_assignments(struct cc_algo *algo) 577ea9017fbSRandall Stewart { 578ea9017fbSRandall Stewart CC_LIST_WLOCK(); 579ea9017fbSRandall Stewart if (cc_check_default(algo)) { 580ea9017fbSRandall Stewart /* A default cannot be removed */ 581ea9017fbSRandall Stewart CC_LIST_WUNLOCK(); 582ea9017fbSRandall Stewart return (EBUSY); 583ea9017fbSRandall Stewart } 584ea9017fbSRandall Stewart algo->flags |= CC_MODULE_BEING_REMOVED; 585ea9017fbSRandall Stewart CC_LIST_WUNLOCK(); 586ea9017fbSRandall Stewart return (0); 587ea9017fbSRandall Stewart } 588ea9017fbSRandall Stewart 589b8d60729SRandall Stewart /* 590dbc42409SLawrence Stewart * Handles kld related events. Returns 0 on success, non-zero on failure. 591dbc42409SLawrence Stewart */ 592dbc42409SLawrence Stewart int 593dbc42409SLawrence Stewart cc_modevent(module_t mod, int event_type, void *data) 594dbc42409SLawrence Stewart { 595dbc42409SLawrence Stewart struct cc_algo *algo; 596dbc42409SLawrence Stewart int err; 597dbc42409SLawrence Stewart 598dbc42409SLawrence Stewart err = 0; 599dbc42409SLawrence Stewart algo = (struct cc_algo *)data; 600dbc42409SLawrence Stewart 601dbc42409SLawrence Stewart switch(event_type) { 602dbc42409SLawrence Stewart case MOD_LOAD: 603b8d60729SRandall Stewart if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) { 604b8d60729SRandall Stewart /* 605b8d60729SRandall Stewart * A module must have a cc_data_sz function 606b8d60729SRandall Stewart * even if it has no data it should return 0. 607b8d60729SRandall Stewart */ 608b8d60729SRandall Stewart printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n"); 609b8d60729SRandall Stewart err = EINVAL; 610b8d60729SRandall Stewart break; 611b8d60729SRandall Stewart } 612dbc42409SLawrence Stewart if (algo->mod_init != NULL) 613dbc42409SLawrence Stewart err = algo->mod_init(); 614dbc42409SLawrence Stewart if (!err) 615dbc42409SLawrence Stewart err = cc_register_algo(algo); 616dbc42409SLawrence Stewart break; 617dbc42409SLawrence Stewart 618dbc42409SLawrence Stewart case MOD_SHUTDOWN: 619dbc42409SLawrence Stewart break; 620ea9017fbSRandall Stewart case MOD_QUIESCE: 621ea9017fbSRandall Stewart /* Stop any new assigments */ 622ea9017fbSRandall Stewart err = cc_stop_new_assignments(algo); 623ea9017fbSRandall Stewart break; 624ea9017fbSRandall Stewart case MOD_UNLOAD: 625ea9017fbSRandall Stewart /* 626ea9017fbSRandall Stewart * Deregister and remove the module from the list 627ea9017fbSRandall Stewart */ 628ea9017fbSRandall Stewart CC_LIST_WLOCK(); 629ea9017fbSRandall Stewart /* Even with -f we can't unload if its the default */ 630ea9017fbSRandall Stewart if (cc_check_default(algo)) { 631ea9017fbSRandall Stewart /* A default cannot be removed */ 632ea9017fbSRandall Stewart CC_LIST_WUNLOCK(); 633ea9017fbSRandall Stewart return (EBUSY); 634ea9017fbSRandall Stewart } 635ea9017fbSRandall Stewart /* 636ea9017fbSRandall Stewart * If -f was used and users are still attached to 637ea9017fbSRandall Stewart * the algorithm things are going to go boom. 638ea9017fbSRandall Stewart */ 639*ccdfd621SMichael Tuexen err = cc_deregister_algo_locked(algo); 640*ccdfd621SMichael Tuexen CC_LIST_WUNLOCK(); 641ea9017fbSRandall Stewart if ((err == 0) && (algo->mod_destroy != NULL)) { 642ea9017fbSRandall Stewart algo->mod_destroy(); 643ea9017fbSRandall Stewart } 644ea9017fbSRandall Stewart break; 645dbc42409SLawrence Stewart default: 646dbc42409SLawrence Stewart err = EINVAL; 647dbc42409SLawrence Stewart break; 648dbc42409SLawrence Stewart } 649dbc42409SLawrence Stewart 650dbc42409SLawrence Stewart return (err); 651dbc42409SLawrence Stewart } 652dbc42409SLawrence Stewart 65314f57a8bSLawrence Stewart SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); 65414f57a8bSLawrence Stewart 655dbc42409SLawrence Stewart /* Declare sysctl tree and populate it. */ 6567029da5cSPawel Biernacki SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 657439e76ecSBrad Davis "Congestion control related settings"); 658dbc42409SLawrence Stewart 6596df8a710SGleb Smirnoff SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, 6607029da5cSPawel Biernacki CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 6617029da5cSPawel Biernacki NULL, 0, cc_default_algo, "A", 6627029da5cSPawel Biernacki "Default congestion control algorithm"); 663dbc42409SLawrence Stewart 6647029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, 6657029da5cSPawel Biernacki CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 666dbc42409SLawrence Stewart NULL, 0, cc_list_available, "A", 667439e76ecSBrad Davis "List available congestion control algorithms"); 668370efe5aSLawrence Stewart 669a9696510SRandall Stewart SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hystartplusplus, 670a9696510SRandall Stewart CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 671a9696510SRandall Stewart "New Reno related HyStart++ settings"); 672a9696510SRandall Stewart 673a9696510SRandall Stewart SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, minrtt_thresh, 674a9696510SRandall Stewart CTLFLAG_RW, 675a9696510SRandall Stewart &hystart_minrtt_thresh, 4000, 676a9696510SRandall Stewart "HyStarts++ minimum RTT thresh used in clamp (in microseconds)"); 677a9696510SRandall Stewart 678a9696510SRandall Stewart SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, maxrtt_thresh, 679a9696510SRandall Stewart CTLFLAG_RW, 680a9696510SRandall Stewart &hystart_maxrtt_thresh, 16000, 681a9696510SRandall Stewart "HyStarts++ maximum RTT thresh used in clamp (in microseconds)"); 682a9696510SRandall Stewart 683a9696510SRandall Stewart SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, n_rttsamples, 684a9696510SRandall Stewart CTLFLAG_RW, 685a9696510SRandall Stewart &hystart_n_rttsamples, 8, 686a9696510SRandall Stewart "The number of RTT samples that must be seen to consider HyStart++"); 687a9696510SRandall Stewart 688a9696510SRandall Stewart SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_growth_div, 689a9696510SRandall Stewart CTLFLAG_RW, 690a9696510SRandall Stewart &hystart_css_growth_div, 4, 691a9696510SRandall Stewart "The divisor to the growth when in Hystart++ CSS"); 692a9696510SRandall Stewart 693a9696510SRandall Stewart SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_rounds, 694a9696510SRandall Stewart CTLFLAG_RW, 695a9696510SRandall Stewart &hystart_css_rounds, 5, 696a9696510SRandall Stewart "The number of rounds HyStart++ lasts in CSS before falling to CA"); 697a9696510SRandall Stewart 698a9696510SRandall Stewart SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, bblogs, 699a9696510SRandall Stewart CTLFLAG_RW, 700a9696510SRandall Stewart &hystart_bblogs, 0, 701a9696510SRandall Stewart "Do we enable HyStart++ Black Box logs to be generated if BB logging is on"); 702a9696510SRandall Stewart 703370efe5aSLawrence Stewart VNET_DEFINE(int, cc_do_abe) = 0; 704370efe5aSLawrence Stewart SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW, 705370efe5aSLawrence Stewart &VNET_NAME(cc_do_abe), 0, 706370efe5aSLawrence Stewart "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)"); 707370efe5aSLawrence Stewart 708370efe5aSLawrence Stewart VNET_DEFINE(int, cc_abe_frlossreduce) = 0; 709370efe5aSLawrence Stewart SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW, 710370efe5aSLawrence Stewart &VNET_NAME(cc_abe_frlossreduce), 0, 711370efe5aSLawrence Stewart "Apply standard beta instead of ABE-beta during ECN-signalled congestion " 712370efe5aSLawrence Stewart "recovery episodes if loss also needs to be repaired"); 713