1dbc42409SLawrence Stewart /*- 2fe267a55SPedro F. Giffuni * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3fe267a55SPedro F. Giffuni * 4dbc42409SLawrence Stewart * Copyright (c) 2007-2008 5dbc42409SLawrence Stewart * Swinburne University of Technology, Melbourne, Australia. 6dbc42409SLawrence Stewart * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 7dbc42409SLawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation 8dbc42409SLawrence Stewart * All rights reserved. 9dbc42409SLawrence Stewart * 10dbc42409SLawrence Stewart * This software was developed at the Centre for Advanced Internet 11891b8ed4SLawrence Stewart * Architectures, Swinburne University of Technology, by Lawrence Stewart and 12891b8ed4SLawrence Stewart * James Healy, made possible in part by a grant from the Cisco University 13891b8ed4SLawrence Stewart * Research Program Fund at Community Foundation Silicon Valley. 14dbc42409SLawrence Stewart * 15dbc42409SLawrence Stewart * Portions of this software were developed at the Centre for Advanced 16dbc42409SLawrence Stewart * Internet Architectures, Swinburne University of Technology, Melbourne, 17dbc42409SLawrence Stewart * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 18dbc42409SLawrence Stewart * 19dbc42409SLawrence Stewart * Redistribution and use in source and binary forms, with or without 20dbc42409SLawrence Stewart * modification, are permitted provided that the following conditions 21dbc42409SLawrence Stewart * are met: 22dbc42409SLawrence Stewart * 1. Redistributions of source code must retain the above copyright 23dbc42409SLawrence Stewart * notice, this list of conditions and the following disclaimer. 24dbc42409SLawrence Stewart * 2. Redistributions in binary form must reproduce the above copyright 25dbc42409SLawrence Stewart * notice, this list of conditions and the following disclaimer in the 26dbc42409SLawrence Stewart * documentation and/or other materials provided with the distribution. 27dbc42409SLawrence Stewart * 28dbc42409SLawrence Stewart * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 29dbc42409SLawrence Stewart * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30dbc42409SLawrence Stewart * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31dbc42409SLawrence Stewart * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 32dbc42409SLawrence Stewart * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33dbc42409SLawrence Stewart * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34dbc42409SLawrence Stewart * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35dbc42409SLawrence Stewart * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36dbc42409SLawrence Stewart * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37dbc42409SLawrence Stewart * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38dbc42409SLawrence Stewart * SUCH DAMAGE. 39dbc42409SLawrence Stewart */ 40dbc42409SLawrence Stewart 41dbc42409SLawrence Stewart /* 42dbc42409SLawrence Stewart * This software was first released in 2007 by James Healy and Lawrence Stewart 43891b8ed4SLawrence Stewart * whilst working on the NewTCP research project at Swinburne University of 44891b8ed4SLawrence Stewart * Technology's Centre for Advanced Internet Architectures, Melbourne, 45891b8ed4SLawrence Stewart * Australia, which was made possible in part by a grant from the Cisco 46891b8ed4SLawrence Stewart * University Research Program Fund at Community Foundation Silicon Valley. 47891b8ed4SLawrence Stewart * More details are available at: 48dbc42409SLawrence Stewart * http://caia.swin.edu.au/urp/newtcp/ 49dbc42409SLawrence Stewart */ 50dbc42409SLawrence Stewart 51dbc42409SLawrence Stewart #include <sys/cdefs.h> 52dbc42409SLawrence Stewart __FBSDID("$FreeBSD$"); 53b8d60729SRandall Stewart #include <opt_cc.h> 54dbc42409SLawrence Stewart #include <sys/param.h> 55dbc42409SLawrence Stewart #include <sys/kernel.h> 56dbc42409SLawrence Stewart #include <sys/libkern.h> 57dbc42409SLawrence Stewart #include <sys/lock.h> 58dbc42409SLawrence Stewart #include <sys/malloc.h> 59dbc42409SLawrence Stewart #include <sys/module.h> 60dbc42409SLawrence Stewart #include <sys/mutex.h> 61dbc42409SLawrence Stewart #include <sys/queue.h> 62dbc42409SLawrence Stewart #include <sys/rwlock.h> 63dbc42409SLawrence Stewart #include <sys/sbuf.h> 64dbc42409SLawrence Stewart #include <sys/socket.h> 65dbc42409SLawrence Stewart #include <sys/socketvar.h> 66dbc42409SLawrence Stewart #include <sys/sysctl.h> 67dbc42409SLawrence Stewart 68b66d74c1SGleb Smirnoff #include <net/vnet.h> 69dbc42409SLawrence Stewart 70dbc42409SLawrence Stewart #include <netinet/in.h> 71dbc42409SLawrence Stewart #include <netinet/in_pcb.h> 722de3e790SGleb Smirnoff #include <netinet/tcp.h> 73b8d60729SRandall Stewart #include <netinet/tcp_seq.h> 74dbc42409SLawrence Stewart #include <netinet/tcp_var.h> 75b8d60729SRandall Stewart #include <netinet/tcp_log_buf.h> 76b8d60729SRandall Stewart #include <netinet/tcp_hpts.h> 774644fda3SGleb Smirnoff #include <netinet/cc/cc.h> 78dbc42409SLawrence Stewart #include <netinet/cc/cc_module.h> 79dbc42409SLawrence Stewart 807e3c9ec9SWarner Losh /* 817e3c9ec9SWarner Losh * Have a sane default if no CC_DEFAULT is specified in the kernel config file. 827e3c9ec9SWarner Losh */ 837e3c9ec9SWarner Losh #ifndef CC_DEFAULT 84bb1d472dSRichard Scheffenegger #define CC_DEFAULT "cubic" 857e3c9ec9SWarner Losh #endif 867e3c9ec9SWarner Losh 87a9696510SRandall Stewart uint32_t hystart_minrtt_thresh = 4000; 88a9696510SRandall Stewart uint32_t hystart_maxrtt_thresh = 16000; 89a9696510SRandall Stewart uint32_t hystart_n_rttsamples = 8; 90a9696510SRandall Stewart uint32_t hystart_css_growth_div = 4; 91a9696510SRandall Stewart uint32_t hystart_css_rounds = 5; 92a9696510SRandall Stewart uint32_t hystart_bblogs = 0; 93a9696510SRandall Stewart 94b8d60729SRandall Stewart MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory"); 95b8d60729SRandall Stewart 96dbc42409SLawrence Stewart /* 97dbc42409SLawrence Stewart * List of available cc algorithms on the current system. First element 98dbc42409SLawrence Stewart * is used as the system default CC algorithm. 99dbc42409SLawrence Stewart */ 100dbc42409SLawrence Stewart struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); 101dbc42409SLawrence Stewart 102dbc42409SLawrence Stewart /* Protects the cc_list TAILQ. */ 103dbc42409SLawrence Stewart struct rwlock cc_list_lock; 104dbc42409SLawrence Stewart 105b8d60729SRandall Stewart VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL; 106b8d60729SRandall Stewart 107b8d60729SRandall Stewart VNET_DEFINE(uint32_t, newreno_beta) = 50; 108b8d60729SRandall Stewart #define V_newreno_beta VNET(newreno_beta) 109*0fdc2472SMichael Tuexen VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80; 110dbc42409SLawrence Stewart 111ea9017fbSRandall Stewart void 112ea9017fbSRandall Stewart cc_refer(struct cc_algo *algo) 113ea9017fbSRandall Stewart { 114ea9017fbSRandall Stewart CC_LIST_LOCK_ASSERT(); 115ea9017fbSRandall Stewart refcount_acquire(&algo->cc_refcount); 116ea9017fbSRandall Stewart } 117ea9017fbSRandall Stewart 118ea9017fbSRandall Stewart void 119ea9017fbSRandall Stewart cc_release(struct cc_algo *algo) 120ea9017fbSRandall Stewart { 121ea9017fbSRandall Stewart CC_LIST_LOCK_ASSERT(); 122ea9017fbSRandall Stewart refcount_release(&algo->cc_refcount); 123ea9017fbSRandall Stewart } 124ea9017fbSRandall Stewart 125ea9017fbSRandall Stewart 126ea9017fbSRandall Stewart void 127ea9017fbSRandall Stewart cc_attach(struct tcpcb *tp, struct cc_algo *algo) 128ea9017fbSRandall Stewart { 129ea9017fbSRandall Stewart /* 130ea9017fbSRandall Stewart * Attach the tcpcb to the algorithm. 131ea9017fbSRandall Stewart */ 132ea9017fbSRandall Stewart CC_LIST_RLOCK(); 133ea9017fbSRandall Stewart CC_ALGO(tp) = algo; 134ea9017fbSRandall Stewart cc_refer(algo); 135ea9017fbSRandall Stewart CC_LIST_RUNLOCK(); 136ea9017fbSRandall Stewart } 137ea9017fbSRandall Stewart 138ea9017fbSRandall Stewart void 139ea9017fbSRandall Stewart cc_detach(struct tcpcb *tp) 140ea9017fbSRandall Stewart { 141ea9017fbSRandall Stewart struct cc_algo *algo; 142ea9017fbSRandall Stewart 143ea9017fbSRandall Stewart CC_LIST_RLOCK(); 144ea9017fbSRandall Stewart algo = CC_ALGO(tp); 145ea9017fbSRandall Stewart CC_ALGO(tp) = NULL; 146ea9017fbSRandall Stewart cc_release(algo); 147ea9017fbSRandall Stewart CC_LIST_RUNLOCK(); 148ea9017fbSRandall Stewart } 149ea9017fbSRandall Stewart 150dbc42409SLawrence Stewart /* 151dbc42409SLawrence Stewart * Sysctl handler to show and change the default CC algorithm. 152dbc42409SLawrence Stewart */ 153dbc42409SLawrence Stewart static int 154dbc42409SLawrence Stewart cc_default_algo(SYSCTL_HANDLER_ARGS) 155dbc42409SLawrence Stewart { 156ebf92e86SLawrence Stewart char default_cc[TCP_CA_NAME_MAX]; 157dbc42409SLawrence Stewart struct cc_algo *funcs; 1580e1152fcSHans Petter Selasky int error; 159dbc42409SLawrence Stewart 1600e1152fcSHans Petter Selasky /* Get the current default: */ 161dbc42409SLawrence Stewart CC_LIST_RLOCK(); 162b8d60729SRandall Stewart if (CC_DEFAULT_ALGO() != NULL) 163b8d60729SRandall Stewart strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc)); 164b8d60729SRandall Stewart else 165b8d60729SRandall Stewart memset(default_cc, 0, TCP_CA_NAME_MAX); 166dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 1670e1152fcSHans Petter Selasky 1680e1152fcSHans Petter Selasky error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req); 1690e1152fcSHans Petter Selasky 1700e1152fcSHans Petter Selasky /* Check for error or no change */ 1710e1152fcSHans Petter Selasky if (error != 0 || req->newptr == NULL) 1720e1152fcSHans Petter Selasky goto done; 1730e1152fcSHans Petter Selasky 1740e1152fcSHans Petter Selasky error = ESRCH; 175dbc42409SLawrence Stewart /* Find algo with specified name and set it to default. */ 17678b01840SLawrence Stewart CC_LIST_RLOCK(); 177dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 1780e1152fcSHans Petter Selasky if (strncmp(default_cc, funcs->name, sizeof(default_cc))) 17960a945f9SHans Petter Selasky continue; 180ea9017fbSRandall Stewart if (funcs->flags & CC_MODULE_BEING_REMOVED) { 181ea9017fbSRandall Stewart /* Its being removed, its not eligible */ 182ea9017fbSRandall Stewart continue; 183ea9017fbSRandall Stewart } 18478b01840SLawrence Stewart V_default_cc_ptr = funcs; 1850e1152fcSHans Petter Selasky error = 0; 1860e1152fcSHans Petter Selasky break; 187dbc42409SLawrence Stewart } 18878b01840SLawrence Stewart CC_LIST_RUNLOCK(); 1890e1152fcSHans Petter Selasky done: 1900e1152fcSHans Petter Selasky return (error); 191dbc42409SLawrence Stewart } 192dbc42409SLawrence Stewart 193dbc42409SLawrence Stewart /* 194dbc42409SLawrence Stewart * Sysctl handler to display the list of available CC algorithms. 195dbc42409SLawrence Stewart */ 196dbc42409SLawrence Stewart static int 197dbc42409SLawrence Stewart cc_list_available(SYSCTL_HANDLER_ARGS) 198dbc42409SLawrence Stewart { 199dbc42409SLawrence Stewart struct cc_algo *algo; 200ea9017fbSRandall Stewart int error, nalgos; 201ea9017fbSRandall Stewart int linesz; 202ea9017fbSRandall Stewart char *buffer, *cp; 203ea9017fbSRandall Stewart size_t bufsz, outsz; 204dbc42409SLawrence Stewart 205ea9017fbSRandall Stewart error = nalgos = 0; 206a66ac850SLawrence Stewart CC_LIST_RLOCK(); 207a66ac850SLawrence Stewart STAILQ_FOREACH(algo, &cc_list, entries) { 208a66ac850SLawrence Stewart nalgos++; 209a66ac850SLawrence Stewart } 210a66ac850SLawrence Stewart CC_LIST_RUNLOCK(); 211b8d60729SRandall Stewart if (nalgos == 0) { 212b8d60729SRandall Stewart return (ENOENT); 213b8d60729SRandall Stewart } 214ea9017fbSRandall Stewart bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1); 215ea9017fbSRandall Stewart buffer = malloc(bufsz, M_TEMP, M_WAITOK); 216ea9017fbSRandall Stewart cp = buffer; 217dbc42409SLawrence Stewart 218ea9017fbSRandall Stewart linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D', 219ea9017fbSRandall Stewart "PCB count"); 220ea9017fbSRandall Stewart cp += linesz; 221ea9017fbSRandall Stewart bufsz -= linesz; 222ea9017fbSRandall Stewart outsz = linesz; 223dbc42409SLawrence Stewart CC_LIST_RLOCK(); 224dbc42409SLawrence Stewart STAILQ_FOREACH(algo, &cc_list, entries) { 225ea9017fbSRandall Stewart linesz = snprintf(cp, bufsz, "%-16s%c %u\n", 226ea9017fbSRandall Stewart algo->name, 227ea9017fbSRandall Stewart (algo == CC_DEFAULT_ALGO()) ? '*' : ' ', 228ea9017fbSRandall Stewart algo->cc_refcount); 229ea9017fbSRandall Stewart if (linesz >= bufsz) { 230ea9017fbSRandall Stewart error = EOVERFLOW; 231dbc42409SLawrence Stewart break; 232a66ac850SLawrence Stewart } 233ea9017fbSRandall Stewart cp += linesz; 234ea9017fbSRandall Stewart bufsz -= linesz; 235ea9017fbSRandall Stewart outsz += linesz; 236dbc42409SLawrence Stewart } 237dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 238ea9017fbSRandall Stewart if (error == 0) 239ea9017fbSRandall Stewart error = sysctl_handle_string(oidp, buffer, outsz + 1, req); 240ea9017fbSRandall Stewart free(buffer, M_TEMP); 241ea9017fbSRandall Stewart return (error); 242dbc42409SLawrence Stewart } 243dbc42409SLawrence Stewart 244dbc42409SLawrence Stewart /* 245b8d60729SRandall Stewart * Return the number of times a proposed removal_cc is 246b8d60729SRandall Stewart * being used as the default. 24778b01840SLawrence Stewart */ 248b8d60729SRandall Stewart static int 249b8d60729SRandall Stewart cc_check_default(struct cc_algo *remove_cc) 25078b01840SLawrence Stewart { 251b8d60729SRandall Stewart int cnt = 0; 25278b01840SLawrence Stewart VNET_ITERATOR_DECL(vnet_iter); 25378b01840SLawrence Stewart 25478b01840SLawrence Stewart CC_LIST_LOCK_ASSERT(); 25578b01840SLawrence Stewart 25678b01840SLawrence Stewart VNET_LIST_RLOCK_NOSLEEP(); 25778b01840SLawrence Stewart VNET_FOREACH(vnet_iter) { 25878b01840SLawrence Stewart CURVNET_SET(vnet_iter); 259b8d60729SRandall Stewart if ((CC_DEFAULT_ALGO() != NULL) && 260b8d60729SRandall Stewart strncmp(CC_DEFAULT_ALGO()->name, 261b8d60729SRandall Stewart remove_cc->name, 262b8d60729SRandall Stewart TCP_CA_NAME_MAX) == 0) { 263b8d60729SRandall Stewart cnt++; 264b8d60729SRandall Stewart } 26578b01840SLawrence Stewart CURVNET_RESTORE(); 26678b01840SLawrence Stewart } 26778b01840SLawrence Stewart VNET_LIST_RUNLOCK_NOSLEEP(); 268b8d60729SRandall Stewart return (cnt); 26978b01840SLawrence Stewart } 27078b01840SLawrence Stewart 27178b01840SLawrence Stewart /* 272dbc42409SLawrence Stewart * Initialise CC subsystem on system boot. 273dbc42409SLawrence Stewart */ 27414f57a8bSLawrence Stewart static void 27514f57a8bSLawrence Stewart cc_init(void) 276dbc42409SLawrence Stewart { 277dbc42409SLawrence Stewart CC_LIST_LOCK_INIT(); 278dbc42409SLawrence Stewart STAILQ_INIT(&cc_list); 279dbc42409SLawrence Stewart } 280dbc42409SLawrence Stewart 281dbc42409SLawrence Stewart /* 282dbc42409SLawrence Stewart * Returns non-zero on success, 0 on failure. 283dbc42409SLawrence Stewart */ 284ccdfd621SMichael Tuexen static int 285ccdfd621SMichael Tuexen cc_deregister_algo_locked(struct cc_algo *remove_cc) 286dbc42409SLawrence Stewart { 287ea9017fbSRandall Stewart struct cc_algo *funcs; 288ea9017fbSRandall Stewart int found = 0; 289dbc42409SLawrence Stewart 290ea9017fbSRandall Stewart /* This is unlikely to fail */ 291ea9017fbSRandall Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 292ea9017fbSRandall Stewart if (funcs == remove_cc) 293ea9017fbSRandall Stewart found = 1; 294ea9017fbSRandall Stewart } 295ea9017fbSRandall Stewart if (found == 0) { 296ea9017fbSRandall Stewart /* Nothing to remove? */ 297ea9017fbSRandall Stewart return (ENOENT); 298ea9017fbSRandall Stewart } 299ea9017fbSRandall Stewart /* We assert it should have been MOD_QUIESCE'd */ 300ea9017fbSRandall Stewart KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED), 301ea9017fbSRandall Stewart ("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc)); 302b8d60729SRandall Stewart if (cc_check_default(remove_cc)) { 303db0ac6deSCy Schubert return(EBUSY); 304b8d60729SRandall Stewart } 305ea9017fbSRandall Stewart if (remove_cc->cc_refcount != 0) { 306ea9017fbSRandall Stewart return (EBUSY); 307b8d60729SRandall Stewart } 308ccdfd621SMichael Tuexen /* Remove algo from cc_list so that new connections can't use it. */ 309ea9017fbSRandall Stewart STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries); 310d4290f7eSMichael Tuexen return (0); 311b1fe92b2SMichael Tuexen } 312b1fe92b2SMichael Tuexen 313b1fe92b2SMichael Tuexen /* 314ccdfd621SMichael Tuexen * Returns non-zero on success, 0 on failure. 315ccdfd621SMichael Tuexen */ 316ccdfd621SMichael Tuexen int 317ccdfd621SMichael Tuexen cc_deregister_algo(struct cc_algo *remove_cc) 318ccdfd621SMichael Tuexen { 319ccdfd621SMichael Tuexen int ret; 320ccdfd621SMichael Tuexen 321ccdfd621SMichael Tuexen CC_LIST_WLOCK(); 322ccdfd621SMichael Tuexen ret = cc_deregister_algo_locked(remove_cc); 323ccdfd621SMichael Tuexen CC_LIST_WUNLOCK(); 324ccdfd621SMichael Tuexen return (ret); 325ccdfd621SMichael Tuexen } 326ccdfd621SMichael Tuexen 327ccdfd621SMichael Tuexen /* 328dbc42409SLawrence Stewart * Returns 0 on success, non-zero on failure. 329dbc42409SLawrence Stewart */ 330dbc42409SLawrence Stewart int 331dbc42409SLawrence Stewart cc_register_algo(struct cc_algo *add_cc) 332dbc42409SLawrence Stewart { 333dbc42409SLawrence Stewart struct cc_algo *funcs; 334dbc42409SLawrence Stewart int err; 335dbc42409SLawrence Stewart 336dbc42409SLawrence Stewart err = 0; 337dbc42409SLawrence Stewart 338dbc42409SLawrence Stewart /* 339dbc42409SLawrence Stewart * Iterate over list of registered CC algorithms and make sure 340dbc42409SLawrence Stewart * we're not trying to add a duplicate. 341dbc42409SLawrence Stewart */ 342dbc42409SLawrence Stewart CC_LIST_WLOCK(); 343dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 344b8d60729SRandall Stewart if (funcs == add_cc || 345b8d60729SRandall Stewart strncmp(funcs->name, add_cc->name, 346b8d60729SRandall Stewart TCP_CA_NAME_MAX) == 0) { 347dbc42409SLawrence Stewart err = EEXIST; 348b8d60729SRandall Stewart break; 349dbc42409SLawrence Stewart } 350b8d60729SRandall Stewart } 351ea9017fbSRandall Stewart /* Init its reference count */ 352ea9017fbSRandall Stewart if (err == 0) 353ea9017fbSRandall Stewart refcount_init(&add_cc->cc_refcount, 0); 354b8d60729SRandall Stewart /* 355b8d60729SRandall Stewart * The first loaded congestion control module will become 356b8d60729SRandall Stewart * the default until we find the "CC_DEFAULT" defined in 357b8d60729SRandall Stewart * the config (if we do). 358b8d60729SRandall Stewart */ 359b8d60729SRandall Stewart if (!err) { 360dbc42409SLawrence Stewart STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); 361b8d60729SRandall Stewart if (strcmp(add_cc->name, CC_DEFAULT) == 0) { 362b8d60729SRandall Stewart V_default_cc_ptr = add_cc; 363b8d60729SRandall Stewart } else if (V_default_cc_ptr == NULL) { 364b8d60729SRandall Stewart V_default_cc_ptr = add_cc; 365b8d60729SRandall Stewart } 366b8d60729SRandall Stewart } 367dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 368dbc42409SLawrence Stewart 369dbc42409SLawrence Stewart return (err); 370dbc42409SLawrence Stewart } 371dbc42409SLawrence Stewart 372034a9240SMark Johnston static void 373034a9240SMark Johnston vnet_cc_sysinit(void *arg) 374034a9240SMark Johnston { 375034a9240SMark Johnston struct cc_algo *cc; 376034a9240SMark Johnston 377034a9240SMark Johnston if (IS_DEFAULT_VNET(curvnet)) 378034a9240SMark Johnston return; 379034a9240SMark Johnston 380034a9240SMark Johnston CURVNET_SET(vnet0); 381034a9240SMark Johnston cc = V_default_cc_ptr; 382034a9240SMark Johnston CURVNET_RESTORE(); 383034a9240SMark Johnston 384034a9240SMark Johnston V_default_cc_ptr = cc; 385034a9240SMark Johnston } 386034a9240SMark Johnston VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, 387034a9240SMark Johnston vnet_cc_sysinit, NULL); 388034a9240SMark Johnston 389dbc42409SLawrence Stewart /* 390b8d60729SRandall Stewart * Perform any necessary tasks before we exit congestion recovery. 391b8d60729SRandall Stewart */ 392b8d60729SRandall Stewart void 393b8d60729SRandall Stewart newreno_cc_post_recovery(struct cc_var *ccv) 394b8d60729SRandall Stewart { 395b8d60729SRandall Stewart int pipe; 396b8d60729SRandall Stewart 397b8d60729SRandall Stewart if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { 398b8d60729SRandall Stewart /* 399b8d60729SRandall Stewart * Fast recovery will conclude after returning from this 400b8d60729SRandall Stewart * function. Window inflation should have left us with 401b8d60729SRandall Stewart * approximately snd_ssthresh outstanding data. But in case we 402b8d60729SRandall Stewart * would be inclined to send a burst, better to do it via the 403b8d60729SRandall Stewart * slow start mechanism. 404b8d60729SRandall Stewart * 405b8d60729SRandall Stewart * XXXLAS: Find a way to do this without needing curack 406b8d60729SRandall Stewart */ 407b8d60729SRandall Stewart if (V_tcp_do_newsack) 408b8d60729SRandall Stewart pipe = tcp_compute_pipe(ccv->ccvc.tcp); 409b8d60729SRandall Stewart else 410b8d60729SRandall Stewart pipe = CCV(ccv, snd_max) - ccv->curack; 411b8d60729SRandall Stewart if (pipe < CCV(ccv, snd_ssthresh)) 412b8d60729SRandall Stewart /* 413b8d60729SRandall Stewart * Ensure that cwnd does not collapse to 1 MSS under 414b4fbc855SGordon Bergling * adverse conditions. Implements RFC6582 415b8d60729SRandall Stewart */ 416b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + 417b8d60729SRandall Stewart CCV(ccv, t_maxseg); 418b8d60729SRandall Stewart else 419b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); 420b8d60729SRandall Stewart } 421b8d60729SRandall Stewart } 422b8d60729SRandall Stewart 423b8d60729SRandall Stewart void 424b8d60729SRandall Stewart newreno_cc_after_idle(struct cc_var *ccv) 425b8d60729SRandall Stewart { 426b8d60729SRandall Stewart uint32_t rw; 427b8d60729SRandall Stewart /* 428b8d60729SRandall Stewart * If we've been idle for more than one retransmit timeout the old 429b8d60729SRandall Stewart * congestion window is no longer current and we have to reduce it to 430b8d60729SRandall Stewart * the restart window before we can transmit again. 431b8d60729SRandall Stewart * 432b8d60729SRandall Stewart * The restart window is the initial window or the last CWND, whichever 433b8d60729SRandall Stewart * is smaller. 434b8d60729SRandall Stewart * 435b8d60729SRandall Stewart * This is done to prevent us from flooding the path with a full CWND at 436b8d60729SRandall Stewart * wirespeed, overloading router and switch buffers along the way. 437b8d60729SRandall Stewart * 438b8d60729SRandall Stewart * See RFC5681 Section 4.1. "Restarting Idle Connections". 439b8d60729SRandall Stewart * 440b8d60729SRandall Stewart * In addition, per RFC2861 Section 2, the ssthresh is set to the 441b8d60729SRandall Stewart * maximum of the former ssthresh or 3/4 of the old cwnd, to 442b8d60729SRandall Stewart * not exit slow-start prematurely. 443b8d60729SRandall Stewart */ 444b8d60729SRandall Stewart rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp)); 445b8d60729SRandall Stewart 446b8d60729SRandall Stewart CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), 447b8d60729SRandall Stewart CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2)); 448b8d60729SRandall Stewart 449b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); 450b8d60729SRandall Stewart } 451b8d60729SRandall Stewart 452b8d60729SRandall Stewart /* 453b8d60729SRandall Stewart * Perform any necessary tasks before we enter congestion recovery. 454b8d60729SRandall Stewart */ 455b8d60729SRandall Stewart void 456b8d60729SRandall Stewart newreno_cc_cong_signal(struct cc_var *ccv, uint32_t type) 457b8d60729SRandall Stewart { 458b8d60729SRandall Stewart uint32_t cwin, factor; 459b8d60729SRandall Stewart u_int mss; 460b8d60729SRandall Stewart 461b8d60729SRandall Stewart cwin = CCV(ccv, snd_cwnd); 462b8d60729SRandall Stewart mss = tcp_fixed_maxseg(ccv->ccvc.tcp); 463b8d60729SRandall Stewart /* 464b8d60729SRandall Stewart * Other TCP congestion controls use newreno_cong_signal(), but 465b8d60729SRandall Stewart * with their own private cc_data. Make sure the cc_data is used 466b8d60729SRandall Stewart * correctly. 467b8d60729SRandall Stewart */ 468b8d60729SRandall Stewart factor = V_newreno_beta; 469b8d60729SRandall Stewart 470b8d60729SRandall Stewart /* Catch algos which mistakenly leak private signal types. */ 471b8d60729SRandall Stewart KASSERT((type & CC_SIGPRIVMASK) == 0, 472b8d60729SRandall Stewart ("%s: congestion signal type 0x%08x is private\n", __func__, type)); 473b8d60729SRandall Stewart 474b8d60729SRandall Stewart cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 475b8d60729SRandall Stewart 2) * mss; 476b8d60729SRandall Stewart 477b8d60729SRandall Stewart switch (type) { 478b8d60729SRandall Stewart case CC_NDUPACK: 479b8d60729SRandall Stewart if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { 480b8d60729SRandall Stewart if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) 481b8d60729SRandall Stewart CCV(ccv, snd_ssthresh) = cwin; 482b8d60729SRandall Stewart ENTER_RECOVERY(CCV(ccv, t_flags)); 483b8d60729SRandall Stewart } 484b8d60729SRandall Stewart break; 485b8d60729SRandall Stewart case CC_ECN: 486b8d60729SRandall Stewart if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { 487b8d60729SRandall Stewart CCV(ccv, snd_ssthresh) = cwin; 488b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = cwin; 489b8d60729SRandall Stewart ENTER_CONGRECOVERY(CCV(ccv, t_flags)); 490b8d60729SRandall Stewart } 491b8d60729SRandall Stewart break; 492b8d60729SRandall Stewart case CC_RTO: 493b8d60729SRandall Stewart CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd), 494b8d60729SRandall Stewart CCV(ccv, snd_cwnd)) / 2 / mss, 495b8d60729SRandall Stewart 2) * mss; 496b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = mss; 497b8d60729SRandall Stewart break; 498b8d60729SRandall Stewart } 499b8d60729SRandall Stewart } 500b8d60729SRandall Stewart 501b8d60729SRandall Stewart void 502b8d60729SRandall Stewart newreno_cc_ack_received(struct cc_var *ccv, uint16_t type) 503b8d60729SRandall Stewart { 504b8d60729SRandall Stewart if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && 505b8d60729SRandall Stewart (ccv->flags & CCF_CWND_LIMITED)) { 506b8d60729SRandall Stewart u_int cw = CCV(ccv, snd_cwnd); 507b8d60729SRandall Stewart u_int incr = CCV(ccv, t_maxseg); 508b8d60729SRandall Stewart 509b8d60729SRandall Stewart /* 510b8d60729SRandall Stewart * Regular in-order ACK, open the congestion window. 511b8d60729SRandall Stewart * Method depends on which congestion control state we're 512b8d60729SRandall Stewart * in (slow start or cong avoid) and if ABC (RFC 3465) is 513b8d60729SRandall Stewart * enabled. 514b8d60729SRandall Stewart * 515b8d60729SRandall Stewart * slow start: cwnd <= ssthresh 516b8d60729SRandall Stewart * cong avoid: cwnd > ssthresh 517b8d60729SRandall Stewart * 518b8d60729SRandall Stewart * slow start and ABC (RFC 3465): 519b8d60729SRandall Stewart * Grow cwnd exponentially by the amount of data 520b8d60729SRandall Stewart * ACKed capping the max increment per ACK to 521b8d60729SRandall Stewart * (abc_l_var * maxseg) bytes. 522b8d60729SRandall Stewart * 523b8d60729SRandall Stewart * slow start without ABC (RFC 5681): 524b8d60729SRandall Stewart * Grow cwnd exponentially by maxseg per ACK. 525b8d60729SRandall Stewart * 526b8d60729SRandall Stewart * cong avoid and ABC (RFC 3465): 527b8d60729SRandall Stewart * Grow cwnd linearly by maxseg per RTT for each 528b8d60729SRandall Stewart * cwnd worth of ACKed data. 529b8d60729SRandall Stewart * 530b8d60729SRandall Stewart * cong avoid without ABC (RFC 5681): 531b8d60729SRandall Stewart * Grow cwnd linearly by approximately maxseg per RTT using 532b8d60729SRandall Stewart * maxseg^2 / cwnd per ACK as the increment. 533b8d60729SRandall Stewart * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to 534b8d60729SRandall Stewart * avoid capping cwnd. 535b8d60729SRandall Stewart */ 536b8d60729SRandall Stewart if (cw > CCV(ccv, snd_ssthresh)) { 537b8d60729SRandall Stewart if (V_tcp_do_rfc3465) { 538b8d60729SRandall Stewart if (ccv->flags & CCF_ABC_SENTAWND) 539b8d60729SRandall Stewart ccv->flags &= ~CCF_ABC_SENTAWND; 540b8d60729SRandall Stewart else 541b8d60729SRandall Stewart incr = 0; 542b8d60729SRandall Stewart } else 543b8d60729SRandall Stewart incr = max((incr * incr / cw), 1); 544b8d60729SRandall Stewart } else if (V_tcp_do_rfc3465) { 545b8d60729SRandall Stewart /* 546b8d60729SRandall Stewart * In slow-start with ABC enabled and no RTO in sight? 547b8d60729SRandall Stewart * (Must not use abc_l_var > 1 if slow starting after 548b8d60729SRandall Stewart * an RTO. On RTO, snd_nxt = snd_una, so the 549b8d60729SRandall Stewart * snd_nxt == snd_max check is sufficient to 550b8d60729SRandall Stewart * handle this). 551b8d60729SRandall Stewart * 552b8d60729SRandall Stewart * XXXLAS: Find a way to signal SS after RTO that 553b8d60729SRandall Stewart * doesn't rely on tcpcb vars. 554b8d60729SRandall Stewart */ 555b8d60729SRandall Stewart uint16_t abc_val; 556b8d60729SRandall Stewart 557b8d60729SRandall Stewart if (ccv->flags & CCF_USE_LOCAL_ABC) 558b8d60729SRandall Stewart abc_val = ccv->labc; 559b8d60729SRandall Stewart else 560b8d60729SRandall Stewart abc_val = V_tcp_abc_l_var; 561b8d60729SRandall Stewart if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) 562b8d60729SRandall Stewart incr = min(ccv->bytes_this_ack, 563b8d60729SRandall Stewart ccv->nsegs * abc_val * 564b8d60729SRandall Stewart CCV(ccv, t_maxseg)); 565b8d60729SRandall Stewart else 566b8d60729SRandall Stewart incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); 567b8d60729SRandall Stewart 568b8d60729SRandall Stewart } 569b8d60729SRandall Stewart /* ABC is on by default, so incr equals 0 frequently. */ 570b8d60729SRandall Stewart if (incr > 0) 571b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = min(cw + incr, 572b8d60729SRandall Stewart TCP_MAXWIN << CCV(ccv, snd_scale)); 573b8d60729SRandall Stewart } 574b8d60729SRandall Stewart } 575b8d60729SRandall Stewart 576ea9017fbSRandall Stewart static int 577ea9017fbSRandall Stewart cc_stop_new_assignments(struct cc_algo *algo) 578ea9017fbSRandall Stewart { 579ea9017fbSRandall Stewart CC_LIST_WLOCK(); 580ea9017fbSRandall Stewart if (cc_check_default(algo)) { 581ea9017fbSRandall Stewart /* A default cannot be removed */ 582ea9017fbSRandall Stewart CC_LIST_WUNLOCK(); 583ea9017fbSRandall Stewart return (EBUSY); 584ea9017fbSRandall Stewart } 585ea9017fbSRandall Stewart algo->flags |= CC_MODULE_BEING_REMOVED; 586ea9017fbSRandall Stewart CC_LIST_WUNLOCK(); 587ea9017fbSRandall Stewart return (0); 588ea9017fbSRandall Stewart } 589ea9017fbSRandall Stewart 590b8d60729SRandall Stewart /* 591dbc42409SLawrence Stewart * Handles kld related events. Returns 0 on success, non-zero on failure. 592dbc42409SLawrence Stewart */ 593dbc42409SLawrence Stewart int 594dbc42409SLawrence Stewart cc_modevent(module_t mod, int event_type, void *data) 595dbc42409SLawrence Stewart { 596dbc42409SLawrence Stewart struct cc_algo *algo; 597dbc42409SLawrence Stewart int err; 598dbc42409SLawrence Stewart 599dbc42409SLawrence Stewart err = 0; 600dbc42409SLawrence Stewart algo = (struct cc_algo *)data; 601dbc42409SLawrence Stewart 602dbc42409SLawrence Stewart switch(event_type) { 603dbc42409SLawrence Stewart case MOD_LOAD: 604b8d60729SRandall Stewart if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) { 605b8d60729SRandall Stewart /* 606b8d60729SRandall Stewart * A module must have a cc_data_sz function 607b8d60729SRandall Stewart * even if it has no data it should return 0. 608b8d60729SRandall Stewart */ 609b8d60729SRandall Stewart printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n"); 610b8d60729SRandall Stewart err = EINVAL; 611b8d60729SRandall Stewart break; 612b8d60729SRandall Stewart } 613dbc42409SLawrence Stewart if (algo->mod_init != NULL) 614dbc42409SLawrence Stewart err = algo->mod_init(); 615dbc42409SLawrence Stewart if (!err) 616dbc42409SLawrence Stewart err = cc_register_algo(algo); 617dbc42409SLawrence Stewart break; 618dbc42409SLawrence Stewart 619dbc42409SLawrence Stewart case MOD_SHUTDOWN: 620dbc42409SLawrence Stewart break; 621ea9017fbSRandall Stewart case MOD_QUIESCE: 622ea9017fbSRandall Stewart /* Stop any new assigments */ 623ea9017fbSRandall Stewart err = cc_stop_new_assignments(algo); 624ea9017fbSRandall Stewart break; 625ea9017fbSRandall Stewart case MOD_UNLOAD: 626ea9017fbSRandall Stewart /* 627ea9017fbSRandall Stewart * Deregister and remove the module from the list 628ea9017fbSRandall Stewart */ 629ea9017fbSRandall Stewart CC_LIST_WLOCK(); 630ea9017fbSRandall Stewart /* Even with -f we can't unload if its the default */ 631ea9017fbSRandall Stewart if (cc_check_default(algo)) { 632ea9017fbSRandall Stewart /* A default cannot be removed */ 633ea9017fbSRandall Stewart CC_LIST_WUNLOCK(); 634ea9017fbSRandall Stewart return (EBUSY); 635ea9017fbSRandall Stewart } 636ea9017fbSRandall Stewart /* 637ea9017fbSRandall Stewart * If -f was used and users are still attached to 638ea9017fbSRandall Stewart * the algorithm things are going to go boom. 639ea9017fbSRandall Stewart */ 640ccdfd621SMichael Tuexen err = cc_deregister_algo_locked(algo); 641ccdfd621SMichael Tuexen CC_LIST_WUNLOCK(); 642ea9017fbSRandall Stewart if ((err == 0) && (algo->mod_destroy != NULL)) { 643ea9017fbSRandall Stewart algo->mod_destroy(); 644ea9017fbSRandall Stewart } 645ea9017fbSRandall Stewart break; 646dbc42409SLawrence Stewart default: 647dbc42409SLawrence Stewart err = EINVAL; 648dbc42409SLawrence Stewart break; 649dbc42409SLawrence Stewart } 650dbc42409SLawrence Stewart 651dbc42409SLawrence Stewart return (err); 652dbc42409SLawrence Stewart } 653dbc42409SLawrence Stewart 65414f57a8bSLawrence Stewart SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); 65514f57a8bSLawrence Stewart 656dbc42409SLawrence Stewart /* Declare sysctl tree and populate it. */ 6577029da5cSPawel Biernacki SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 658439e76ecSBrad Davis "Congestion control related settings"); 659dbc42409SLawrence Stewart 6606df8a710SGleb Smirnoff SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, 6617029da5cSPawel Biernacki CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 6627029da5cSPawel Biernacki NULL, 0, cc_default_algo, "A", 6637029da5cSPawel Biernacki "Default congestion control algorithm"); 664dbc42409SLawrence Stewart 6657029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, 6667029da5cSPawel Biernacki CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 667dbc42409SLawrence Stewart NULL, 0, cc_list_available, "A", 668439e76ecSBrad Davis "List available congestion control algorithms"); 669370efe5aSLawrence Stewart 670a9696510SRandall Stewart SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hystartplusplus, 671a9696510SRandall Stewart CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 672a9696510SRandall Stewart "New Reno related HyStart++ settings"); 673a9696510SRandall Stewart 674a9696510SRandall Stewart SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, minrtt_thresh, 675a9696510SRandall Stewart CTLFLAG_RW, 676a9696510SRandall Stewart &hystart_minrtt_thresh, 4000, 677a9696510SRandall Stewart "HyStarts++ minimum RTT thresh used in clamp (in microseconds)"); 678a9696510SRandall Stewart 679a9696510SRandall Stewart SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, maxrtt_thresh, 680a9696510SRandall Stewart CTLFLAG_RW, 681a9696510SRandall Stewart &hystart_maxrtt_thresh, 16000, 682a9696510SRandall Stewart "HyStarts++ maximum RTT thresh used in clamp (in microseconds)"); 683a9696510SRandall Stewart 684a9696510SRandall Stewart SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, n_rttsamples, 685a9696510SRandall Stewart CTLFLAG_RW, 686a9696510SRandall Stewart &hystart_n_rttsamples, 8, 687a9696510SRandall Stewart "The number of RTT samples that must be seen to consider HyStart++"); 688a9696510SRandall Stewart 689a9696510SRandall Stewart SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_growth_div, 690a9696510SRandall Stewart CTLFLAG_RW, 691a9696510SRandall Stewart &hystart_css_growth_div, 4, 692a9696510SRandall Stewart "The divisor to the growth when in Hystart++ CSS"); 693a9696510SRandall Stewart 694a9696510SRandall Stewart SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_rounds, 695a9696510SRandall Stewart CTLFLAG_RW, 696a9696510SRandall Stewart &hystart_css_rounds, 5, 697a9696510SRandall Stewart "The number of rounds HyStart++ lasts in CSS before falling to CA"); 698a9696510SRandall Stewart 699a9696510SRandall Stewart SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, bblogs, 700a9696510SRandall Stewart CTLFLAG_RW, 701a9696510SRandall Stewart &hystart_bblogs, 0, 702a9696510SRandall Stewart "Do we enable HyStart++ Black Box logs to be generated if BB logging is on"); 703a9696510SRandall Stewart 704370efe5aSLawrence Stewart VNET_DEFINE(int, cc_do_abe) = 0; 705370efe5aSLawrence Stewart SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW, 706370efe5aSLawrence Stewart &VNET_NAME(cc_do_abe), 0, 707370efe5aSLawrence Stewart "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)"); 708370efe5aSLawrence Stewart 709370efe5aSLawrence Stewart VNET_DEFINE(int, cc_abe_frlossreduce) = 0; 710370efe5aSLawrence Stewart SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW, 711370efe5aSLawrence Stewart &VNET_NAME(cc_abe_frlossreduce), 0, 712370efe5aSLawrence Stewart "Apply standard beta instead of ABE-beta during ECN-signalled congestion " 713370efe5aSLawrence Stewart "recovery episodes if loss also needs to be repaired"); 714