1dbc42409SLawrence Stewart /*- 2fe267a55SPedro F. Giffuni * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3fe267a55SPedro F. Giffuni * 4dbc42409SLawrence Stewart * Copyright (c) 2007-2008 5dbc42409SLawrence Stewart * Swinburne University of Technology, Melbourne, Australia. 6dbc42409SLawrence Stewart * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 7dbc42409SLawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation 8dbc42409SLawrence Stewart * All rights reserved. 9dbc42409SLawrence Stewart * 10dbc42409SLawrence Stewart * This software was developed at the Centre for Advanced Internet 11891b8ed4SLawrence Stewart * Architectures, Swinburne University of Technology, by Lawrence Stewart and 12891b8ed4SLawrence Stewart * James Healy, made possible in part by a grant from the Cisco University 13891b8ed4SLawrence Stewart * Research Program Fund at Community Foundation Silicon Valley. 14dbc42409SLawrence Stewart * 15dbc42409SLawrence Stewart * Portions of this software were developed at the Centre for Advanced 16dbc42409SLawrence Stewart * Internet Architectures, Swinburne University of Technology, Melbourne, 17dbc42409SLawrence Stewart * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 18dbc42409SLawrence Stewart * 19dbc42409SLawrence Stewart * Redistribution and use in source and binary forms, with or without 20dbc42409SLawrence Stewart * modification, are permitted provided that the following conditions 21dbc42409SLawrence Stewart * are met: 22dbc42409SLawrence Stewart * 1. Redistributions of source code must retain the above copyright 23dbc42409SLawrence Stewart * notice, this list of conditions and the following disclaimer. 24dbc42409SLawrence Stewart * 2. Redistributions in binary form must reproduce the above copyright 25dbc42409SLawrence Stewart * notice, this list of conditions and the following disclaimer in the 26dbc42409SLawrence Stewart * documentation and/or other materials provided with the distribution. 27dbc42409SLawrence Stewart * 28dbc42409SLawrence Stewart * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 29dbc42409SLawrence Stewart * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30dbc42409SLawrence Stewart * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31dbc42409SLawrence Stewart * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 32dbc42409SLawrence Stewart * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33dbc42409SLawrence Stewart * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34dbc42409SLawrence Stewart * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35dbc42409SLawrence Stewart * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36dbc42409SLawrence Stewart * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37dbc42409SLawrence Stewart * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38dbc42409SLawrence Stewart * SUCH DAMAGE. 39dbc42409SLawrence Stewart */ 40dbc42409SLawrence Stewart 41dbc42409SLawrence Stewart /* 42dbc42409SLawrence Stewart * This software was first released in 2007 by James Healy and Lawrence Stewart 43891b8ed4SLawrence Stewart * whilst working on the NewTCP research project at Swinburne University of 44891b8ed4SLawrence Stewart * Technology's Centre for Advanced Internet Architectures, Melbourne, 45891b8ed4SLawrence Stewart * Australia, which was made possible in part by a grant from the Cisco 46891b8ed4SLawrence Stewart * University Research Program Fund at Community Foundation Silicon Valley. 47891b8ed4SLawrence Stewart * More details are available at: 48dbc42409SLawrence Stewart * http://caia.swin.edu.au/urp/newtcp/ 49dbc42409SLawrence Stewart */ 50dbc42409SLawrence Stewart 51dbc42409SLawrence Stewart #include <sys/cdefs.h> 52dbc42409SLawrence Stewart __FBSDID("$FreeBSD$"); 53b8d60729SRandall Stewart #include <opt_cc.h> 54dbc42409SLawrence Stewart #include <sys/param.h> 55dbc42409SLawrence Stewart #include <sys/kernel.h> 56dbc42409SLawrence Stewart #include <sys/libkern.h> 57dbc42409SLawrence Stewart #include <sys/lock.h> 58dbc42409SLawrence Stewart #include <sys/malloc.h> 59dbc42409SLawrence Stewart #include <sys/module.h> 60dbc42409SLawrence Stewart #include <sys/mutex.h> 61dbc42409SLawrence Stewart #include <sys/queue.h> 62dbc42409SLawrence Stewart #include <sys/rwlock.h> 63dbc42409SLawrence Stewart #include <sys/sbuf.h> 64dbc42409SLawrence Stewart #include <sys/socket.h> 65dbc42409SLawrence Stewart #include <sys/socketvar.h> 66dbc42409SLawrence Stewart #include <sys/sysctl.h> 67dbc42409SLawrence Stewart 68b66d74c1SGleb Smirnoff #include <net/vnet.h> 69dbc42409SLawrence Stewart 70dbc42409SLawrence Stewart #include <netinet/in.h> 71dbc42409SLawrence Stewart #include <netinet/in_pcb.h> 722de3e790SGleb Smirnoff #include <netinet/tcp.h> 73b8d60729SRandall Stewart #include <netinet/tcp_seq.h> 74dbc42409SLawrence Stewart #include <netinet/tcp_var.h> 75b8d60729SRandall Stewart #include <netinet/tcp_log_buf.h> 76b8d60729SRandall Stewart #include <netinet/tcp_hpts.h> 774644fda3SGleb Smirnoff #include <netinet/cc/cc.h> 78dbc42409SLawrence Stewart #include <netinet/cc/cc_module.h> 79dbc42409SLawrence Stewart 807e3c9ec9SWarner Losh /* 817e3c9ec9SWarner Losh * Have a sane default if no CC_DEFAULT is specified in the kernel config file. 827e3c9ec9SWarner Losh */ 837e3c9ec9SWarner Losh #ifndef CC_DEFAULT 847e3c9ec9SWarner Losh #define CC_DEFAULT "newreno" 857e3c9ec9SWarner Losh #endif 867e3c9ec9SWarner Losh 87b8d60729SRandall Stewart MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory"); 88b8d60729SRandall Stewart 89dbc42409SLawrence Stewart /* 90dbc42409SLawrence Stewart * List of available cc algorithms on the current system. First element 91dbc42409SLawrence Stewart * is used as the system default CC algorithm. 92dbc42409SLawrence Stewart */ 93dbc42409SLawrence Stewart struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); 94dbc42409SLawrence Stewart 95dbc42409SLawrence Stewart /* Protects the cc_list TAILQ. */ 96dbc42409SLawrence Stewart struct rwlock cc_list_lock; 97dbc42409SLawrence Stewart 98b8d60729SRandall Stewart VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL; 99b8d60729SRandall Stewart 100b8d60729SRandall Stewart VNET_DEFINE(uint32_t, newreno_beta) = 50; 101b8d60729SRandall Stewart #define V_newreno_beta VNET(newreno_beta) 102dbc42409SLawrence Stewart 103dbc42409SLawrence Stewart /* 104dbc42409SLawrence Stewart * Sysctl handler to show and change the default CC algorithm. 105dbc42409SLawrence Stewart */ 106dbc42409SLawrence Stewart static int 107dbc42409SLawrence Stewart cc_default_algo(SYSCTL_HANDLER_ARGS) 108dbc42409SLawrence Stewart { 109ebf92e86SLawrence Stewart char default_cc[TCP_CA_NAME_MAX]; 110dbc42409SLawrence Stewart struct cc_algo *funcs; 1110e1152fcSHans Petter Selasky int error; 112dbc42409SLawrence Stewart 1130e1152fcSHans Petter Selasky /* Get the current default: */ 114dbc42409SLawrence Stewart CC_LIST_RLOCK(); 115b8d60729SRandall Stewart if (CC_DEFAULT_ALGO() != NULL) 116b8d60729SRandall Stewart strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc)); 117b8d60729SRandall Stewart else 118b8d60729SRandall Stewart memset(default_cc, 0, TCP_CA_NAME_MAX); 119dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 1200e1152fcSHans Petter Selasky 1210e1152fcSHans Petter Selasky error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req); 1220e1152fcSHans Petter Selasky 1230e1152fcSHans Petter Selasky /* Check for error or no change */ 1240e1152fcSHans Petter Selasky if (error != 0 || req->newptr == NULL) 1250e1152fcSHans Petter Selasky goto done; 1260e1152fcSHans Petter Selasky 1270e1152fcSHans Petter Selasky error = ESRCH; 128dbc42409SLawrence Stewart /* Find algo with specified name and set it to default. */ 12978b01840SLawrence Stewart CC_LIST_RLOCK(); 130dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 1310e1152fcSHans Petter Selasky if (strncmp(default_cc, funcs->name, sizeof(default_cc))) 13260a945f9SHans Petter Selasky continue; 13378b01840SLawrence Stewart V_default_cc_ptr = funcs; 1340e1152fcSHans Petter Selasky error = 0; 1350e1152fcSHans Petter Selasky break; 136dbc42409SLawrence Stewart } 13778b01840SLawrence Stewart CC_LIST_RUNLOCK(); 1380e1152fcSHans Petter Selasky done: 1390e1152fcSHans Petter Selasky return (error); 140dbc42409SLawrence Stewart } 141dbc42409SLawrence Stewart 142dbc42409SLawrence Stewart /* 143dbc42409SLawrence Stewart * Sysctl handler to display the list of available CC algorithms. 144dbc42409SLawrence Stewart */ 145dbc42409SLawrence Stewart static int 146dbc42409SLawrence Stewart cc_list_available(SYSCTL_HANDLER_ARGS) 147dbc42409SLawrence Stewart { 148dbc42409SLawrence Stewart struct cc_algo *algo; 149dbc42409SLawrence Stewart struct sbuf *s; 150a66ac850SLawrence Stewart int err, first, nalgos; 151dbc42409SLawrence Stewart 152a66ac850SLawrence Stewart err = nalgos = 0; 153dbc42409SLawrence Stewart first = 1; 154a66ac850SLawrence Stewart 155a66ac850SLawrence Stewart CC_LIST_RLOCK(); 156a66ac850SLawrence Stewart STAILQ_FOREACH(algo, &cc_list, entries) { 157a66ac850SLawrence Stewart nalgos++; 158a66ac850SLawrence Stewart } 159a66ac850SLawrence Stewart CC_LIST_RUNLOCK(); 160b8d60729SRandall Stewart if (nalgos == 0) { 161b8d60729SRandall Stewart return (ENOENT); 162b8d60729SRandall Stewart } 163a66ac850SLawrence Stewart s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN); 164dbc42409SLawrence Stewart 165dbc42409SLawrence Stewart if (s == NULL) 166dbc42409SLawrence Stewart return (ENOMEM); 167dbc42409SLawrence Stewart 168a66ac850SLawrence Stewart /* 169a66ac850SLawrence Stewart * It is theoretically possible for the CC list to have grown in size 170a66ac850SLawrence Stewart * since the call to sbuf_new() and therefore for the sbuf to be too 171a66ac850SLawrence Stewart * small. If this were to happen (incredibly unlikely), the sbuf will 172a66ac850SLawrence Stewart * reach an overflow condition, sbuf_printf() will return an error and 173a66ac850SLawrence Stewart * the sysctl will fail gracefully. 174a66ac850SLawrence Stewart */ 175dbc42409SLawrence Stewart CC_LIST_RLOCK(); 176dbc42409SLawrence Stewart STAILQ_FOREACH(algo, &cc_list, entries) { 177dbc42409SLawrence Stewart err = sbuf_printf(s, first ? "%s" : ", %s", algo->name); 178a66ac850SLawrence Stewart if (err) { 179a66ac850SLawrence Stewart /* Sbuf overflow condition. */ 180a66ac850SLawrence Stewart err = EOVERFLOW; 181dbc42409SLawrence Stewart break; 182a66ac850SLawrence Stewart } 183dbc42409SLawrence Stewart first = 0; 184dbc42409SLawrence Stewart } 185dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 186dbc42409SLawrence Stewart 187dbc42409SLawrence Stewart if (!err) { 188dbc42409SLawrence Stewart sbuf_finish(s); 189e167cb89SHans Petter Selasky err = sysctl_handle_string(oidp, sbuf_data(s), 0, req); 190dbc42409SLawrence Stewart } 191dbc42409SLawrence Stewart 192dbc42409SLawrence Stewart sbuf_delete(s); 193dbc42409SLawrence Stewart return (err); 194dbc42409SLawrence Stewart } 195dbc42409SLawrence Stewart 196dbc42409SLawrence Stewart /* 197b8d60729SRandall Stewart * Return the number of times a proposed removal_cc is 198b8d60729SRandall Stewart * being used as the default. 19978b01840SLawrence Stewart */ 200b8d60729SRandall Stewart static int 201b8d60729SRandall Stewart cc_check_default(struct cc_algo *remove_cc) 20278b01840SLawrence Stewart { 203b8d60729SRandall Stewart int cnt = 0; 20478b01840SLawrence Stewart VNET_ITERATOR_DECL(vnet_iter); 20578b01840SLawrence Stewart 20678b01840SLawrence Stewart CC_LIST_LOCK_ASSERT(); 20778b01840SLawrence Stewart 20878b01840SLawrence Stewart VNET_LIST_RLOCK_NOSLEEP(); 20978b01840SLawrence Stewart VNET_FOREACH(vnet_iter) { 21078b01840SLawrence Stewart CURVNET_SET(vnet_iter); 211b8d60729SRandall Stewart if ((CC_DEFAULT_ALGO() != NULL) && 212b8d60729SRandall Stewart strncmp(CC_DEFAULT_ALGO()->name, 213b8d60729SRandall Stewart remove_cc->name, 214b8d60729SRandall Stewart TCP_CA_NAME_MAX) == 0) { 215b8d60729SRandall Stewart cnt++; 216b8d60729SRandall Stewart } 21778b01840SLawrence Stewart CURVNET_RESTORE(); 21878b01840SLawrence Stewart } 21978b01840SLawrence Stewart VNET_LIST_RUNLOCK_NOSLEEP(); 220b8d60729SRandall Stewart return (cnt); 22178b01840SLawrence Stewart } 22278b01840SLawrence Stewart 22378b01840SLawrence Stewart /* 224dbc42409SLawrence Stewart * Initialise CC subsystem on system boot. 225dbc42409SLawrence Stewart */ 22614f57a8bSLawrence Stewart static void 22714f57a8bSLawrence Stewart cc_init(void) 228dbc42409SLawrence Stewart { 229dbc42409SLawrence Stewart CC_LIST_LOCK_INIT(); 230dbc42409SLawrence Stewart STAILQ_INIT(&cc_list); 231dbc42409SLawrence Stewart } 232dbc42409SLawrence Stewart 233dbc42409SLawrence Stewart /* 234dbc42409SLawrence Stewart * Returns non-zero on success, 0 on failure. 235dbc42409SLawrence Stewart */ 236dbc42409SLawrence Stewart int 237dbc42409SLawrence Stewart cc_deregister_algo(struct cc_algo *remove_cc) 238dbc42409SLawrence Stewart { 239dbc42409SLawrence Stewart struct cc_algo *funcs, *tmpfuncs; 240dbc42409SLawrence Stewart int err; 241dbc42409SLawrence Stewart 242dbc42409SLawrence Stewart err = ENOENT; 243dbc42409SLawrence Stewart 244dbc42409SLawrence Stewart /* Remove algo from cc_list so that new connections can't use it. */ 245dbc42409SLawrence Stewart CC_LIST_WLOCK(); 246dbc42409SLawrence Stewart STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { 247dbc42409SLawrence Stewart if (funcs == remove_cc) { 248b8d60729SRandall Stewart if (cc_check_default(remove_cc)) { 249b8d60729SRandall Stewart err = EBUSY; 250b8d60729SRandall Stewart break; 251b8d60729SRandall Stewart } 252b8d60729SRandall Stewart /* Add a temp flag to stop new adds to it */ 253b8d60729SRandall Stewart funcs->flags |= CC_MODULE_BEING_REMOVED; 254dbc42409SLawrence Stewart break; 255dbc42409SLawrence Stewart } 256dbc42409SLawrence Stewart } 257dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 258b8d60729SRandall Stewart err = tcp_ccalgounload(remove_cc); 259dbc42409SLawrence Stewart /* 260b8d60729SRandall Stewart * Now back through and we either remove the temp flag 261b8d60729SRandall Stewart * or pull the registration. 262dbc42409SLawrence Stewart */ 263b8d60729SRandall Stewart CC_LIST_WLOCK(); 264b8d60729SRandall Stewart STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { 265b8d60729SRandall Stewart if (funcs == remove_cc) { 266b8d60729SRandall Stewart if (err == 0) 267b8d60729SRandall Stewart STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); 268b8d60729SRandall Stewart else 269b8d60729SRandall Stewart funcs->flags &= ~CC_MODULE_BEING_REMOVED; 270b8d60729SRandall Stewart break; 271b8d60729SRandall Stewart } 272b8d60729SRandall Stewart } 273b8d60729SRandall Stewart CC_LIST_WUNLOCK(); 274dbc42409SLawrence Stewart return (err); 275dbc42409SLawrence Stewart } 276dbc42409SLawrence Stewart 277dbc42409SLawrence Stewart /* 278dbc42409SLawrence Stewart * Returns 0 on success, non-zero on failure. 279dbc42409SLawrence Stewart */ 280dbc42409SLawrence Stewart int 281dbc42409SLawrence Stewart cc_register_algo(struct cc_algo *add_cc) 282dbc42409SLawrence Stewart { 283dbc42409SLawrence Stewart struct cc_algo *funcs; 284dbc42409SLawrence Stewart int err; 285dbc42409SLawrence Stewart 286dbc42409SLawrence Stewart err = 0; 287dbc42409SLawrence Stewart 288dbc42409SLawrence Stewart /* 289dbc42409SLawrence Stewart * Iterate over list of registered CC algorithms and make sure 290dbc42409SLawrence Stewart * we're not trying to add a duplicate. 291dbc42409SLawrence Stewart */ 292dbc42409SLawrence Stewart CC_LIST_WLOCK(); 293dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 294b8d60729SRandall Stewart if (funcs == add_cc || 295b8d60729SRandall Stewart strncmp(funcs->name, add_cc->name, 296b8d60729SRandall Stewart TCP_CA_NAME_MAX) == 0) { 297dbc42409SLawrence Stewart err = EEXIST; 298b8d60729SRandall Stewart break; 299dbc42409SLawrence Stewart } 300b8d60729SRandall Stewart } 301b8d60729SRandall Stewart /* 302b8d60729SRandall Stewart * The first loaded congestion control module will become 303b8d60729SRandall Stewart * the default until we find the "CC_DEFAULT" defined in 304b8d60729SRandall Stewart * the config (if we do). 305b8d60729SRandall Stewart */ 306b8d60729SRandall Stewart if (!err) { 307dbc42409SLawrence Stewart STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); 308b8d60729SRandall Stewart if (strcmp(add_cc->name, CC_DEFAULT) == 0) { 309b8d60729SRandall Stewart V_default_cc_ptr = add_cc; 310b8d60729SRandall Stewart } else if (V_default_cc_ptr == NULL) { 311b8d60729SRandall Stewart V_default_cc_ptr = add_cc; 312b8d60729SRandall Stewart } 313b8d60729SRandall Stewart } 314dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 315dbc42409SLawrence Stewart 316dbc42409SLawrence Stewart return (err); 317dbc42409SLawrence Stewart } 318dbc42409SLawrence Stewart 319034a9240SMark Johnston static void 320034a9240SMark Johnston vnet_cc_sysinit(void *arg) 321034a9240SMark Johnston { 322034a9240SMark Johnston struct cc_algo *cc; 323034a9240SMark Johnston 324034a9240SMark Johnston if (IS_DEFAULT_VNET(curvnet)) 325034a9240SMark Johnston return; 326034a9240SMark Johnston 327034a9240SMark Johnston CURVNET_SET(vnet0); 328034a9240SMark Johnston cc = V_default_cc_ptr; 329034a9240SMark Johnston CURVNET_RESTORE(); 330034a9240SMark Johnston 331034a9240SMark Johnston V_default_cc_ptr = cc; 332034a9240SMark Johnston } 333034a9240SMark Johnston VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, 334034a9240SMark Johnston vnet_cc_sysinit, NULL); 335034a9240SMark Johnston 336dbc42409SLawrence Stewart /* 337b8d60729SRandall Stewart * Perform any necessary tasks before we exit congestion recovery. 338b8d60729SRandall Stewart */ 339b8d60729SRandall Stewart void 340b8d60729SRandall Stewart newreno_cc_post_recovery(struct cc_var *ccv) 341b8d60729SRandall Stewart { 342b8d60729SRandall Stewart int pipe; 343b8d60729SRandall Stewart 344b8d60729SRandall Stewart if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { 345b8d60729SRandall Stewart /* 346b8d60729SRandall Stewart * Fast recovery will conclude after returning from this 347b8d60729SRandall Stewart * function. Window inflation should have left us with 348b8d60729SRandall Stewart * approximately snd_ssthresh outstanding data. But in case we 349b8d60729SRandall Stewart * would be inclined to send a burst, better to do it via the 350b8d60729SRandall Stewart * slow start mechanism. 351b8d60729SRandall Stewart * 352b8d60729SRandall Stewart * XXXLAS: Find a way to do this without needing curack 353b8d60729SRandall Stewart */ 354b8d60729SRandall Stewart if (V_tcp_do_newsack) 355b8d60729SRandall Stewart pipe = tcp_compute_pipe(ccv->ccvc.tcp); 356b8d60729SRandall Stewart else 357b8d60729SRandall Stewart pipe = CCV(ccv, snd_max) - ccv->curack; 358b8d60729SRandall Stewart if (pipe < CCV(ccv, snd_ssthresh)) 359b8d60729SRandall Stewart /* 360b8d60729SRandall Stewart * Ensure that cwnd does not collapse to 1 MSS under 361*b4fbc855SGordon Bergling * adverse conditions. Implements RFC6582 362b8d60729SRandall Stewart */ 363b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + 364b8d60729SRandall Stewart CCV(ccv, t_maxseg); 365b8d60729SRandall Stewart else 366b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); 367b8d60729SRandall Stewart } 368b8d60729SRandall Stewart } 369b8d60729SRandall Stewart 370b8d60729SRandall Stewart void 371b8d60729SRandall Stewart newreno_cc_after_idle(struct cc_var *ccv) 372b8d60729SRandall Stewart { 373b8d60729SRandall Stewart uint32_t rw; 374b8d60729SRandall Stewart /* 375b8d60729SRandall Stewart * If we've been idle for more than one retransmit timeout the old 376b8d60729SRandall Stewart * congestion window is no longer current and we have to reduce it to 377b8d60729SRandall Stewart * the restart window before we can transmit again. 378b8d60729SRandall Stewart * 379b8d60729SRandall Stewart * The restart window is the initial window or the last CWND, whichever 380b8d60729SRandall Stewart * is smaller. 381b8d60729SRandall Stewart * 382b8d60729SRandall Stewart * This is done to prevent us from flooding the path with a full CWND at 383b8d60729SRandall Stewart * wirespeed, overloading router and switch buffers along the way. 384b8d60729SRandall Stewart * 385b8d60729SRandall Stewart * See RFC5681 Section 4.1. "Restarting Idle Connections". 386b8d60729SRandall Stewart * 387b8d60729SRandall Stewart * In addition, per RFC2861 Section 2, the ssthresh is set to the 388b8d60729SRandall Stewart * maximum of the former ssthresh or 3/4 of the old cwnd, to 389b8d60729SRandall Stewart * not exit slow-start prematurely. 390b8d60729SRandall Stewart */ 391b8d60729SRandall Stewart rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp)); 392b8d60729SRandall Stewart 393b8d60729SRandall Stewart CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), 394b8d60729SRandall Stewart CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2)); 395b8d60729SRandall Stewart 396b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); 397b8d60729SRandall Stewart } 398b8d60729SRandall Stewart 399b8d60729SRandall Stewart /* 400b8d60729SRandall Stewart * Perform any necessary tasks before we enter congestion recovery. 401b8d60729SRandall Stewart */ 402b8d60729SRandall Stewart void 403b8d60729SRandall Stewart newreno_cc_cong_signal(struct cc_var *ccv, uint32_t type) 404b8d60729SRandall Stewart { 405b8d60729SRandall Stewart uint32_t cwin, factor; 406b8d60729SRandall Stewart u_int mss; 407b8d60729SRandall Stewart 408b8d60729SRandall Stewart cwin = CCV(ccv, snd_cwnd); 409b8d60729SRandall Stewart mss = tcp_fixed_maxseg(ccv->ccvc.tcp); 410b8d60729SRandall Stewart /* 411b8d60729SRandall Stewart * Other TCP congestion controls use newreno_cong_signal(), but 412b8d60729SRandall Stewart * with their own private cc_data. Make sure the cc_data is used 413b8d60729SRandall Stewart * correctly. 414b8d60729SRandall Stewart */ 415b8d60729SRandall Stewart factor = V_newreno_beta; 416b8d60729SRandall Stewart 417b8d60729SRandall Stewart /* Catch algos which mistakenly leak private signal types. */ 418b8d60729SRandall Stewart KASSERT((type & CC_SIGPRIVMASK) == 0, 419b8d60729SRandall Stewart ("%s: congestion signal type 0x%08x is private\n", __func__, type)); 420b8d60729SRandall Stewart 421b8d60729SRandall Stewart cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 422b8d60729SRandall Stewart 2) * mss; 423b8d60729SRandall Stewart 424b8d60729SRandall Stewart switch (type) { 425b8d60729SRandall Stewart case CC_NDUPACK: 426b8d60729SRandall Stewart if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { 427b8d60729SRandall Stewart if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) 428b8d60729SRandall Stewart CCV(ccv, snd_ssthresh) = cwin; 429b8d60729SRandall Stewart ENTER_RECOVERY(CCV(ccv, t_flags)); 430b8d60729SRandall Stewart } 431b8d60729SRandall Stewart break; 432b8d60729SRandall Stewart case CC_ECN: 433b8d60729SRandall Stewart if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { 434b8d60729SRandall Stewart CCV(ccv, snd_ssthresh) = cwin; 435b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = cwin; 436b8d60729SRandall Stewart ENTER_CONGRECOVERY(CCV(ccv, t_flags)); 437b8d60729SRandall Stewart } 438b8d60729SRandall Stewart break; 439b8d60729SRandall Stewart case CC_RTO: 440b8d60729SRandall Stewart CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd), 441b8d60729SRandall Stewart CCV(ccv, snd_cwnd)) / 2 / mss, 442b8d60729SRandall Stewart 2) * mss; 443b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = mss; 444b8d60729SRandall Stewart break; 445b8d60729SRandall Stewart } 446b8d60729SRandall Stewart } 447b8d60729SRandall Stewart 448b8d60729SRandall Stewart void 449b8d60729SRandall Stewart newreno_cc_ack_received(struct cc_var *ccv, uint16_t type) 450b8d60729SRandall Stewart { 451b8d60729SRandall Stewart if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && 452b8d60729SRandall Stewart (ccv->flags & CCF_CWND_LIMITED)) { 453b8d60729SRandall Stewart u_int cw = CCV(ccv, snd_cwnd); 454b8d60729SRandall Stewart u_int incr = CCV(ccv, t_maxseg); 455b8d60729SRandall Stewart 456b8d60729SRandall Stewart /* 457b8d60729SRandall Stewart * Regular in-order ACK, open the congestion window. 458b8d60729SRandall Stewart * Method depends on which congestion control state we're 459b8d60729SRandall Stewart * in (slow start or cong avoid) and if ABC (RFC 3465) is 460b8d60729SRandall Stewart * enabled. 461b8d60729SRandall Stewart * 462b8d60729SRandall Stewart * slow start: cwnd <= ssthresh 463b8d60729SRandall Stewart * cong avoid: cwnd > ssthresh 464b8d60729SRandall Stewart * 465b8d60729SRandall Stewart * slow start and ABC (RFC 3465): 466b8d60729SRandall Stewart * Grow cwnd exponentially by the amount of data 467b8d60729SRandall Stewart * ACKed capping the max increment per ACK to 468b8d60729SRandall Stewart * (abc_l_var * maxseg) bytes. 469b8d60729SRandall Stewart * 470b8d60729SRandall Stewart * slow start without ABC (RFC 5681): 471b8d60729SRandall Stewart * Grow cwnd exponentially by maxseg per ACK. 472b8d60729SRandall Stewart * 473b8d60729SRandall Stewart * cong avoid and ABC (RFC 3465): 474b8d60729SRandall Stewart * Grow cwnd linearly by maxseg per RTT for each 475b8d60729SRandall Stewart * cwnd worth of ACKed data. 476b8d60729SRandall Stewart * 477b8d60729SRandall Stewart * cong avoid without ABC (RFC 5681): 478b8d60729SRandall Stewart * Grow cwnd linearly by approximately maxseg per RTT using 479b8d60729SRandall Stewart * maxseg^2 / cwnd per ACK as the increment. 480b8d60729SRandall Stewart * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to 481b8d60729SRandall Stewart * avoid capping cwnd. 482b8d60729SRandall Stewart */ 483b8d60729SRandall Stewart if (cw > CCV(ccv, snd_ssthresh)) { 484b8d60729SRandall Stewart if (V_tcp_do_rfc3465) { 485b8d60729SRandall Stewart if (ccv->flags & CCF_ABC_SENTAWND) 486b8d60729SRandall Stewart ccv->flags &= ~CCF_ABC_SENTAWND; 487b8d60729SRandall Stewart else 488b8d60729SRandall Stewart incr = 0; 489b8d60729SRandall Stewart } else 490b8d60729SRandall Stewart incr = max((incr * incr / cw), 1); 491b8d60729SRandall Stewart } else if (V_tcp_do_rfc3465) { 492b8d60729SRandall Stewart /* 493b8d60729SRandall Stewart * In slow-start with ABC enabled and no RTO in sight? 494b8d60729SRandall Stewart * (Must not use abc_l_var > 1 if slow starting after 495b8d60729SRandall Stewart * an RTO. On RTO, snd_nxt = snd_una, so the 496b8d60729SRandall Stewart * snd_nxt == snd_max check is sufficient to 497b8d60729SRandall Stewart * handle this). 498b8d60729SRandall Stewart * 499b8d60729SRandall Stewart * XXXLAS: Find a way to signal SS after RTO that 500b8d60729SRandall Stewart * doesn't rely on tcpcb vars. 501b8d60729SRandall Stewart */ 502b8d60729SRandall Stewart uint16_t abc_val; 503b8d60729SRandall Stewart 504b8d60729SRandall Stewart if (ccv->flags & CCF_USE_LOCAL_ABC) 505b8d60729SRandall Stewart abc_val = ccv->labc; 506b8d60729SRandall Stewart else 507b8d60729SRandall Stewart abc_val = V_tcp_abc_l_var; 508b8d60729SRandall Stewart if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) 509b8d60729SRandall Stewart incr = min(ccv->bytes_this_ack, 510b8d60729SRandall Stewart ccv->nsegs * abc_val * 511b8d60729SRandall Stewart CCV(ccv, t_maxseg)); 512b8d60729SRandall Stewart else 513b8d60729SRandall Stewart incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); 514b8d60729SRandall Stewart 515b8d60729SRandall Stewart } 516b8d60729SRandall Stewart /* ABC is on by default, so incr equals 0 frequently. */ 517b8d60729SRandall Stewart if (incr > 0) 518b8d60729SRandall Stewart CCV(ccv, snd_cwnd) = min(cw + incr, 519b8d60729SRandall Stewart TCP_MAXWIN << CCV(ccv, snd_scale)); 520b8d60729SRandall Stewart } 521b8d60729SRandall Stewart } 522b8d60729SRandall Stewart 523b8d60729SRandall Stewart /* 524dbc42409SLawrence Stewart * Handles kld related events. Returns 0 on success, non-zero on failure. 525dbc42409SLawrence Stewart */ 526dbc42409SLawrence Stewart int 527dbc42409SLawrence Stewart cc_modevent(module_t mod, int event_type, void *data) 528dbc42409SLawrence Stewart { 529dbc42409SLawrence Stewart struct cc_algo *algo; 530dbc42409SLawrence Stewart int err; 531dbc42409SLawrence Stewart 532dbc42409SLawrence Stewart err = 0; 533dbc42409SLawrence Stewart algo = (struct cc_algo *)data; 534dbc42409SLawrence Stewart 535dbc42409SLawrence Stewart switch(event_type) { 536dbc42409SLawrence Stewart case MOD_LOAD: 537b8d60729SRandall Stewart if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) { 538b8d60729SRandall Stewart /* 539b8d60729SRandall Stewart * A module must have a cc_data_sz function 540b8d60729SRandall Stewart * even if it has no data it should return 0. 541b8d60729SRandall Stewart */ 542b8d60729SRandall Stewart printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n"); 543b8d60729SRandall Stewart err = EINVAL; 544b8d60729SRandall Stewart break; 545b8d60729SRandall Stewart } 546dbc42409SLawrence Stewart if (algo->mod_init != NULL) 547dbc42409SLawrence Stewart err = algo->mod_init(); 548dbc42409SLawrence Stewart if (!err) 549dbc42409SLawrence Stewart err = cc_register_algo(algo); 550dbc42409SLawrence Stewart break; 551dbc42409SLawrence Stewart 552dbc42409SLawrence Stewart case MOD_QUIESCE: 553dbc42409SLawrence Stewart case MOD_SHUTDOWN: 554dbc42409SLawrence Stewart case MOD_UNLOAD: 555dbc42409SLawrence Stewart err = cc_deregister_algo(algo); 556dbc42409SLawrence Stewart if (!err && algo->mod_destroy != NULL) 557dbc42409SLawrence Stewart algo->mod_destroy(); 558dbc42409SLawrence Stewart if (err == ENOENT) 559dbc42409SLawrence Stewart err = 0; 560dbc42409SLawrence Stewart break; 561dbc42409SLawrence Stewart 562dbc42409SLawrence Stewart default: 563dbc42409SLawrence Stewart err = EINVAL; 564dbc42409SLawrence Stewart break; 565dbc42409SLawrence Stewart } 566dbc42409SLawrence Stewart 567dbc42409SLawrence Stewart return (err); 568dbc42409SLawrence Stewart } 569dbc42409SLawrence Stewart 57014f57a8bSLawrence Stewart SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); 57114f57a8bSLawrence Stewart 572dbc42409SLawrence Stewart /* Declare sysctl tree and populate it. */ 5737029da5cSPawel Biernacki SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 574439e76ecSBrad Davis "Congestion control related settings"); 575dbc42409SLawrence Stewart 5766df8a710SGleb Smirnoff SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, 5777029da5cSPawel Biernacki CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 5787029da5cSPawel Biernacki NULL, 0, cc_default_algo, "A", 5797029da5cSPawel Biernacki "Default congestion control algorithm"); 580dbc42409SLawrence Stewart 5817029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, 5827029da5cSPawel Biernacki CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 583dbc42409SLawrence Stewart NULL, 0, cc_list_available, "A", 584439e76ecSBrad Davis "List available congestion control algorithms"); 585370efe5aSLawrence Stewart 586370efe5aSLawrence Stewart VNET_DEFINE(int, cc_do_abe) = 0; 587370efe5aSLawrence Stewart SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW, 588370efe5aSLawrence Stewart &VNET_NAME(cc_do_abe), 0, 589370efe5aSLawrence Stewart "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)"); 590370efe5aSLawrence Stewart 591370efe5aSLawrence Stewart VNET_DEFINE(int, cc_abe_frlossreduce) = 0; 592370efe5aSLawrence Stewart SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW, 593370efe5aSLawrence Stewart &VNET_NAME(cc_abe_frlossreduce), 0, 594370efe5aSLawrence Stewart "Apply standard beta instead of ABE-beta during ECN-signalled congestion " 595370efe5aSLawrence Stewart "recovery episodes if loss also needs to be repaired"); 596