/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2007-2008
 *	Swinburne University of Technology, Melbourne, Australia.
 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by Lawrence Stewart and
 * James Healy, made possible in part by a grant from the Cisco University
 * Research Program Fund at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This software was first released in 2007 by James Healy and Lawrence Stewart
 * whilst working on the NewTCP research project at Swinburne University of
 * Technology's Centre for Advanced Internet Architectures, Melbourne,
 * Australia, which was made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
47891b8ed4SLawrence Stewart * More details are available at: 48dbc42409SLawrence Stewart * http://caia.swin.edu.au/urp/newtcp/ 49dbc42409SLawrence Stewart */ 50dbc42409SLawrence Stewart 51dbc42409SLawrence Stewart #include <sys/cdefs.h> 52dbc42409SLawrence Stewart __FBSDID("$FreeBSD$"); 53dbc42409SLawrence Stewart 54dbc42409SLawrence Stewart #include <sys/param.h> 55dbc42409SLawrence Stewart #include <sys/kernel.h> 56dbc42409SLawrence Stewart #include <sys/libkern.h> 57dbc42409SLawrence Stewart #include <sys/lock.h> 58dbc42409SLawrence Stewart #include <sys/malloc.h> 59dbc42409SLawrence Stewart #include <sys/module.h> 60dbc42409SLawrence Stewart #include <sys/mutex.h> 61dbc42409SLawrence Stewart #include <sys/queue.h> 62dbc42409SLawrence Stewart #include <sys/rwlock.h> 63dbc42409SLawrence Stewart #include <sys/sbuf.h> 64dbc42409SLawrence Stewart #include <sys/socket.h> 65dbc42409SLawrence Stewart #include <sys/socketvar.h> 66dbc42409SLawrence Stewart #include <sys/sysctl.h> 67dbc42409SLawrence Stewart 68b66d74c1SGleb Smirnoff #include <net/vnet.h> 69dbc42409SLawrence Stewart 70dbc42409SLawrence Stewart #include <netinet/in.h> 71dbc42409SLawrence Stewart #include <netinet/in_pcb.h> 722de3e790SGleb Smirnoff #include <netinet/tcp.h> 73dbc42409SLawrence Stewart #include <netinet/tcp_var.h> 744644fda3SGleb Smirnoff #include <netinet/cc/cc.h> 75dbc42409SLawrence Stewart 76dbc42409SLawrence Stewart #include <netinet/cc/cc_module.h> 77dbc42409SLawrence Stewart 78dbc42409SLawrence Stewart /* 79dbc42409SLawrence Stewart * List of available cc algorithms on the current system. First element 80dbc42409SLawrence Stewart * is used as the system default CC algorithm. 81dbc42409SLawrence Stewart */ 82dbc42409SLawrence Stewart struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); 83dbc42409SLawrence Stewart 84dbc42409SLawrence Stewart /* Protects the cc_list TAILQ. 
*/ 85dbc42409SLawrence Stewart struct rwlock cc_list_lock; 86dbc42409SLawrence Stewart 8778b01840SLawrence Stewart VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo; 88dbc42409SLawrence Stewart 89dbc42409SLawrence Stewart /* 90dbc42409SLawrence Stewart * Sysctl handler to show and change the default CC algorithm. 91dbc42409SLawrence Stewart */ 92dbc42409SLawrence Stewart static int 93dbc42409SLawrence Stewart cc_default_algo(SYSCTL_HANDLER_ARGS) 94dbc42409SLawrence Stewart { 95ebf92e86SLawrence Stewart char default_cc[TCP_CA_NAME_MAX]; 96dbc42409SLawrence Stewart struct cc_algo *funcs; 970e1152fcSHans Petter Selasky int error; 98dbc42409SLawrence Stewart 990e1152fcSHans Petter Selasky /* Get the current default: */ 100dbc42409SLawrence Stewart CC_LIST_RLOCK(); 1010e1152fcSHans Petter Selasky strlcpy(default_cc, CC_DEFAULT()->name, sizeof(default_cc)); 102dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 1030e1152fcSHans Petter Selasky 1040e1152fcSHans Petter Selasky error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req); 1050e1152fcSHans Petter Selasky 1060e1152fcSHans Petter Selasky /* Check for error or no change */ 1070e1152fcSHans Petter Selasky if (error != 0 || req->newptr == NULL) 1080e1152fcSHans Petter Selasky goto done; 1090e1152fcSHans Petter Selasky 1100e1152fcSHans Petter Selasky error = ESRCH; 1110e1152fcSHans Petter Selasky 112dbc42409SLawrence Stewart /* Find algo with specified name and set it to default. 
*/ 11378b01840SLawrence Stewart CC_LIST_RLOCK(); 114dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 1150e1152fcSHans Petter Selasky if (strncmp(default_cc, funcs->name, sizeof(default_cc))) 11660a945f9SHans Petter Selasky continue; 11778b01840SLawrence Stewart V_default_cc_ptr = funcs; 1180e1152fcSHans Petter Selasky error = 0; 1190e1152fcSHans Petter Selasky break; 120dbc42409SLawrence Stewart } 12178b01840SLawrence Stewart CC_LIST_RUNLOCK(); 1220e1152fcSHans Petter Selasky done: 1230e1152fcSHans Petter Selasky return (error); 124dbc42409SLawrence Stewart } 125dbc42409SLawrence Stewart 126dbc42409SLawrence Stewart /* 127dbc42409SLawrence Stewart * Sysctl handler to display the list of available CC algorithms. 128dbc42409SLawrence Stewart */ 129dbc42409SLawrence Stewart static int 130dbc42409SLawrence Stewart cc_list_available(SYSCTL_HANDLER_ARGS) 131dbc42409SLawrence Stewart { 132dbc42409SLawrence Stewart struct cc_algo *algo; 133dbc42409SLawrence Stewart struct sbuf *s; 134a66ac850SLawrence Stewart int err, first, nalgos; 135dbc42409SLawrence Stewart 136a66ac850SLawrence Stewart err = nalgos = 0; 137dbc42409SLawrence Stewart first = 1; 138a66ac850SLawrence Stewart 139a66ac850SLawrence Stewart CC_LIST_RLOCK(); 140a66ac850SLawrence Stewart STAILQ_FOREACH(algo, &cc_list, entries) { 141a66ac850SLawrence Stewart nalgos++; 142a66ac850SLawrence Stewart } 143a66ac850SLawrence Stewart CC_LIST_RUNLOCK(); 144a66ac850SLawrence Stewart 145a66ac850SLawrence Stewart s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN); 146dbc42409SLawrence Stewart 147dbc42409SLawrence Stewart if (s == NULL) 148dbc42409SLawrence Stewart return (ENOMEM); 149dbc42409SLawrence Stewart 150a66ac850SLawrence Stewart /* 151a66ac850SLawrence Stewart * It is theoretically possible for the CC list to have grown in size 152a66ac850SLawrence Stewart * since the call to sbuf_new() and therefore for the sbuf to be too 153a66ac850SLawrence Stewart * small. 
If this were to happen (incredibly unlikely), the sbuf will 154a66ac850SLawrence Stewart * reach an overflow condition, sbuf_printf() will return an error and 155a66ac850SLawrence Stewart * the sysctl will fail gracefully. 156a66ac850SLawrence Stewart */ 157dbc42409SLawrence Stewart CC_LIST_RLOCK(); 158dbc42409SLawrence Stewart STAILQ_FOREACH(algo, &cc_list, entries) { 159dbc42409SLawrence Stewart err = sbuf_printf(s, first ? "%s" : ", %s", algo->name); 160a66ac850SLawrence Stewart if (err) { 161a66ac850SLawrence Stewart /* Sbuf overflow condition. */ 162a66ac850SLawrence Stewart err = EOVERFLOW; 163dbc42409SLawrence Stewart break; 164a66ac850SLawrence Stewart } 165dbc42409SLawrence Stewart first = 0; 166dbc42409SLawrence Stewart } 167dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 168dbc42409SLawrence Stewart 169dbc42409SLawrence Stewart if (!err) { 170dbc42409SLawrence Stewart sbuf_finish(s); 171e167cb89SHans Petter Selasky err = sysctl_handle_string(oidp, sbuf_data(s), 0, req); 172dbc42409SLawrence Stewart } 173dbc42409SLawrence Stewart 174dbc42409SLawrence Stewart sbuf_delete(s); 175dbc42409SLawrence Stewart return (err); 176dbc42409SLawrence Stewart } 177dbc42409SLawrence Stewart 178dbc42409SLawrence Stewart /* 17978b01840SLawrence Stewart * Reset the default CC algo to NewReno for any netstack which is using the algo 18078b01840SLawrence Stewart * that is about to go away as its default. 
18178b01840SLawrence Stewart */ 18278b01840SLawrence Stewart static void 18378b01840SLawrence Stewart cc_checkreset_default(struct cc_algo *remove_cc) 18478b01840SLawrence Stewart { 18578b01840SLawrence Stewart VNET_ITERATOR_DECL(vnet_iter); 18678b01840SLawrence Stewart 18778b01840SLawrence Stewart CC_LIST_LOCK_ASSERT(); 18878b01840SLawrence Stewart 18978b01840SLawrence Stewart VNET_LIST_RLOCK_NOSLEEP(); 19078b01840SLawrence Stewart VNET_FOREACH(vnet_iter) { 19178b01840SLawrence Stewart CURVNET_SET(vnet_iter); 19278b01840SLawrence Stewart if (strncmp(CC_DEFAULT()->name, remove_cc->name, 19378b01840SLawrence Stewart TCP_CA_NAME_MAX) == 0) 19478b01840SLawrence Stewart V_default_cc_ptr = &newreno_cc_algo; 19578b01840SLawrence Stewart CURVNET_RESTORE(); 19678b01840SLawrence Stewart } 19778b01840SLawrence Stewart VNET_LIST_RUNLOCK_NOSLEEP(); 19878b01840SLawrence Stewart } 19978b01840SLawrence Stewart 20078b01840SLawrence Stewart /* 201dbc42409SLawrence Stewart * Initialise CC subsystem on system boot. 202dbc42409SLawrence Stewart */ 20314f57a8bSLawrence Stewart static void 20414f57a8bSLawrence Stewart cc_init(void) 205dbc42409SLawrence Stewart { 206dbc42409SLawrence Stewart CC_LIST_LOCK_INIT(); 207dbc42409SLawrence Stewart STAILQ_INIT(&cc_list); 208dbc42409SLawrence Stewart } 209dbc42409SLawrence Stewart 210dbc42409SLawrence Stewart /* 211dbc42409SLawrence Stewart * Returns non-zero on success, 0 on failure. 212dbc42409SLawrence Stewart */ 213dbc42409SLawrence Stewart int 214dbc42409SLawrence Stewart cc_deregister_algo(struct cc_algo *remove_cc) 215dbc42409SLawrence Stewart { 216dbc42409SLawrence Stewart struct cc_algo *funcs, *tmpfuncs; 217dbc42409SLawrence Stewart int err; 218dbc42409SLawrence Stewart 219dbc42409SLawrence Stewart err = ENOENT; 220dbc42409SLawrence Stewart 221dbc42409SLawrence Stewart /* Never allow newreno to be deregistered. 
*/ 222dbc42409SLawrence Stewart if (&newreno_cc_algo == remove_cc) 223dbc42409SLawrence Stewart return (EPERM); 224dbc42409SLawrence Stewart 225dbc42409SLawrence Stewart /* Remove algo from cc_list so that new connections can't use it. */ 226dbc42409SLawrence Stewart CC_LIST_WLOCK(); 227dbc42409SLawrence Stewart STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { 228dbc42409SLawrence Stewart if (funcs == remove_cc) { 22978b01840SLawrence Stewart cc_checkreset_default(remove_cc); 230dbc42409SLawrence Stewart STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); 231dbc42409SLawrence Stewart err = 0; 232dbc42409SLawrence Stewart break; 233dbc42409SLawrence Stewart } 234dbc42409SLawrence Stewart } 235dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 236dbc42409SLawrence Stewart 23799065ae6SLawrence Stewart if (!err) 238dbc42409SLawrence Stewart /* 23999065ae6SLawrence Stewart * XXXLAS: 24099065ae6SLawrence Stewart * - We may need to handle non-zero return values in future. 24199065ae6SLawrence Stewart * - If we add CC framework support for protocols other than 24299065ae6SLawrence Stewart * TCP, we may want a more generic way to handle this step. 243dbc42409SLawrence Stewart */ 24499065ae6SLawrence Stewart tcp_ccalgounload(remove_cc); 245dbc42409SLawrence Stewart 246dbc42409SLawrence Stewart return (err); 247dbc42409SLawrence Stewart } 248dbc42409SLawrence Stewart 249dbc42409SLawrence Stewart /* 250dbc42409SLawrence Stewart * Returns 0 on success, non-zero on failure. 
251dbc42409SLawrence Stewart */ 252dbc42409SLawrence Stewart int 253dbc42409SLawrence Stewart cc_register_algo(struct cc_algo *add_cc) 254dbc42409SLawrence Stewart { 255dbc42409SLawrence Stewart struct cc_algo *funcs; 256dbc42409SLawrence Stewart int err; 257dbc42409SLawrence Stewart 258dbc42409SLawrence Stewart err = 0; 259dbc42409SLawrence Stewart 260dbc42409SLawrence Stewart /* 261dbc42409SLawrence Stewart * Iterate over list of registered CC algorithms and make sure 262dbc42409SLawrence Stewart * we're not trying to add a duplicate. 263dbc42409SLawrence Stewart */ 264dbc42409SLawrence Stewart CC_LIST_WLOCK(); 265dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 266dbc42409SLawrence Stewart if (funcs == add_cc || strncmp(funcs->name, add_cc->name, 267dbc42409SLawrence Stewart TCP_CA_NAME_MAX) == 0) 268dbc42409SLawrence Stewart err = EEXIST; 269dbc42409SLawrence Stewart } 270dbc42409SLawrence Stewart 271dbc42409SLawrence Stewart if (!err) 272dbc42409SLawrence Stewart STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); 273dbc42409SLawrence Stewart 274dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 275dbc42409SLawrence Stewart 276dbc42409SLawrence Stewart return (err); 277dbc42409SLawrence Stewart } 278dbc42409SLawrence Stewart 279dbc42409SLawrence Stewart /* 280dbc42409SLawrence Stewart * Handles kld related events. Returns 0 on success, non-zero on failure. 
281dbc42409SLawrence Stewart */ 282dbc42409SLawrence Stewart int 283dbc42409SLawrence Stewart cc_modevent(module_t mod, int event_type, void *data) 284dbc42409SLawrence Stewart { 285dbc42409SLawrence Stewart struct cc_algo *algo; 286dbc42409SLawrence Stewart int err; 287dbc42409SLawrence Stewart 288dbc42409SLawrence Stewart err = 0; 289dbc42409SLawrence Stewart algo = (struct cc_algo *)data; 290dbc42409SLawrence Stewart 291dbc42409SLawrence Stewart switch(event_type) { 292dbc42409SLawrence Stewart case MOD_LOAD: 293dbc42409SLawrence Stewart if (algo->mod_init != NULL) 294dbc42409SLawrence Stewart err = algo->mod_init(); 295dbc42409SLawrence Stewart if (!err) 296dbc42409SLawrence Stewart err = cc_register_algo(algo); 297dbc42409SLawrence Stewart break; 298dbc42409SLawrence Stewart 299dbc42409SLawrence Stewart case MOD_QUIESCE: 300dbc42409SLawrence Stewart case MOD_SHUTDOWN: 301dbc42409SLawrence Stewart case MOD_UNLOAD: 302dbc42409SLawrence Stewart err = cc_deregister_algo(algo); 303dbc42409SLawrence Stewart if (!err && algo->mod_destroy != NULL) 304dbc42409SLawrence Stewart algo->mod_destroy(); 305dbc42409SLawrence Stewart if (err == ENOENT) 306dbc42409SLawrence Stewart err = 0; 307dbc42409SLawrence Stewart break; 308dbc42409SLawrence Stewart 309dbc42409SLawrence Stewart default: 310dbc42409SLawrence Stewart err = EINVAL; 311dbc42409SLawrence Stewart break; 312dbc42409SLawrence Stewart } 313dbc42409SLawrence Stewart 314dbc42409SLawrence Stewart return (err); 315dbc42409SLawrence Stewart } 316dbc42409SLawrence Stewart 31714f57a8bSLawrence Stewart SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); 31814f57a8bSLawrence Stewart 319dbc42409SLawrence Stewart /* Declare sysctl tree and populate it. 
*/ 320*7029da5cSPawel Biernacki SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 321439e76ecSBrad Davis "Congestion control related settings"); 322dbc42409SLawrence Stewart 3236df8a710SGleb Smirnoff SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, 324*7029da5cSPawel Biernacki CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 325*7029da5cSPawel Biernacki NULL, 0, cc_default_algo, "A", 326*7029da5cSPawel Biernacki "Default congestion control algorithm"); 327dbc42409SLawrence Stewart 328*7029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, 329*7029da5cSPawel Biernacki CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 330dbc42409SLawrence Stewart NULL, 0, cc_list_available, "A", 331439e76ecSBrad Davis "List available congestion control algorithms"); 332370efe5aSLawrence Stewart 333370efe5aSLawrence Stewart VNET_DEFINE(int, cc_do_abe) = 0; 334370efe5aSLawrence Stewart SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW, 335370efe5aSLawrence Stewart &VNET_NAME(cc_do_abe), 0, 336370efe5aSLawrence Stewart "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)"); 337370efe5aSLawrence Stewart 338370efe5aSLawrence Stewart VNET_DEFINE(int, cc_abe_frlossreduce) = 0; 339370efe5aSLawrence Stewart SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW, 340370efe5aSLawrence Stewart &VNET_NAME(cc_abe_frlossreduce), 0, 341370efe5aSLawrence Stewart "Apply standard beta instead of ABE-beta during ECN-signalled congestion " 342370efe5aSLawrence Stewart "recovery episodes if loss also needs to be repaired"); 343