1dbc42409SLawrence Stewart /*- 2dbc42409SLawrence Stewart * Copyright (c) 2007-2008 3dbc42409SLawrence Stewart * Swinburne University of Technology, Melbourne, Australia. 4dbc42409SLawrence Stewart * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 5dbc42409SLawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation 6dbc42409SLawrence Stewart * All rights reserved. 7dbc42409SLawrence Stewart * 8dbc42409SLawrence Stewart * This software was developed at the Centre for Advanced Internet 9891b8ed4SLawrence Stewart * Architectures, Swinburne University of Technology, by Lawrence Stewart and 10891b8ed4SLawrence Stewart * James Healy, made possible in part by a grant from the Cisco University 11891b8ed4SLawrence Stewart * Research Program Fund at Community Foundation Silicon Valley. 12dbc42409SLawrence Stewart * 13dbc42409SLawrence Stewart * Portions of this software were developed at the Centre for Advanced 14dbc42409SLawrence Stewart * Internet Architectures, Swinburne University of Technology, Melbourne, 15dbc42409SLawrence Stewart * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 16dbc42409SLawrence Stewart * 17dbc42409SLawrence Stewart * Redistribution and use in source and binary forms, with or without 18dbc42409SLawrence Stewart * modification, are permitted provided that the following conditions 19dbc42409SLawrence Stewart * are met: 20dbc42409SLawrence Stewart * 1. Redistributions of source code must retain the above copyright 21dbc42409SLawrence Stewart * notice, this list of conditions and the following disclaimer. 22dbc42409SLawrence Stewart * 2. Redistributions in binary form must reproduce the above copyright 23dbc42409SLawrence Stewart * notice, this list of conditions and the following disclaimer in the 24dbc42409SLawrence Stewart * documentation and/or other materials provided with the distribution. 25dbc42409SLawrence Stewart * 26dbc42409SLawrence Stewart * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 27dbc42409SLawrence Stewart * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28dbc42409SLawrence Stewart * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29dbc42409SLawrence Stewart * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 30dbc42409SLawrence Stewart * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31dbc42409SLawrence Stewart * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32dbc42409SLawrence Stewart * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33dbc42409SLawrence Stewart * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34dbc42409SLawrence Stewart * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35dbc42409SLawrence Stewart * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36dbc42409SLawrence Stewart * SUCH DAMAGE. 37dbc42409SLawrence Stewart */ 38dbc42409SLawrence Stewart 39dbc42409SLawrence Stewart /* 40dbc42409SLawrence Stewart * This software was first released in 2007 by James Healy and Lawrence Stewart 41891b8ed4SLawrence Stewart * whilst working on the NewTCP research project at Swinburne University of 42891b8ed4SLawrence Stewart * Technology's Centre for Advanced Internet Architectures, Melbourne, 43891b8ed4SLawrence Stewart * Australia, which was made possible in part by a grant from the Cisco 44891b8ed4SLawrence Stewart * University Research Program Fund at Community Foundation Silicon Valley. 45891b8ed4SLawrence Stewart * More details are available at: 46dbc42409SLawrence Stewart * http://caia.swin.edu.au/urp/newtcp/ 47dbc42409SLawrence Stewart */ 48dbc42409SLawrence Stewart 49dbc42409SLawrence Stewart #include <sys/cdefs.h> 50dbc42409SLawrence Stewart __FBSDID("$FreeBSD$"); 51dbc42409SLawrence Stewart 52dbc42409SLawrence Stewart #include <sys/param.h> 53dbc42409SLawrence Stewart #include <sys/kernel.h> 54dbc42409SLawrence Stewart #include <sys/libkern.h> 55dbc42409SLawrence Stewart #include <sys/lock.h> 56dbc42409SLawrence Stewart #include <sys/malloc.h> 57dbc42409SLawrence Stewart #include <sys/module.h> 58dbc42409SLawrence Stewart #include <sys/mutex.h> 59dbc42409SLawrence Stewart #include <sys/queue.h> 60dbc42409SLawrence Stewart #include <sys/rwlock.h> 61dbc42409SLawrence Stewart #include <sys/sbuf.h> 62dbc42409SLawrence Stewart #include <sys/socket.h> 63dbc42409SLawrence Stewart #include <sys/socketvar.h> 64dbc42409SLawrence Stewart #include <sys/sysctl.h> 65dbc42409SLawrence Stewart 66b66d74c1SGleb Smirnoff #include <net/vnet.h> 67dbc42409SLawrence Stewart 68dbc42409SLawrence Stewart #include <netinet/in.h> 69dbc42409SLawrence Stewart #include <netinet/in_pcb.h> 702de3e790SGleb Smirnoff #include <netinet/tcp.h> 71dbc42409SLawrence Stewart #include <netinet/tcp_var.h> 72*4644fda3SGleb Smirnoff #include <netinet/cc/cc.h> 73dbc42409SLawrence Stewart 74dbc42409SLawrence Stewart #include <netinet/cc/cc_module.h> 75dbc42409SLawrence Stewart 76dbc42409SLawrence Stewart /* 77dbc42409SLawrence Stewart * List of available cc algorithms on the current system. First element 78dbc42409SLawrence Stewart * is used as the system default CC algorithm. 79dbc42409SLawrence Stewart */ 80dbc42409SLawrence Stewart struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); 81dbc42409SLawrence Stewart 82dbc42409SLawrence Stewart /* Protects the cc_list TAILQ. */ 83dbc42409SLawrence Stewart struct rwlock cc_list_lock; 84dbc42409SLawrence Stewart 8578b01840SLawrence Stewart VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo; 86dbc42409SLawrence Stewart 87dbc42409SLawrence Stewart /* 88dbc42409SLawrence Stewart * Sysctl handler to show and change the default CC algorithm. 89dbc42409SLawrence Stewart */ 90dbc42409SLawrence Stewart static int 91dbc42409SLawrence Stewart cc_default_algo(SYSCTL_HANDLER_ARGS) 92dbc42409SLawrence Stewart { 93ebf92e86SLawrence Stewart char default_cc[TCP_CA_NAME_MAX]; 94dbc42409SLawrence Stewart struct cc_algo *funcs; 950e1152fcSHans Petter Selasky int error; 96dbc42409SLawrence Stewart 970e1152fcSHans Petter Selasky /* Get the current default: */ 98dbc42409SLawrence Stewart CC_LIST_RLOCK(); 990e1152fcSHans Petter Selasky strlcpy(default_cc, CC_DEFAULT()->name, sizeof(default_cc)); 100dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 1010e1152fcSHans Petter Selasky 1020e1152fcSHans Petter Selasky error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req); 1030e1152fcSHans Petter Selasky 1040e1152fcSHans Petter Selasky /* Check for error or no change */ 1050e1152fcSHans Petter Selasky if (error != 0 || req->newptr == NULL) 1060e1152fcSHans Petter Selasky goto done; 1070e1152fcSHans Petter Selasky 1080e1152fcSHans Petter Selasky error = ESRCH; 1090e1152fcSHans Petter Selasky 110dbc42409SLawrence Stewart /* Find algo with specified name and set it to default. */ 11178b01840SLawrence Stewart CC_LIST_RLOCK(); 112dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 1130e1152fcSHans Petter Selasky if (strncmp(default_cc, funcs->name, sizeof(default_cc))) 11460a945f9SHans Petter Selasky continue; 11578b01840SLawrence Stewart V_default_cc_ptr = funcs; 1160e1152fcSHans Petter Selasky error = 0; 1170e1152fcSHans Petter Selasky break; 118dbc42409SLawrence Stewart } 11978b01840SLawrence Stewart CC_LIST_RUNLOCK(); 1200e1152fcSHans Petter Selasky done: 1210e1152fcSHans Petter Selasky return (error); 122dbc42409SLawrence Stewart } 123dbc42409SLawrence Stewart 124dbc42409SLawrence Stewart /* 125dbc42409SLawrence Stewart * Sysctl handler to display the list of available CC algorithms. 126dbc42409SLawrence Stewart */ 127dbc42409SLawrence Stewart static int 128dbc42409SLawrence Stewart cc_list_available(SYSCTL_HANDLER_ARGS) 129dbc42409SLawrence Stewart { 130dbc42409SLawrence Stewart struct cc_algo *algo; 131dbc42409SLawrence Stewart struct sbuf *s; 132a66ac850SLawrence Stewart int err, first, nalgos; 133dbc42409SLawrence Stewart 134a66ac850SLawrence Stewart err = nalgos = 0; 135dbc42409SLawrence Stewart first = 1; 136a66ac850SLawrence Stewart 137a66ac850SLawrence Stewart CC_LIST_RLOCK(); 138a66ac850SLawrence Stewart STAILQ_FOREACH(algo, &cc_list, entries) { 139a66ac850SLawrence Stewart nalgos++; 140a66ac850SLawrence Stewart } 141a66ac850SLawrence Stewart CC_LIST_RUNLOCK(); 142a66ac850SLawrence Stewart 143a66ac850SLawrence Stewart s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN); 144dbc42409SLawrence Stewart 145dbc42409SLawrence Stewart if (s == NULL) 146dbc42409SLawrence Stewart return (ENOMEM); 147dbc42409SLawrence Stewart 148a66ac850SLawrence Stewart /* 149a66ac850SLawrence Stewart * It is theoretically possible for the CC list to have grown in size 150a66ac850SLawrence Stewart * since the call to sbuf_new() and therefore for the sbuf to be too 151a66ac850SLawrence Stewart * small. If this were to happen (incredibly unlikely), the sbuf will 152a66ac850SLawrence Stewart * reach an overflow condition, sbuf_printf() will return an error and 153a66ac850SLawrence Stewart * the sysctl will fail gracefully. 154a66ac850SLawrence Stewart */ 155dbc42409SLawrence Stewart CC_LIST_RLOCK(); 156dbc42409SLawrence Stewart STAILQ_FOREACH(algo, &cc_list, entries) { 157dbc42409SLawrence Stewart err = sbuf_printf(s, first ? "%s" : ", %s", algo->name); 158a66ac850SLawrence Stewart if (err) { 159a66ac850SLawrence Stewart /* Sbuf overflow condition. */ 160a66ac850SLawrence Stewart err = EOVERFLOW; 161dbc42409SLawrence Stewart break; 162a66ac850SLawrence Stewart } 163dbc42409SLawrence Stewart first = 0; 164dbc42409SLawrence Stewart } 165dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 166dbc42409SLawrence Stewart 167dbc42409SLawrence Stewart if (!err) { 168dbc42409SLawrence Stewart sbuf_finish(s); 169e167cb89SHans Petter Selasky err = sysctl_handle_string(oidp, sbuf_data(s), 0, req); 170dbc42409SLawrence Stewart } 171dbc42409SLawrence Stewart 172dbc42409SLawrence Stewart sbuf_delete(s); 173dbc42409SLawrence Stewart return (err); 174dbc42409SLawrence Stewart } 175dbc42409SLawrence Stewart 176dbc42409SLawrence Stewart /* 17778b01840SLawrence Stewart * Reset the default CC algo to NewReno for any netstack which is using the algo 17878b01840SLawrence Stewart * that is about to go away as its default. 17978b01840SLawrence Stewart */ 18078b01840SLawrence Stewart static void 18178b01840SLawrence Stewart cc_checkreset_default(struct cc_algo *remove_cc) 18278b01840SLawrence Stewart { 18378b01840SLawrence Stewart VNET_ITERATOR_DECL(vnet_iter); 18478b01840SLawrence Stewart 18578b01840SLawrence Stewart CC_LIST_LOCK_ASSERT(); 18678b01840SLawrence Stewart 18778b01840SLawrence Stewart VNET_LIST_RLOCK_NOSLEEP(); 18878b01840SLawrence Stewart VNET_FOREACH(vnet_iter) { 18978b01840SLawrence Stewart CURVNET_SET(vnet_iter); 19078b01840SLawrence Stewart if (strncmp(CC_DEFAULT()->name, remove_cc->name, 19178b01840SLawrence Stewart TCP_CA_NAME_MAX) == 0) 19278b01840SLawrence Stewart V_default_cc_ptr = &newreno_cc_algo; 19378b01840SLawrence Stewart CURVNET_RESTORE(); 19478b01840SLawrence Stewart } 19578b01840SLawrence Stewart VNET_LIST_RUNLOCK_NOSLEEP(); 19678b01840SLawrence Stewart } 19778b01840SLawrence Stewart 19878b01840SLawrence Stewart /* 199dbc42409SLawrence Stewart * Initialise CC subsystem on system boot. 200dbc42409SLawrence Stewart */ 20114f57a8bSLawrence Stewart static void 20214f57a8bSLawrence Stewart cc_init(void) 203dbc42409SLawrence Stewart { 204dbc42409SLawrence Stewart CC_LIST_LOCK_INIT(); 205dbc42409SLawrence Stewart STAILQ_INIT(&cc_list); 206dbc42409SLawrence Stewart } 207dbc42409SLawrence Stewart 208dbc42409SLawrence Stewart /* 209dbc42409SLawrence Stewart * Returns non-zero on success, 0 on failure. 210dbc42409SLawrence Stewart */ 211dbc42409SLawrence Stewart int 212dbc42409SLawrence Stewart cc_deregister_algo(struct cc_algo *remove_cc) 213dbc42409SLawrence Stewart { 214dbc42409SLawrence Stewart struct cc_algo *funcs, *tmpfuncs; 215dbc42409SLawrence Stewart int err; 216dbc42409SLawrence Stewart 217dbc42409SLawrence Stewart err = ENOENT; 218dbc42409SLawrence Stewart 219dbc42409SLawrence Stewart /* Never allow newreno to be deregistered. */ 220dbc42409SLawrence Stewart if (&newreno_cc_algo == remove_cc) 221dbc42409SLawrence Stewart return (EPERM); 222dbc42409SLawrence Stewart 223dbc42409SLawrence Stewart /* Remove algo from cc_list so that new connections can't use it. */ 224dbc42409SLawrence Stewart CC_LIST_WLOCK(); 225dbc42409SLawrence Stewart STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { 226dbc42409SLawrence Stewart if (funcs == remove_cc) { 22778b01840SLawrence Stewart cc_checkreset_default(remove_cc); 228dbc42409SLawrence Stewart STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); 229dbc42409SLawrence Stewart err = 0; 230dbc42409SLawrence Stewart break; 231dbc42409SLawrence Stewart } 232dbc42409SLawrence Stewart } 233dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 234dbc42409SLawrence Stewart 23599065ae6SLawrence Stewart if (!err) 236dbc42409SLawrence Stewart /* 23799065ae6SLawrence Stewart * XXXLAS: 23899065ae6SLawrence Stewart * - We may need to handle non-zero return values in future. 23999065ae6SLawrence Stewart * - If we add CC framework support for protocols other than 24099065ae6SLawrence Stewart * TCP, we may want a more generic way to handle this step. 241dbc42409SLawrence Stewart */ 24299065ae6SLawrence Stewart tcp_ccalgounload(remove_cc); 243dbc42409SLawrence Stewart 244dbc42409SLawrence Stewart return (err); 245dbc42409SLawrence Stewart } 246dbc42409SLawrence Stewart 247dbc42409SLawrence Stewart /* 248dbc42409SLawrence Stewart * Returns 0 on success, non-zero on failure. 249dbc42409SLawrence Stewart */ 250dbc42409SLawrence Stewart int 251dbc42409SLawrence Stewart cc_register_algo(struct cc_algo *add_cc) 252dbc42409SLawrence Stewart { 253dbc42409SLawrence Stewart struct cc_algo *funcs; 254dbc42409SLawrence Stewart int err; 255dbc42409SLawrence Stewart 256dbc42409SLawrence Stewart err = 0; 257dbc42409SLawrence Stewart 258dbc42409SLawrence Stewart /* 259dbc42409SLawrence Stewart * Iterate over list of registered CC algorithms and make sure 260dbc42409SLawrence Stewart * we're not trying to add a duplicate. 261dbc42409SLawrence Stewart */ 262dbc42409SLawrence Stewart CC_LIST_WLOCK(); 263dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 264dbc42409SLawrence Stewart if (funcs == add_cc || strncmp(funcs->name, add_cc->name, 265dbc42409SLawrence Stewart TCP_CA_NAME_MAX) == 0) 266dbc42409SLawrence Stewart err = EEXIST; 267dbc42409SLawrence Stewart } 268dbc42409SLawrence Stewart 269dbc42409SLawrence Stewart if (!err) 270dbc42409SLawrence Stewart STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); 271dbc42409SLawrence Stewart 272dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 273dbc42409SLawrence Stewart 274dbc42409SLawrence Stewart return (err); 275dbc42409SLawrence Stewart } 276dbc42409SLawrence Stewart 277dbc42409SLawrence Stewart /* 278dbc42409SLawrence Stewart * Handles kld related events. Returns 0 on success, non-zero on failure. 279dbc42409SLawrence Stewart */ 280dbc42409SLawrence Stewart int 281dbc42409SLawrence Stewart cc_modevent(module_t mod, int event_type, void *data) 282dbc42409SLawrence Stewart { 283dbc42409SLawrence Stewart struct cc_algo *algo; 284dbc42409SLawrence Stewart int err; 285dbc42409SLawrence Stewart 286dbc42409SLawrence Stewart err = 0; 287dbc42409SLawrence Stewart algo = (struct cc_algo *)data; 288dbc42409SLawrence Stewart 289dbc42409SLawrence Stewart switch(event_type) { 290dbc42409SLawrence Stewart case MOD_LOAD: 291dbc42409SLawrence Stewart if (algo->mod_init != NULL) 292dbc42409SLawrence Stewart err = algo->mod_init(); 293dbc42409SLawrence Stewart if (!err) 294dbc42409SLawrence Stewart err = cc_register_algo(algo); 295dbc42409SLawrence Stewart break; 296dbc42409SLawrence Stewart 297dbc42409SLawrence Stewart case MOD_QUIESCE: 298dbc42409SLawrence Stewart case MOD_SHUTDOWN: 299dbc42409SLawrence Stewart case MOD_UNLOAD: 300dbc42409SLawrence Stewart err = cc_deregister_algo(algo); 301dbc42409SLawrence Stewart if (!err && algo->mod_destroy != NULL) 302dbc42409SLawrence Stewart algo->mod_destroy(); 303dbc42409SLawrence Stewart if (err == ENOENT) 304dbc42409SLawrence Stewart err = 0; 305dbc42409SLawrence Stewart break; 306dbc42409SLawrence Stewart 307dbc42409SLawrence Stewart default: 308dbc42409SLawrence Stewart err = EINVAL; 309dbc42409SLawrence Stewart break; 310dbc42409SLawrence Stewart } 311dbc42409SLawrence Stewart 312dbc42409SLawrence Stewart return (err); 313dbc42409SLawrence Stewart } 314dbc42409SLawrence Stewart 31514f57a8bSLawrence Stewart SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); 31614f57a8bSLawrence Stewart 317dbc42409SLawrence Stewart /* Declare sysctl tree and populate it. */ 318dbc42409SLawrence Stewart SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL, 319dbc42409SLawrence Stewart "congestion control related settings"); 320dbc42409SLawrence Stewart 3216df8a710SGleb Smirnoff SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, 3226df8a710SGleb Smirnoff CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW, 323dbc42409SLawrence Stewart NULL, 0, cc_default_algo, "A", "default congestion control algorithm"); 324dbc42409SLawrence Stewart 325dbc42409SLawrence Stewart SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD, 326dbc42409SLawrence Stewart NULL, 0, cc_list_available, "A", 327dbc42409SLawrence Stewart "list available congestion control algorithms"); 328