1dbc42409SLawrence Stewart /*- 2dbc42409SLawrence Stewart * Copyright (c) 2007-2008 3dbc42409SLawrence Stewart * Swinburne University of Technology, Melbourne, Australia. 4dbc42409SLawrence Stewart * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 5dbc42409SLawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation 6dbc42409SLawrence Stewart * All rights reserved. 7dbc42409SLawrence Stewart * 8dbc42409SLawrence Stewart * This software was developed at the Centre for Advanced Internet 9dbc42409SLawrence Stewart * Architectures, Swinburne University, by Lawrence Stewart and James Healy, 10dbc42409SLawrence Stewart * made possible in part by a grant from the Cisco University Research Program 11dbc42409SLawrence Stewart * Fund at Community Foundation Silicon Valley. 12dbc42409SLawrence Stewart * 13dbc42409SLawrence Stewart * Portions of this software were developed at the Centre for Advanced 14dbc42409SLawrence Stewart * Internet Architectures, Swinburne University of Technology, Melbourne, 15dbc42409SLawrence Stewart * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 16dbc42409SLawrence Stewart * 17dbc42409SLawrence Stewart * Redistribution and use in source and binary forms, with or without 18dbc42409SLawrence Stewart * modification, are permitted provided that the following conditions 19dbc42409SLawrence Stewart * are met: 20dbc42409SLawrence Stewart * 1. Redistributions of source code must retain the above copyright 21dbc42409SLawrence Stewart * notice, this list of conditions and the following disclaimer. 22dbc42409SLawrence Stewart * 2. Redistributions in binary form must reproduce the above copyright 23dbc42409SLawrence Stewart * notice, this list of conditions and the following disclaimer in the 24dbc42409SLawrence Stewart * documentation and/or other materials provided with the distribution. 25dbc42409SLawrence Stewart * 26dbc42409SLawrence Stewart * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 27dbc42409SLawrence Stewart * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28dbc42409SLawrence Stewart * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29dbc42409SLawrence Stewart * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 30dbc42409SLawrence Stewart * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31dbc42409SLawrence Stewart * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32dbc42409SLawrence Stewart * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33dbc42409SLawrence Stewart * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34dbc42409SLawrence Stewart * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35dbc42409SLawrence Stewart * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36dbc42409SLawrence Stewart * SUCH DAMAGE. 37dbc42409SLawrence Stewart */ 38dbc42409SLawrence Stewart 39dbc42409SLawrence Stewart /* 40dbc42409SLawrence Stewart * This software was first released in 2007 by James Healy and Lawrence Stewart 41dbc42409SLawrence Stewart * whilst working on the NewTCP research project at Swinburne University's 42dbc42409SLawrence Stewart * Centre for Advanced Internet Architectures, Melbourne, Australia, which was 43dbc42409SLawrence Stewart * made possible in part by a grant from the Cisco University Research Program 44dbc42409SLawrence Stewart * Fund at Community Foundation Silicon Valley. More details are available at: 45dbc42409SLawrence Stewart * http://caia.swin.edu.au/urp/newtcp/ 46dbc42409SLawrence Stewart */ 47dbc42409SLawrence Stewart 48dbc42409SLawrence Stewart #include <sys/cdefs.h> 49dbc42409SLawrence Stewart __FBSDID("$FreeBSD$"); 50dbc42409SLawrence Stewart 51dbc42409SLawrence Stewart #include <sys/param.h> 52dbc42409SLawrence Stewart #include <sys/kernel.h> 53dbc42409SLawrence Stewart #include <sys/libkern.h> 54dbc42409SLawrence Stewart #include <sys/lock.h> 55dbc42409SLawrence Stewart #include <sys/malloc.h> 56dbc42409SLawrence Stewart #include <sys/module.h> 57dbc42409SLawrence Stewart #include <sys/mutex.h> 58dbc42409SLawrence Stewart #include <sys/queue.h> 59dbc42409SLawrence Stewart #include <sys/rwlock.h> 60dbc42409SLawrence Stewart #include <sys/sbuf.h> 61dbc42409SLawrence Stewart #include <sys/socket.h> 62dbc42409SLawrence Stewart #include <sys/socketvar.h> 63dbc42409SLawrence Stewart #include <sys/sysctl.h> 64dbc42409SLawrence Stewart 65dbc42409SLawrence Stewart #include <net/if.h> 66dbc42409SLawrence Stewart #include <net/if_var.h> 67dbc42409SLawrence Stewart 68dbc42409SLawrence Stewart #include <netinet/cc.h> 69dbc42409SLawrence Stewart #include <netinet/in.h> 70dbc42409SLawrence Stewart #include <netinet/in_pcb.h> 71dbc42409SLawrence Stewart #include <netinet/tcp_var.h> 72dbc42409SLawrence Stewart 73dbc42409SLawrence Stewart #include <netinet/cc/cc_module.h> 74dbc42409SLawrence Stewart 75dbc42409SLawrence Stewart /* 76dbc42409SLawrence Stewart * List of available cc algorithms on the current system. First element 77dbc42409SLawrence Stewart * is used as the system default CC algorithm. 78dbc42409SLawrence Stewart */ 79dbc42409SLawrence Stewart struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); 80dbc42409SLawrence Stewart 81dbc42409SLawrence Stewart /* Protects the cc_list TAILQ. */ 82dbc42409SLawrence Stewart struct rwlock cc_list_lock; 83dbc42409SLawrence Stewart 84dbc42409SLawrence Stewart /* 85dbc42409SLawrence Stewart * Set the default CC algorithm to new_default. The default is identified 86dbc42409SLawrence Stewart * by being the first element in the cc_list TAILQ. 87dbc42409SLawrence Stewart */ 88dbc42409SLawrence Stewart static void 89dbc42409SLawrence Stewart cc_set_default(struct cc_algo *new_default) 90dbc42409SLawrence Stewart { 91dbc42409SLawrence Stewart CC_LIST_WLOCK_ASSERT(); 92dbc42409SLawrence Stewart 93dbc42409SLawrence Stewart /* 94dbc42409SLawrence Stewart * Make the requested system default CC algorithm the first element in 95dbc42409SLawrence Stewart * the list if it isn't already. 96dbc42409SLawrence Stewart */ 97dbc42409SLawrence Stewart if (new_default != CC_DEFAULT()) { 98dbc42409SLawrence Stewart STAILQ_REMOVE(&cc_list, new_default, cc_algo, entries); 99dbc42409SLawrence Stewart STAILQ_INSERT_HEAD(&cc_list, new_default, entries); 100dbc42409SLawrence Stewart } 101dbc42409SLawrence Stewart } 102dbc42409SLawrence Stewart 103dbc42409SLawrence Stewart /* 104dbc42409SLawrence Stewart * Sysctl handler to show and change the default CC algorithm. 105dbc42409SLawrence Stewart */ 106dbc42409SLawrence Stewart static int 107dbc42409SLawrence Stewart cc_default_algo(SYSCTL_HANDLER_ARGS) 108dbc42409SLawrence Stewart { 109dbc42409SLawrence Stewart struct cc_algo *funcs; 110dbc42409SLawrence Stewart int err, found; 111dbc42409SLawrence Stewart 112dbc42409SLawrence Stewart err = found = 0; 113dbc42409SLawrence Stewart 114dbc42409SLawrence Stewart if (req->newptr == NULL) { 115dbc42409SLawrence Stewart char default_cc[TCP_CA_NAME_MAX]; 116dbc42409SLawrence Stewart 117dbc42409SLawrence Stewart /* Just print the current default. */ 118dbc42409SLawrence Stewart CC_LIST_RLOCK(); 119dbc42409SLawrence Stewart strlcpy(default_cc, CC_DEFAULT()->name, TCP_CA_NAME_MAX); 120dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 121dbc42409SLawrence Stewart err = sysctl_handle_string(oidp, default_cc, 1, req); 122dbc42409SLawrence Stewart } else { 123dbc42409SLawrence Stewart /* Find algo with specified name and set it to default. */ 124dbc42409SLawrence Stewart CC_LIST_WLOCK(); 125dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 126dbc42409SLawrence Stewart if (strncmp((char *)req->newptr, funcs->name, 127dbc42409SLawrence Stewart TCP_CA_NAME_MAX) == 0) { 128dbc42409SLawrence Stewart found = 1; 129dbc42409SLawrence Stewart cc_set_default(funcs); 130dbc42409SLawrence Stewart } 131dbc42409SLawrence Stewart } 132dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 133dbc42409SLawrence Stewart 134dbc42409SLawrence Stewart if (!found) 135dbc42409SLawrence Stewart err = ESRCH; 136dbc42409SLawrence Stewart } 137dbc42409SLawrence Stewart 138dbc42409SLawrence Stewart return (err); 139dbc42409SLawrence Stewart } 140dbc42409SLawrence Stewart 141dbc42409SLawrence Stewart /* 142dbc42409SLawrence Stewart * Sysctl handler to display the list of available CC algorithms. 143dbc42409SLawrence Stewart */ 144dbc42409SLawrence Stewart static int 145dbc42409SLawrence Stewart cc_list_available(SYSCTL_HANDLER_ARGS) 146dbc42409SLawrence Stewart { 147dbc42409SLawrence Stewart struct cc_algo *algo; 148dbc42409SLawrence Stewart struct sbuf *s; 149dbc42409SLawrence Stewart int err, first; 150dbc42409SLawrence Stewart 151dbc42409SLawrence Stewart err = 0; 152dbc42409SLawrence Stewart first = 1; 153dbc42409SLawrence Stewart s = sbuf_new(NULL, NULL, TCP_CA_NAME_MAX, SBUF_AUTOEXTEND); 154dbc42409SLawrence Stewart 155dbc42409SLawrence Stewart if (s == NULL) 156dbc42409SLawrence Stewart return (ENOMEM); 157dbc42409SLawrence Stewart 158dbc42409SLawrence Stewart CC_LIST_RLOCK(); 159dbc42409SLawrence Stewart STAILQ_FOREACH(algo, &cc_list, entries) { 160dbc42409SLawrence Stewart err = sbuf_printf(s, first ? "%s" : ", %s", algo->name); 161dbc42409SLawrence Stewart if (err) 162dbc42409SLawrence Stewart break; 163dbc42409SLawrence Stewart first = 0; 164dbc42409SLawrence Stewart } 165dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 166dbc42409SLawrence Stewart 167dbc42409SLawrence Stewart if (!err) { 168dbc42409SLawrence Stewart sbuf_finish(s); 169dbc42409SLawrence Stewart err = sysctl_handle_string(oidp, sbuf_data(s), 1, req); 170dbc42409SLawrence Stewart } 171dbc42409SLawrence Stewart 172dbc42409SLawrence Stewart sbuf_delete(s); 173dbc42409SLawrence Stewart return (err); 174dbc42409SLawrence Stewart } 175dbc42409SLawrence Stewart 176dbc42409SLawrence Stewart /* 177dbc42409SLawrence Stewart * Initialise CC subsystem on system boot. 178dbc42409SLawrence Stewart */ 179*14f57a8bSLawrence Stewart static void 180*14f57a8bSLawrence Stewart cc_init(void) 181dbc42409SLawrence Stewart { 182dbc42409SLawrence Stewart CC_LIST_LOCK_INIT(); 183dbc42409SLawrence Stewart STAILQ_INIT(&cc_list); 184dbc42409SLawrence Stewart } 185dbc42409SLawrence Stewart 186dbc42409SLawrence Stewart /* 187dbc42409SLawrence Stewart * Returns non-zero on success, 0 on failure. 188dbc42409SLawrence Stewart */ 189dbc42409SLawrence Stewart int 190dbc42409SLawrence Stewart cc_deregister_algo(struct cc_algo *remove_cc) 191dbc42409SLawrence Stewart { 192dbc42409SLawrence Stewart struct cc_algo *funcs, *tmpfuncs; 193dbc42409SLawrence Stewart struct tcpcb *tp; 194dbc42409SLawrence Stewart struct inpcb *inp; 195dbc42409SLawrence Stewart int err; 196dbc42409SLawrence Stewart 197dbc42409SLawrence Stewart err = ENOENT; 198dbc42409SLawrence Stewart 199dbc42409SLawrence Stewart /* Never allow newreno to be deregistered. */ 200dbc42409SLawrence Stewart if (&newreno_cc_algo == remove_cc) 201dbc42409SLawrence Stewart return (EPERM); 202dbc42409SLawrence Stewart 203dbc42409SLawrence Stewart /* Remove algo from cc_list so that new connections can't use it. */ 204dbc42409SLawrence Stewart CC_LIST_WLOCK(); 205dbc42409SLawrence Stewart STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { 206dbc42409SLawrence Stewart if (funcs == remove_cc) { 207dbc42409SLawrence Stewart /* 208dbc42409SLawrence Stewart * If we're removing the current system default, 209dbc42409SLawrence Stewart * reset the default to newreno. 210dbc42409SLawrence Stewart */ 211dbc42409SLawrence Stewart if (strncmp(CC_DEFAULT()->name, remove_cc->name, 212dbc42409SLawrence Stewart TCP_CA_NAME_MAX) == 0) 213dbc42409SLawrence Stewart cc_set_default(&newreno_cc_algo); 214dbc42409SLawrence Stewart 215dbc42409SLawrence Stewart STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); 216dbc42409SLawrence Stewart err = 0; 217dbc42409SLawrence Stewart break; 218dbc42409SLawrence Stewart } 219dbc42409SLawrence Stewart } 220dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 221dbc42409SLawrence Stewart 222dbc42409SLawrence Stewart if (!err) { 223dbc42409SLawrence Stewart /* 224dbc42409SLawrence Stewart * Check all active control blocks and change any that are 225dbc42409SLawrence Stewart * using this algorithm back to newreno. If the algorithm that 226dbc42409SLawrence Stewart * was in use requires cleanup code to be run, call it. 227dbc42409SLawrence Stewart * 228dbc42409SLawrence Stewart * New connections already part way through being initialised 229dbc42409SLawrence Stewart * with the CC algo we're removing will not race with this code 230dbc42409SLawrence Stewart * because the INP_INFO_WLOCK is held during initialisation. 231dbc42409SLawrence Stewart * We therefore don't enter the loop below until the connection 232dbc42409SLawrence Stewart * list has stabilised. 233dbc42409SLawrence Stewart */ 234dbc42409SLawrence Stewart INP_INFO_RLOCK(&V_tcbinfo); 235dbc42409SLawrence Stewart LIST_FOREACH(inp, &V_tcb, inp_list) { 236dbc42409SLawrence Stewart INP_WLOCK(inp); 237dbc42409SLawrence Stewart /* Important to skip tcptw structs. */ 238dbc42409SLawrence Stewart if (!(inp->inp_flags & INP_TIMEWAIT) && 239dbc42409SLawrence Stewart (tp = intotcpcb(inp)) != NULL) { 240dbc42409SLawrence Stewart /* 241dbc42409SLawrence Stewart * By holding INP_WLOCK here, we are 242dbc42409SLawrence Stewart * assured that the connection is not 243dbc42409SLawrence Stewart * currently executing inside the CC 244dbc42409SLawrence Stewart * module's functions i.e. it is safe to 245dbc42409SLawrence Stewart * make the switch back to newreno. 246dbc42409SLawrence Stewart */ 247dbc42409SLawrence Stewart if (CC_ALGO(tp) == remove_cc) { 248dbc42409SLawrence Stewart tmpfuncs = CC_ALGO(tp); 249dbc42409SLawrence Stewart /* Newreno does not require any init. */ 250dbc42409SLawrence Stewart CC_ALGO(tp) = &newreno_cc_algo; 251dbc42409SLawrence Stewart if (tmpfuncs->cb_destroy != NULL) 252dbc42409SLawrence Stewart tmpfuncs->cb_destroy(tp->ccv); 253dbc42409SLawrence Stewart } 254dbc42409SLawrence Stewart } 255dbc42409SLawrence Stewart INP_WUNLOCK(inp); 256dbc42409SLawrence Stewart } 257dbc42409SLawrence Stewart INP_INFO_RUNLOCK(&V_tcbinfo); 258dbc42409SLawrence Stewart } 259dbc42409SLawrence Stewart 260dbc42409SLawrence Stewart return (err); 261dbc42409SLawrence Stewart } 262dbc42409SLawrence Stewart 263dbc42409SLawrence Stewart /* 264dbc42409SLawrence Stewart * Returns 0 on success, non-zero on failure. 265dbc42409SLawrence Stewart */ 266dbc42409SLawrence Stewart int 267dbc42409SLawrence Stewart cc_register_algo(struct cc_algo *add_cc) 268dbc42409SLawrence Stewart { 269dbc42409SLawrence Stewart struct cc_algo *funcs; 270dbc42409SLawrence Stewart int err; 271dbc42409SLawrence Stewart 272dbc42409SLawrence Stewart err = 0; 273dbc42409SLawrence Stewart 274dbc42409SLawrence Stewart /* 275dbc42409SLawrence Stewart * Iterate over list of registered CC algorithms and make sure 276dbc42409SLawrence Stewart * we're not trying to add a duplicate. 277dbc42409SLawrence Stewart */ 278dbc42409SLawrence Stewart CC_LIST_WLOCK(); 279dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 280dbc42409SLawrence Stewart if (funcs == add_cc || strncmp(funcs->name, add_cc->name, 281dbc42409SLawrence Stewart TCP_CA_NAME_MAX) == 0) 282dbc42409SLawrence Stewart err = EEXIST; 283dbc42409SLawrence Stewart } 284dbc42409SLawrence Stewart 285dbc42409SLawrence Stewart if (!err) 286dbc42409SLawrence Stewart STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); 287dbc42409SLawrence Stewart 288dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 289dbc42409SLawrence Stewart 290dbc42409SLawrence Stewart return (err); 291dbc42409SLawrence Stewart } 292dbc42409SLawrence Stewart 293dbc42409SLawrence Stewart /* 294dbc42409SLawrence Stewart * Handles kld related events. Returns 0 on success, non-zero on failure. 295dbc42409SLawrence Stewart */ 296dbc42409SLawrence Stewart int 297dbc42409SLawrence Stewart cc_modevent(module_t mod, int event_type, void *data) 298dbc42409SLawrence Stewart { 299dbc42409SLawrence Stewart struct cc_algo *algo; 300dbc42409SLawrence Stewart int err; 301dbc42409SLawrence Stewart 302dbc42409SLawrence Stewart err = 0; 303dbc42409SLawrence Stewart algo = (struct cc_algo *)data; 304dbc42409SLawrence Stewart 305dbc42409SLawrence Stewart switch(event_type) { 306dbc42409SLawrence Stewart case MOD_LOAD: 307dbc42409SLawrence Stewart if (algo->mod_init != NULL) 308dbc42409SLawrence Stewart err = algo->mod_init(); 309dbc42409SLawrence Stewart if (!err) 310dbc42409SLawrence Stewart err = cc_register_algo(algo); 311dbc42409SLawrence Stewart break; 312dbc42409SLawrence Stewart 313dbc42409SLawrence Stewart case MOD_QUIESCE: 314dbc42409SLawrence Stewart case MOD_SHUTDOWN: 315dbc42409SLawrence Stewart case MOD_UNLOAD: 316dbc42409SLawrence Stewart err = cc_deregister_algo(algo); 317dbc42409SLawrence Stewart if (!err && algo->mod_destroy != NULL) 318dbc42409SLawrence Stewart algo->mod_destroy(); 319dbc42409SLawrence Stewart if (err == ENOENT) 320dbc42409SLawrence Stewart err = 0; 321dbc42409SLawrence Stewart break; 322dbc42409SLawrence Stewart 323dbc42409SLawrence Stewart default: 324dbc42409SLawrence Stewart err = EINVAL; 325dbc42409SLawrence Stewart break; 326dbc42409SLawrence Stewart } 327dbc42409SLawrence Stewart 328dbc42409SLawrence Stewart return (err); 329dbc42409SLawrence Stewart } 330dbc42409SLawrence Stewart 331*14f57a8bSLawrence Stewart SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); 332*14f57a8bSLawrence Stewart 333dbc42409SLawrence Stewart /* Declare sysctl tree and populate it. */ 334dbc42409SLawrence Stewart SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL, 335dbc42409SLawrence Stewart "congestion control related settings"); 336dbc42409SLawrence Stewart 337dbc42409SLawrence Stewart SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW, 338dbc42409SLawrence Stewart NULL, 0, cc_default_algo, "A", "default congestion control algorithm"); 339dbc42409SLawrence Stewart 340dbc42409SLawrence Stewart SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD, 341dbc42409SLawrence Stewart NULL, 0, cc_list_available, "A", 342dbc42409SLawrence Stewart "list available congestion control algorithms"); 343