1*dbc42409SLawrence Stewart /*- 2*dbc42409SLawrence Stewart * Copyright (c) 2007-2008 3*dbc42409SLawrence Stewart * Swinburne University of Technology, Melbourne, Australia. 4*dbc42409SLawrence Stewart * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 5*dbc42409SLawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation 6*dbc42409SLawrence Stewart * All rights reserved. 7*dbc42409SLawrence Stewart * 8*dbc42409SLawrence Stewart * This software was developed at the Centre for Advanced Internet 9*dbc42409SLawrence Stewart * Architectures, Swinburne University, by Lawrence Stewart and James Healy, 10*dbc42409SLawrence Stewart * made possible in part by a grant from the Cisco University Research Program 11*dbc42409SLawrence Stewart * Fund at Community Foundation Silicon Valley. 12*dbc42409SLawrence Stewart * 13*dbc42409SLawrence Stewart * Portions of this software were developed at the Centre for Advanced 14*dbc42409SLawrence Stewart * Internet Architectures, Swinburne University of Technology, Melbourne, 15*dbc42409SLawrence Stewart * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 16*dbc42409SLawrence Stewart * 17*dbc42409SLawrence Stewart * Redistribution and use in source and binary forms, with or without 18*dbc42409SLawrence Stewart * modification, are permitted provided that the following conditions 19*dbc42409SLawrence Stewart * are met: 20*dbc42409SLawrence Stewart * 1. Redistributions of source code must retain the above copyright 21*dbc42409SLawrence Stewart * notice, this list of conditions and the following disclaimer. 22*dbc42409SLawrence Stewart * 2. Redistributions in binary form must reproduce the above copyright 23*dbc42409SLawrence Stewart * notice, this list of conditions and the following disclaimer in the 24*dbc42409SLawrence Stewart * documentation and/or other materials provided with the distribution. 25*dbc42409SLawrence Stewart * 26*dbc42409SLawrence Stewart * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 27*dbc42409SLawrence Stewart * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28*dbc42409SLawrence Stewart * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29*dbc42409SLawrence Stewart * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 30*dbc42409SLawrence Stewart * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31*dbc42409SLawrence Stewart * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32*dbc42409SLawrence Stewart * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33*dbc42409SLawrence Stewart * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34*dbc42409SLawrence Stewart * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35*dbc42409SLawrence Stewart * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36*dbc42409SLawrence Stewart * SUCH DAMAGE. 37*dbc42409SLawrence Stewart */ 38*dbc42409SLawrence Stewart 39*dbc42409SLawrence Stewart /* 40*dbc42409SLawrence Stewart * This software was first released in 2007 by James Healy and Lawrence Stewart 41*dbc42409SLawrence Stewart * whilst working on the NewTCP research project at Swinburne University's 42*dbc42409SLawrence Stewart * Centre for Advanced Internet Architectures, Melbourne, Australia, which was 43*dbc42409SLawrence Stewart * made possible in part by a grant from the Cisco University Research Program 44*dbc42409SLawrence Stewart * Fund at Community Foundation Silicon Valley. More details are available at: 45*dbc42409SLawrence Stewart * http://caia.swin.edu.au/urp/newtcp/ 46*dbc42409SLawrence Stewart */ 47*dbc42409SLawrence Stewart 48*dbc42409SLawrence Stewart #include <sys/cdefs.h> 49*dbc42409SLawrence Stewart __FBSDID("$FreeBSD$"); 50*dbc42409SLawrence Stewart 51*dbc42409SLawrence Stewart #include <sys/param.h> 52*dbc42409SLawrence Stewart #include <sys/kernel.h> 53*dbc42409SLawrence Stewart #include <sys/libkern.h> 54*dbc42409SLawrence Stewart #include <sys/lock.h> 55*dbc42409SLawrence Stewart #include <sys/malloc.h> 56*dbc42409SLawrence Stewart #include <sys/module.h> 57*dbc42409SLawrence Stewart #include <sys/mutex.h> 58*dbc42409SLawrence Stewart #include <sys/queue.h> 59*dbc42409SLawrence Stewart #include <sys/rwlock.h> 60*dbc42409SLawrence Stewart #include <sys/sbuf.h> 61*dbc42409SLawrence Stewart #include <sys/socket.h> 62*dbc42409SLawrence Stewart #include <sys/socketvar.h> 63*dbc42409SLawrence Stewart #include <sys/sysctl.h> 64*dbc42409SLawrence Stewart 65*dbc42409SLawrence Stewart #include <net/if.h> 66*dbc42409SLawrence Stewart #include <net/if_var.h> 67*dbc42409SLawrence Stewart 68*dbc42409SLawrence Stewart #include <netinet/cc.h> 69*dbc42409SLawrence Stewart #include <netinet/in.h> 70*dbc42409SLawrence Stewart #include <netinet/in_pcb.h> 71*dbc42409SLawrence Stewart #include <netinet/tcp_var.h> 72*dbc42409SLawrence Stewart 73*dbc42409SLawrence Stewart #include <netinet/cc/cc_module.h> 74*dbc42409SLawrence Stewart 75*dbc42409SLawrence Stewart /* 76*dbc42409SLawrence Stewart * List of available cc algorithms on the current system. First element 77*dbc42409SLawrence Stewart * is used as the system default CC algorithm. 78*dbc42409SLawrence Stewart */ 79*dbc42409SLawrence Stewart struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); 80*dbc42409SLawrence Stewart 81*dbc42409SLawrence Stewart /* Protects the cc_list TAILQ. */ 82*dbc42409SLawrence Stewart struct rwlock cc_list_lock; 83*dbc42409SLawrence Stewart 84*dbc42409SLawrence Stewart /* 85*dbc42409SLawrence Stewart * Set the default CC algorithm to new_default. The default is identified 86*dbc42409SLawrence Stewart * by being the first element in the cc_list TAILQ. 87*dbc42409SLawrence Stewart */ 88*dbc42409SLawrence Stewart static void 89*dbc42409SLawrence Stewart cc_set_default(struct cc_algo *new_default) 90*dbc42409SLawrence Stewart { 91*dbc42409SLawrence Stewart CC_LIST_WLOCK_ASSERT(); 92*dbc42409SLawrence Stewart 93*dbc42409SLawrence Stewart /* 94*dbc42409SLawrence Stewart * Make the requested system default CC algorithm the first element in 95*dbc42409SLawrence Stewart * the list if it isn't already. 96*dbc42409SLawrence Stewart */ 97*dbc42409SLawrence Stewart if (new_default != CC_DEFAULT()) { 98*dbc42409SLawrence Stewart STAILQ_REMOVE(&cc_list, new_default, cc_algo, entries); 99*dbc42409SLawrence Stewart STAILQ_INSERT_HEAD(&cc_list, new_default, entries); 100*dbc42409SLawrence Stewart } 101*dbc42409SLawrence Stewart } 102*dbc42409SLawrence Stewart 103*dbc42409SLawrence Stewart /* 104*dbc42409SLawrence Stewart * Sysctl handler to show and change the default CC algorithm. 105*dbc42409SLawrence Stewart */ 106*dbc42409SLawrence Stewart static int 107*dbc42409SLawrence Stewart cc_default_algo(SYSCTL_HANDLER_ARGS) 108*dbc42409SLawrence Stewart { 109*dbc42409SLawrence Stewart struct cc_algo *funcs; 110*dbc42409SLawrence Stewart int err, found; 111*dbc42409SLawrence Stewart 112*dbc42409SLawrence Stewart err = found = 0; 113*dbc42409SLawrence Stewart 114*dbc42409SLawrence Stewart if (req->newptr == NULL) { 115*dbc42409SLawrence Stewart char default_cc[TCP_CA_NAME_MAX]; 116*dbc42409SLawrence Stewart 117*dbc42409SLawrence Stewart /* Just print the current default. */ 118*dbc42409SLawrence Stewart CC_LIST_RLOCK(); 119*dbc42409SLawrence Stewart strlcpy(default_cc, CC_DEFAULT()->name, TCP_CA_NAME_MAX); 120*dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 121*dbc42409SLawrence Stewart err = sysctl_handle_string(oidp, default_cc, 1, req); 122*dbc42409SLawrence Stewart } else { 123*dbc42409SLawrence Stewart /* Find algo with specified name and set it to default. */ 124*dbc42409SLawrence Stewart CC_LIST_WLOCK(); 125*dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 126*dbc42409SLawrence Stewart if (strncmp((char *)req->newptr, funcs->name, 127*dbc42409SLawrence Stewart TCP_CA_NAME_MAX) == 0) { 128*dbc42409SLawrence Stewart found = 1; 129*dbc42409SLawrence Stewart cc_set_default(funcs); 130*dbc42409SLawrence Stewart } 131*dbc42409SLawrence Stewart } 132*dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 133*dbc42409SLawrence Stewart 134*dbc42409SLawrence Stewart if (!found) 135*dbc42409SLawrence Stewart err = ESRCH; 136*dbc42409SLawrence Stewart } 137*dbc42409SLawrence Stewart 138*dbc42409SLawrence Stewart return (err); 139*dbc42409SLawrence Stewart } 140*dbc42409SLawrence Stewart 141*dbc42409SLawrence Stewart /* 142*dbc42409SLawrence Stewart * Sysctl handler to display the list of available CC algorithms. 143*dbc42409SLawrence Stewart */ 144*dbc42409SLawrence Stewart static int 145*dbc42409SLawrence Stewart cc_list_available(SYSCTL_HANDLER_ARGS) 146*dbc42409SLawrence Stewart { 147*dbc42409SLawrence Stewart struct cc_algo *algo; 148*dbc42409SLawrence Stewart struct sbuf *s; 149*dbc42409SLawrence Stewart int err, first; 150*dbc42409SLawrence Stewart 151*dbc42409SLawrence Stewart err = 0; 152*dbc42409SLawrence Stewart first = 1; 153*dbc42409SLawrence Stewart s = sbuf_new(NULL, NULL, TCP_CA_NAME_MAX, SBUF_AUTOEXTEND); 154*dbc42409SLawrence Stewart 155*dbc42409SLawrence Stewart if (s == NULL) 156*dbc42409SLawrence Stewart return (ENOMEM); 157*dbc42409SLawrence Stewart 158*dbc42409SLawrence Stewart CC_LIST_RLOCK(); 159*dbc42409SLawrence Stewart STAILQ_FOREACH(algo, &cc_list, entries) { 160*dbc42409SLawrence Stewart err = sbuf_printf(s, first ? "%s" : ", %s", algo->name); 161*dbc42409SLawrence Stewart if (err) 162*dbc42409SLawrence Stewart break; 163*dbc42409SLawrence Stewart first = 0; 164*dbc42409SLawrence Stewart } 165*dbc42409SLawrence Stewart CC_LIST_RUNLOCK(); 166*dbc42409SLawrence Stewart 167*dbc42409SLawrence Stewart if (!err) { 168*dbc42409SLawrence Stewart sbuf_finish(s); 169*dbc42409SLawrence Stewart err = sysctl_handle_string(oidp, sbuf_data(s), 1, req); 170*dbc42409SLawrence Stewart } 171*dbc42409SLawrence Stewart 172*dbc42409SLawrence Stewart sbuf_delete(s); 173*dbc42409SLawrence Stewart return (err); 174*dbc42409SLawrence Stewart } 175*dbc42409SLawrence Stewart 176*dbc42409SLawrence Stewart /* 177*dbc42409SLawrence Stewart * Initialise CC subsystem on system boot. 178*dbc42409SLawrence Stewart */ 179*dbc42409SLawrence Stewart void 180*dbc42409SLawrence Stewart cc_init() 181*dbc42409SLawrence Stewart { 182*dbc42409SLawrence Stewart CC_LIST_LOCK_INIT(); 183*dbc42409SLawrence Stewart STAILQ_INIT(&cc_list); 184*dbc42409SLawrence Stewart } 185*dbc42409SLawrence Stewart 186*dbc42409SLawrence Stewart /* 187*dbc42409SLawrence Stewart * Returns non-zero on success, 0 on failure. 188*dbc42409SLawrence Stewart */ 189*dbc42409SLawrence Stewart int 190*dbc42409SLawrence Stewart cc_deregister_algo(struct cc_algo *remove_cc) 191*dbc42409SLawrence Stewart { 192*dbc42409SLawrence Stewart struct cc_algo *funcs, *tmpfuncs; 193*dbc42409SLawrence Stewart struct tcpcb *tp; 194*dbc42409SLawrence Stewart struct inpcb *inp; 195*dbc42409SLawrence Stewart int err; 196*dbc42409SLawrence Stewart 197*dbc42409SLawrence Stewart err = ENOENT; 198*dbc42409SLawrence Stewart 199*dbc42409SLawrence Stewart /* Never allow newreno to be deregistered. */ 200*dbc42409SLawrence Stewart if (&newreno_cc_algo == remove_cc) 201*dbc42409SLawrence Stewart return (EPERM); 202*dbc42409SLawrence Stewart 203*dbc42409SLawrence Stewart /* Remove algo from cc_list so that new connections can't use it. */ 204*dbc42409SLawrence Stewart CC_LIST_WLOCK(); 205*dbc42409SLawrence Stewart STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { 206*dbc42409SLawrence Stewart if (funcs == remove_cc) { 207*dbc42409SLawrence Stewart /* 208*dbc42409SLawrence Stewart * If we're removing the current system default, 209*dbc42409SLawrence Stewart * reset the default to newreno. 210*dbc42409SLawrence Stewart */ 211*dbc42409SLawrence Stewart if (strncmp(CC_DEFAULT()->name, remove_cc->name, 212*dbc42409SLawrence Stewart TCP_CA_NAME_MAX) == 0) 213*dbc42409SLawrence Stewart cc_set_default(&newreno_cc_algo); 214*dbc42409SLawrence Stewart 215*dbc42409SLawrence Stewart STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); 216*dbc42409SLawrence Stewart err = 0; 217*dbc42409SLawrence Stewart break; 218*dbc42409SLawrence Stewart } 219*dbc42409SLawrence Stewart } 220*dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 221*dbc42409SLawrence Stewart 222*dbc42409SLawrence Stewart if (!err) { 223*dbc42409SLawrence Stewart /* 224*dbc42409SLawrence Stewart * Check all active control blocks and change any that are 225*dbc42409SLawrence Stewart * using this algorithm back to newreno. If the algorithm that 226*dbc42409SLawrence Stewart * was in use requires cleanup code to be run, call it. 227*dbc42409SLawrence Stewart * 228*dbc42409SLawrence Stewart * New connections already part way through being initialised 229*dbc42409SLawrence Stewart * with the CC algo we're removing will not race with this code 230*dbc42409SLawrence Stewart * because the INP_INFO_WLOCK is held during initialisation. 231*dbc42409SLawrence Stewart * We therefore don't enter the loop below until the connection 232*dbc42409SLawrence Stewart * list has stabilised. 233*dbc42409SLawrence Stewart */ 234*dbc42409SLawrence Stewart INP_INFO_RLOCK(&V_tcbinfo); 235*dbc42409SLawrence Stewart LIST_FOREACH(inp, &V_tcb, inp_list) { 236*dbc42409SLawrence Stewart INP_WLOCK(inp); 237*dbc42409SLawrence Stewart /* Important to skip tcptw structs. */ 238*dbc42409SLawrence Stewart if (!(inp->inp_flags & INP_TIMEWAIT) && 239*dbc42409SLawrence Stewart (tp = intotcpcb(inp)) != NULL) { 240*dbc42409SLawrence Stewart /* 241*dbc42409SLawrence Stewart * By holding INP_WLOCK here, we are 242*dbc42409SLawrence Stewart * assured that the connection is not 243*dbc42409SLawrence Stewart * currently executing inside the CC 244*dbc42409SLawrence Stewart * module's functions i.e. it is safe to 245*dbc42409SLawrence Stewart * make the switch back to newreno. 246*dbc42409SLawrence Stewart */ 247*dbc42409SLawrence Stewart if (CC_ALGO(tp) == remove_cc) { 248*dbc42409SLawrence Stewart tmpfuncs = CC_ALGO(tp); 249*dbc42409SLawrence Stewart /* Newreno does not require any init. */ 250*dbc42409SLawrence Stewart CC_ALGO(tp) = &newreno_cc_algo; 251*dbc42409SLawrence Stewart if (tmpfuncs->cb_destroy != NULL) 252*dbc42409SLawrence Stewart tmpfuncs->cb_destroy(tp->ccv); 253*dbc42409SLawrence Stewart } 254*dbc42409SLawrence Stewart } 255*dbc42409SLawrence Stewart INP_WUNLOCK(inp); 256*dbc42409SLawrence Stewart } 257*dbc42409SLawrence Stewart INP_INFO_RUNLOCK(&V_tcbinfo); 258*dbc42409SLawrence Stewart } 259*dbc42409SLawrence Stewart 260*dbc42409SLawrence Stewart return (err); 261*dbc42409SLawrence Stewart } 262*dbc42409SLawrence Stewart 263*dbc42409SLawrence Stewart /* 264*dbc42409SLawrence Stewart * Returns 0 on success, non-zero on failure. 265*dbc42409SLawrence Stewart */ 266*dbc42409SLawrence Stewart int 267*dbc42409SLawrence Stewart cc_register_algo(struct cc_algo *add_cc) 268*dbc42409SLawrence Stewart { 269*dbc42409SLawrence Stewart struct cc_algo *funcs; 270*dbc42409SLawrence Stewart int err; 271*dbc42409SLawrence Stewart 272*dbc42409SLawrence Stewart err = 0; 273*dbc42409SLawrence Stewart 274*dbc42409SLawrence Stewart /* 275*dbc42409SLawrence Stewart * Iterate over list of registered CC algorithms and make sure 276*dbc42409SLawrence Stewart * we're not trying to add a duplicate. 277*dbc42409SLawrence Stewart */ 278*dbc42409SLawrence Stewart CC_LIST_WLOCK(); 279*dbc42409SLawrence Stewart STAILQ_FOREACH(funcs, &cc_list, entries) { 280*dbc42409SLawrence Stewart if (funcs == add_cc || strncmp(funcs->name, add_cc->name, 281*dbc42409SLawrence Stewart TCP_CA_NAME_MAX) == 0) 282*dbc42409SLawrence Stewart err = EEXIST; 283*dbc42409SLawrence Stewart } 284*dbc42409SLawrence Stewart 285*dbc42409SLawrence Stewart if (!err) 286*dbc42409SLawrence Stewart STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); 287*dbc42409SLawrence Stewart 288*dbc42409SLawrence Stewart CC_LIST_WUNLOCK(); 289*dbc42409SLawrence Stewart 290*dbc42409SLawrence Stewart return (err); 291*dbc42409SLawrence Stewart } 292*dbc42409SLawrence Stewart 293*dbc42409SLawrence Stewart /* 294*dbc42409SLawrence Stewart * Handles kld related events. Returns 0 on success, non-zero on failure. 295*dbc42409SLawrence Stewart */ 296*dbc42409SLawrence Stewart int 297*dbc42409SLawrence Stewart cc_modevent(module_t mod, int event_type, void *data) 298*dbc42409SLawrence Stewart { 299*dbc42409SLawrence Stewart struct cc_algo *algo; 300*dbc42409SLawrence Stewart int err; 301*dbc42409SLawrence Stewart 302*dbc42409SLawrence Stewart err = 0; 303*dbc42409SLawrence Stewart algo = (struct cc_algo *)data; 304*dbc42409SLawrence Stewart 305*dbc42409SLawrence Stewart switch(event_type) { 306*dbc42409SLawrence Stewart case MOD_LOAD: 307*dbc42409SLawrence Stewart if (algo->mod_init != NULL) 308*dbc42409SLawrence Stewart err = algo->mod_init(); 309*dbc42409SLawrence Stewart if (!err) 310*dbc42409SLawrence Stewart err = cc_register_algo(algo); 311*dbc42409SLawrence Stewart break; 312*dbc42409SLawrence Stewart 313*dbc42409SLawrence Stewart case MOD_QUIESCE: 314*dbc42409SLawrence Stewart case MOD_SHUTDOWN: 315*dbc42409SLawrence Stewart case MOD_UNLOAD: 316*dbc42409SLawrence Stewart err = cc_deregister_algo(algo); 317*dbc42409SLawrence Stewart if (!err && algo->mod_destroy != NULL) 318*dbc42409SLawrence Stewart algo->mod_destroy(); 319*dbc42409SLawrence Stewart if (err == ENOENT) 320*dbc42409SLawrence Stewart err = 0; 321*dbc42409SLawrence Stewart break; 322*dbc42409SLawrence Stewart 323*dbc42409SLawrence Stewart default: 324*dbc42409SLawrence Stewart err = EINVAL; 325*dbc42409SLawrence Stewart break; 326*dbc42409SLawrence Stewart } 327*dbc42409SLawrence Stewart 328*dbc42409SLawrence Stewart return (err); 329*dbc42409SLawrence Stewart } 330*dbc42409SLawrence Stewart 331*dbc42409SLawrence Stewart /* Declare sysctl tree and populate it. */ 332*dbc42409SLawrence Stewart SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL, 333*dbc42409SLawrence Stewart "congestion control related settings"); 334*dbc42409SLawrence Stewart 335*dbc42409SLawrence Stewart SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW, 336*dbc42409SLawrence Stewart NULL, 0, cc_default_algo, "A", "default congestion control algorithm"); 337*dbc42409SLawrence Stewart 338*dbc42409SLawrence Stewart SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD, 339*dbc42409SLawrence Stewart NULL, 0, cc_list_available, "A", 340*dbc42409SLawrence Stewart "list available congestion control algorithms"); 341