xref: /freebsd/sys/netinet/cc/cc.c (revision 14f57a8b027ecbc87cdec0e1750ddc9db9b06bb0)
1dbc42409SLawrence Stewart /*-
2dbc42409SLawrence Stewart  * Copyright (c) 2007-2008
3dbc42409SLawrence Stewart  *	Swinburne University of Technology, Melbourne, Australia.
4dbc42409SLawrence Stewart  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
5dbc42409SLawrence Stewart  * Copyright (c) 2010 The FreeBSD Foundation
6dbc42409SLawrence Stewart  * All rights reserved.
7dbc42409SLawrence Stewart  *
8dbc42409SLawrence Stewart  * This software was developed at the Centre for Advanced Internet
9dbc42409SLawrence Stewart  * Architectures, Swinburne University, by Lawrence Stewart and James Healy,
10dbc42409SLawrence Stewart  * made possible in part by a grant from the Cisco University Research Program
11dbc42409SLawrence Stewart  * Fund at Community Foundation Silicon Valley.
12dbc42409SLawrence Stewart  *
13dbc42409SLawrence Stewart  * Portions of this software were developed at the Centre for Advanced
14dbc42409SLawrence Stewart  * Internet Architectures, Swinburne University of Technology, Melbourne,
15dbc42409SLawrence Stewart  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
16dbc42409SLawrence Stewart  *
17dbc42409SLawrence Stewart  * Redistribution and use in source and binary forms, with or without
18dbc42409SLawrence Stewart  * modification, are permitted provided that the following conditions
19dbc42409SLawrence Stewart  * are met:
20dbc42409SLawrence Stewart  * 1. Redistributions of source code must retain the above copyright
21dbc42409SLawrence Stewart  *    notice, this list of conditions and the following disclaimer.
22dbc42409SLawrence Stewart  * 2. Redistributions in binary form must reproduce the above copyright
23dbc42409SLawrence Stewart  *    notice, this list of conditions and the following disclaimer in the
24dbc42409SLawrence Stewart  *    documentation and/or other materials provided with the distribution.
25dbc42409SLawrence Stewart  *
26dbc42409SLawrence Stewart  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
27dbc42409SLawrence Stewart  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28dbc42409SLawrence Stewart  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29dbc42409SLawrence Stewart  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
30dbc42409SLawrence Stewart  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31dbc42409SLawrence Stewart  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32dbc42409SLawrence Stewart  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33dbc42409SLawrence Stewart  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34dbc42409SLawrence Stewart  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35dbc42409SLawrence Stewart  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36dbc42409SLawrence Stewart  * SUCH DAMAGE.
37dbc42409SLawrence Stewart  */
38dbc42409SLawrence Stewart 
39dbc42409SLawrence Stewart /*
40dbc42409SLawrence Stewart  * This software was first released in 2007 by James Healy and Lawrence Stewart
41dbc42409SLawrence Stewart  * whilst working on the NewTCP research project at Swinburne University's
42dbc42409SLawrence Stewart  * Centre for Advanced Internet Architectures, Melbourne, Australia, which was
43dbc42409SLawrence Stewart  * made possible in part by a grant from the Cisco University Research Program
44dbc42409SLawrence Stewart  * Fund at Community Foundation Silicon Valley. More details are available at:
45dbc42409SLawrence Stewart  *   http://caia.swin.edu.au/urp/newtcp/
46dbc42409SLawrence Stewart  */
47dbc42409SLawrence Stewart 
48dbc42409SLawrence Stewart #include <sys/cdefs.h>
49dbc42409SLawrence Stewart __FBSDID("$FreeBSD$");
50dbc42409SLawrence Stewart 
51dbc42409SLawrence Stewart #include <sys/param.h>
52dbc42409SLawrence Stewart #include <sys/kernel.h>
53dbc42409SLawrence Stewart #include <sys/libkern.h>
54dbc42409SLawrence Stewart #include <sys/lock.h>
55dbc42409SLawrence Stewart #include <sys/malloc.h>
56dbc42409SLawrence Stewart #include <sys/module.h>
57dbc42409SLawrence Stewart #include <sys/mutex.h>
58dbc42409SLawrence Stewart #include <sys/queue.h>
59dbc42409SLawrence Stewart #include <sys/rwlock.h>
60dbc42409SLawrence Stewart #include <sys/sbuf.h>
61dbc42409SLawrence Stewart #include <sys/socket.h>
62dbc42409SLawrence Stewart #include <sys/socketvar.h>
63dbc42409SLawrence Stewart #include <sys/sysctl.h>
64dbc42409SLawrence Stewart 
65dbc42409SLawrence Stewart #include <net/if.h>
66dbc42409SLawrence Stewart #include <net/if_var.h>
67dbc42409SLawrence Stewart 
68dbc42409SLawrence Stewart #include <netinet/cc.h>
69dbc42409SLawrence Stewart #include <netinet/in.h>
70dbc42409SLawrence Stewart #include <netinet/in_pcb.h>
71dbc42409SLawrence Stewart #include <netinet/tcp_var.h>
72dbc42409SLawrence Stewart 
73dbc42409SLawrence Stewart #include <netinet/cc/cc_module.h>
74dbc42409SLawrence Stewart 
75dbc42409SLawrence Stewart /*
76dbc42409SLawrence Stewart  * List of available CC algorithms on the current system. The first element
77dbc42409SLawrence Stewart  * is used as the system default CC algorithm.
78dbc42409SLawrence Stewart  */
79dbc42409SLawrence Stewart struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
80dbc42409SLawrence Stewart 
81dbc42409SLawrence Stewart /* Protects the cc_list STAILQ. */
82dbc42409SLawrence Stewart struct rwlock cc_list_lock;
83dbc42409SLawrence Stewart 
84dbc42409SLawrence Stewart /*
85dbc42409SLawrence Stewart  * Set the default CC algorithm to new_default. The default is identified
86dbc42409SLawrence Stewart  * by being the first element in the cc_list TAILQ.
87dbc42409SLawrence Stewart  */
88dbc42409SLawrence Stewart static void
89dbc42409SLawrence Stewart cc_set_default(struct cc_algo *new_default)
90dbc42409SLawrence Stewart {
91dbc42409SLawrence Stewart 	CC_LIST_WLOCK_ASSERT();
92dbc42409SLawrence Stewart 
93dbc42409SLawrence Stewart 	/*
94dbc42409SLawrence Stewart 	 * Make the requested system default CC algorithm the first element in
95dbc42409SLawrence Stewart 	 * the list if it isn't already.
96dbc42409SLawrence Stewart 	 */
97dbc42409SLawrence Stewart 	if (new_default != CC_DEFAULT()) {
98dbc42409SLawrence Stewart 		STAILQ_REMOVE(&cc_list, new_default, cc_algo, entries);
99dbc42409SLawrence Stewart 		STAILQ_INSERT_HEAD(&cc_list, new_default, entries);
100dbc42409SLawrence Stewart 	}
101dbc42409SLawrence Stewart }
102dbc42409SLawrence Stewart 
103dbc42409SLawrence Stewart /*
104dbc42409SLawrence Stewart  * Sysctl handler to show and change the default CC algorithm.
105dbc42409SLawrence Stewart  */
106dbc42409SLawrence Stewart static int
107dbc42409SLawrence Stewart cc_default_algo(SYSCTL_HANDLER_ARGS)
108dbc42409SLawrence Stewart {
109dbc42409SLawrence Stewart 	struct cc_algo *funcs;
110dbc42409SLawrence Stewart 	int err, found;
111dbc42409SLawrence Stewart 
112dbc42409SLawrence Stewart 	err = found = 0;
113dbc42409SLawrence Stewart 
114dbc42409SLawrence Stewart 	if (req->newptr == NULL) {
115dbc42409SLawrence Stewart 		char default_cc[TCP_CA_NAME_MAX];
116dbc42409SLawrence Stewart 
117dbc42409SLawrence Stewart 		/* Just print the current default. */
118dbc42409SLawrence Stewart 		CC_LIST_RLOCK();
119dbc42409SLawrence Stewart 		strlcpy(default_cc, CC_DEFAULT()->name, TCP_CA_NAME_MAX);
120dbc42409SLawrence Stewart 		CC_LIST_RUNLOCK();
121dbc42409SLawrence Stewart 		err = sysctl_handle_string(oidp, default_cc, 1, req);
122dbc42409SLawrence Stewart 	} else {
123dbc42409SLawrence Stewart 		/* Find algo with specified name and set it to default. */
124dbc42409SLawrence Stewart 		CC_LIST_WLOCK();
125dbc42409SLawrence Stewart 		STAILQ_FOREACH(funcs, &cc_list, entries) {
126dbc42409SLawrence Stewart 			if (strncmp((char *)req->newptr, funcs->name,
127dbc42409SLawrence Stewart 			    TCP_CA_NAME_MAX) == 0) {
128dbc42409SLawrence Stewart 				found = 1;
129dbc42409SLawrence Stewart 				cc_set_default(funcs);
130dbc42409SLawrence Stewart 			}
131dbc42409SLawrence Stewart 		}
132dbc42409SLawrence Stewart 		CC_LIST_WUNLOCK();
133dbc42409SLawrence Stewart 
134dbc42409SLawrence Stewart 		if (!found)
135dbc42409SLawrence Stewart 			err = ESRCH;
136dbc42409SLawrence Stewart 	}
137dbc42409SLawrence Stewart 
138dbc42409SLawrence Stewart 	return (err);
139dbc42409SLawrence Stewart }
140dbc42409SLawrence Stewart 
141dbc42409SLawrence Stewart /*
142dbc42409SLawrence Stewart  * Sysctl handler to display the list of available CC algorithms.
143dbc42409SLawrence Stewart  */
144dbc42409SLawrence Stewart static int
145dbc42409SLawrence Stewart cc_list_available(SYSCTL_HANDLER_ARGS)
146dbc42409SLawrence Stewart {
147dbc42409SLawrence Stewart 	struct cc_algo *algo;
148dbc42409SLawrence Stewart 	struct sbuf *s;
149dbc42409SLawrence Stewart 	int err, first;
150dbc42409SLawrence Stewart 
151dbc42409SLawrence Stewart 	err = 0;
152dbc42409SLawrence Stewart 	first = 1;
153dbc42409SLawrence Stewart 	s = sbuf_new(NULL, NULL, TCP_CA_NAME_MAX, SBUF_AUTOEXTEND);
154dbc42409SLawrence Stewart 
155dbc42409SLawrence Stewart 	if (s == NULL)
156dbc42409SLawrence Stewart 		return (ENOMEM);
157dbc42409SLawrence Stewart 
158dbc42409SLawrence Stewart 	CC_LIST_RLOCK();
159dbc42409SLawrence Stewart 	STAILQ_FOREACH(algo, &cc_list, entries) {
160dbc42409SLawrence Stewart 		err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
161dbc42409SLawrence Stewart 		if (err)
162dbc42409SLawrence Stewart 			break;
163dbc42409SLawrence Stewart 		first = 0;
164dbc42409SLawrence Stewart 	}
165dbc42409SLawrence Stewart 	CC_LIST_RUNLOCK();
166dbc42409SLawrence Stewart 
167dbc42409SLawrence Stewart 	if (!err) {
168dbc42409SLawrence Stewart 		sbuf_finish(s);
169dbc42409SLawrence Stewart 		err = sysctl_handle_string(oidp, sbuf_data(s), 1, req);
170dbc42409SLawrence Stewart 	}
171dbc42409SLawrence Stewart 
172dbc42409SLawrence Stewart 	sbuf_delete(s);
173dbc42409SLawrence Stewart 	return (err);
174dbc42409SLawrence Stewart }
175dbc42409SLawrence Stewart 
176dbc42409SLawrence Stewart /*
177dbc42409SLawrence Stewart  * Initialise CC subsystem on system boot.
178dbc42409SLawrence Stewart  */
179*14f57a8bSLawrence Stewart static void
180*14f57a8bSLawrence Stewart cc_init(void)
181dbc42409SLawrence Stewart {
182dbc42409SLawrence Stewart 	CC_LIST_LOCK_INIT();
183dbc42409SLawrence Stewart 	STAILQ_INIT(&cc_list);
184dbc42409SLawrence Stewart }
185dbc42409SLawrence Stewart 
186dbc42409SLawrence Stewart /*
187dbc42409SLawrence Stewart  * Returns non-zero on success, 0 on failure.
188dbc42409SLawrence Stewart  */
189dbc42409SLawrence Stewart int
190dbc42409SLawrence Stewart cc_deregister_algo(struct cc_algo *remove_cc)
191dbc42409SLawrence Stewart {
192dbc42409SLawrence Stewart 	struct cc_algo *funcs, *tmpfuncs;
193dbc42409SLawrence Stewart 	struct tcpcb *tp;
194dbc42409SLawrence Stewart 	struct inpcb *inp;
195dbc42409SLawrence Stewart 	int err;
196dbc42409SLawrence Stewart 
197dbc42409SLawrence Stewart 	err = ENOENT;
198dbc42409SLawrence Stewart 
199dbc42409SLawrence Stewart 	/* Never allow newreno to be deregistered. */
200dbc42409SLawrence Stewart 	if (&newreno_cc_algo == remove_cc)
201dbc42409SLawrence Stewart 		return (EPERM);
202dbc42409SLawrence Stewart 
203dbc42409SLawrence Stewart 	/* Remove algo from cc_list so that new connections can't use it. */
204dbc42409SLawrence Stewart 	CC_LIST_WLOCK();
205dbc42409SLawrence Stewart 	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
206dbc42409SLawrence Stewart 		if (funcs == remove_cc) {
207dbc42409SLawrence Stewart 			/*
208dbc42409SLawrence Stewart 			 * If we're removing the current system default,
209dbc42409SLawrence Stewart 			 * reset the default to newreno.
210dbc42409SLawrence Stewart 			 */
211dbc42409SLawrence Stewart 			if (strncmp(CC_DEFAULT()->name, remove_cc->name,
212dbc42409SLawrence Stewart 			    TCP_CA_NAME_MAX) == 0)
213dbc42409SLawrence Stewart 				cc_set_default(&newreno_cc_algo);
214dbc42409SLawrence Stewart 
215dbc42409SLawrence Stewart 			STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
216dbc42409SLawrence Stewart 			err = 0;
217dbc42409SLawrence Stewart 			break;
218dbc42409SLawrence Stewart 		}
219dbc42409SLawrence Stewart 	}
220dbc42409SLawrence Stewart 	CC_LIST_WUNLOCK();
221dbc42409SLawrence Stewart 
222dbc42409SLawrence Stewart 	if (!err) {
223dbc42409SLawrence Stewart 		/*
224dbc42409SLawrence Stewart 		 * Check all active control blocks and change any that are
225dbc42409SLawrence Stewart 		 * using this algorithm back to newreno. If the algorithm that
226dbc42409SLawrence Stewart 		 * was in use requires cleanup code to be run, call it.
227dbc42409SLawrence Stewart 		 *
228dbc42409SLawrence Stewart 		 * New connections already part way through being initialised
229dbc42409SLawrence Stewart 		 * with the CC algo we're removing will not race with this code
230dbc42409SLawrence Stewart 		 * because the INP_INFO_WLOCK is held during initialisation.
231dbc42409SLawrence Stewart 		 * We therefore don't enter the loop below until the connection
232dbc42409SLawrence Stewart 		 * list has stabilised.
233dbc42409SLawrence Stewart 		 */
234dbc42409SLawrence Stewart 		INP_INFO_RLOCK(&V_tcbinfo);
235dbc42409SLawrence Stewart 		LIST_FOREACH(inp, &V_tcb, inp_list) {
236dbc42409SLawrence Stewart 			INP_WLOCK(inp);
237dbc42409SLawrence Stewart 			/* Important to skip tcptw structs. */
238dbc42409SLawrence Stewart 			if (!(inp->inp_flags & INP_TIMEWAIT) &&
239dbc42409SLawrence Stewart 			    (tp = intotcpcb(inp)) != NULL) {
240dbc42409SLawrence Stewart 				/*
241dbc42409SLawrence Stewart 				 * By holding INP_WLOCK here, we are
242dbc42409SLawrence Stewart 				 * assured that the connection is not
243dbc42409SLawrence Stewart 				 * currently executing inside the CC
244dbc42409SLawrence Stewart 				 * module's functions i.e. it is safe to
245dbc42409SLawrence Stewart 				 * make the switch back to newreno.
246dbc42409SLawrence Stewart 				 */
247dbc42409SLawrence Stewart 				if (CC_ALGO(tp) == remove_cc) {
248dbc42409SLawrence Stewart 					tmpfuncs = CC_ALGO(tp);
249dbc42409SLawrence Stewart 					/* Newreno does not require any init. */
250dbc42409SLawrence Stewart 					CC_ALGO(tp) = &newreno_cc_algo;
251dbc42409SLawrence Stewart 					if (tmpfuncs->cb_destroy != NULL)
252dbc42409SLawrence Stewart 						tmpfuncs->cb_destroy(tp->ccv);
253dbc42409SLawrence Stewart 				}
254dbc42409SLawrence Stewart 			}
255dbc42409SLawrence Stewart 			INP_WUNLOCK(inp);
256dbc42409SLawrence Stewart 		}
257dbc42409SLawrence Stewart 		INP_INFO_RUNLOCK(&V_tcbinfo);
258dbc42409SLawrence Stewart 	}
259dbc42409SLawrence Stewart 
260dbc42409SLawrence Stewart 	return (err);
261dbc42409SLawrence Stewart }
262dbc42409SLawrence Stewart 
263dbc42409SLawrence Stewart /*
264dbc42409SLawrence Stewart  * Returns 0 on success, non-zero on failure.
265dbc42409SLawrence Stewart  */
266dbc42409SLawrence Stewart int
267dbc42409SLawrence Stewart cc_register_algo(struct cc_algo *add_cc)
268dbc42409SLawrence Stewart {
269dbc42409SLawrence Stewart 	struct cc_algo *funcs;
270dbc42409SLawrence Stewart 	int err;
271dbc42409SLawrence Stewart 
272dbc42409SLawrence Stewart 	err = 0;
273dbc42409SLawrence Stewart 
274dbc42409SLawrence Stewart 	/*
275dbc42409SLawrence Stewart 	 * Iterate over list of registered CC algorithms and make sure
276dbc42409SLawrence Stewart 	 * we're not trying to add a duplicate.
277dbc42409SLawrence Stewart 	 */
278dbc42409SLawrence Stewart 	CC_LIST_WLOCK();
279dbc42409SLawrence Stewart 	STAILQ_FOREACH(funcs, &cc_list, entries) {
280dbc42409SLawrence Stewart 		if (funcs == add_cc || strncmp(funcs->name, add_cc->name,
281dbc42409SLawrence Stewart 		    TCP_CA_NAME_MAX) == 0)
282dbc42409SLawrence Stewart 			err = EEXIST;
283dbc42409SLawrence Stewart 	}
284dbc42409SLawrence Stewart 
285dbc42409SLawrence Stewart 	if (!err)
286dbc42409SLawrence Stewart 		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
287dbc42409SLawrence Stewart 
288dbc42409SLawrence Stewart 	CC_LIST_WUNLOCK();
289dbc42409SLawrence Stewart 
290dbc42409SLawrence Stewart 	return (err);
291dbc42409SLawrence Stewart }
292dbc42409SLawrence Stewart 
293dbc42409SLawrence Stewart /*
294dbc42409SLawrence Stewart  * Handles kld related events. Returns 0 on success, non-zero on failure.
295dbc42409SLawrence Stewart  */
296dbc42409SLawrence Stewart int
297dbc42409SLawrence Stewart cc_modevent(module_t mod, int event_type, void *data)
298dbc42409SLawrence Stewart {
299dbc42409SLawrence Stewart 	struct cc_algo *algo;
300dbc42409SLawrence Stewart 	int err;
301dbc42409SLawrence Stewart 
302dbc42409SLawrence Stewart 	err = 0;
303dbc42409SLawrence Stewart 	algo = (struct cc_algo *)data;
304dbc42409SLawrence Stewart 
305dbc42409SLawrence Stewart 	switch(event_type) {
306dbc42409SLawrence Stewart 	case MOD_LOAD:
307dbc42409SLawrence Stewart 		if (algo->mod_init != NULL)
308dbc42409SLawrence Stewart 			err = algo->mod_init();
309dbc42409SLawrence Stewart 		if (!err)
310dbc42409SLawrence Stewart 			err = cc_register_algo(algo);
311dbc42409SLawrence Stewart 		break;
312dbc42409SLawrence Stewart 
313dbc42409SLawrence Stewart 	case MOD_QUIESCE:
314dbc42409SLawrence Stewart 	case MOD_SHUTDOWN:
315dbc42409SLawrence Stewart 	case MOD_UNLOAD:
316dbc42409SLawrence Stewart 		err = cc_deregister_algo(algo);
317dbc42409SLawrence Stewart 		if (!err && algo->mod_destroy != NULL)
318dbc42409SLawrence Stewart 			algo->mod_destroy();
319dbc42409SLawrence Stewart 		if (err == ENOENT)
320dbc42409SLawrence Stewart 			err = 0;
321dbc42409SLawrence Stewart 		break;
322dbc42409SLawrence Stewart 
323dbc42409SLawrence Stewart 	default:
324dbc42409SLawrence Stewart 		err = EINVAL;
325dbc42409SLawrence Stewart 		break;
326dbc42409SLawrence Stewart 	}
327dbc42409SLawrence Stewart 
328dbc42409SLawrence Stewart 	return (err);
329dbc42409SLawrence Stewart }
330dbc42409SLawrence Stewart 
331*14f57a8bSLawrence Stewart SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);
332*14f57a8bSLawrence Stewart 
333dbc42409SLawrence Stewart /* Declare the net.inet.tcp.cc sysctl node and populate it with its algorithm and available entries. */
334dbc42409SLawrence Stewart SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
335dbc42409SLawrence Stewart     "congestion control related settings");
336dbc42409SLawrence Stewart 
337dbc42409SLawrence Stewart SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW,
338dbc42409SLawrence Stewart     NULL, 0, cc_default_algo, "A", "default congestion control algorithm");
339dbc42409SLawrence Stewart 
340dbc42409SLawrence Stewart SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,
341dbc42409SLawrence Stewart     NULL, 0, cc_list_available, "A",
342dbc42409SLawrence Stewart     "list available congestion control algorithms");
343