xref: /freebsd/sys/netinet/cc/cc.c (revision 4644fda3f7a455e47f45a51a2e986d6b1fd6d0f9)
1dbc42409SLawrence Stewart /*-
2dbc42409SLawrence Stewart  * Copyright (c) 2007-2008
3dbc42409SLawrence Stewart  *	Swinburne University of Technology, Melbourne, Australia.
4dbc42409SLawrence Stewart  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
5dbc42409SLawrence Stewart  * Copyright (c) 2010 The FreeBSD Foundation
6dbc42409SLawrence Stewart  * All rights reserved.
7dbc42409SLawrence Stewart  *
8dbc42409SLawrence Stewart  * This software was developed at the Centre for Advanced Internet
9891b8ed4SLawrence Stewart  * Architectures, Swinburne University of Technology, by Lawrence Stewart and
10891b8ed4SLawrence Stewart  * James Healy, made possible in part by a grant from the Cisco University
11891b8ed4SLawrence Stewart  * Research Program Fund at Community Foundation Silicon Valley.
12dbc42409SLawrence Stewart  *
13dbc42409SLawrence Stewart  * Portions of this software were developed at the Centre for Advanced
14dbc42409SLawrence Stewart  * Internet Architectures, Swinburne University of Technology, Melbourne,
15dbc42409SLawrence Stewart  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
16dbc42409SLawrence Stewart  *
17dbc42409SLawrence Stewart  * Redistribution and use in source and binary forms, with or without
18dbc42409SLawrence Stewart  * modification, are permitted provided that the following conditions
19dbc42409SLawrence Stewart  * are met:
20dbc42409SLawrence Stewart  * 1. Redistributions of source code must retain the above copyright
21dbc42409SLawrence Stewart  *    notice, this list of conditions and the following disclaimer.
22dbc42409SLawrence Stewart  * 2. Redistributions in binary form must reproduce the above copyright
23dbc42409SLawrence Stewart  *    notice, this list of conditions and the following disclaimer in the
24dbc42409SLawrence Stewart  *    documentation and/or other materials provided with the distribution.
25dbc42409SLawrence Stewart  *
26dbc42409SLawrence Stewart  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
27dbc42409SLawrence Stewart  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28dbc42409SLawrence Stewart  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29dbc42409SLawrence Stewart  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
30dbc42409SLawrence Stewart  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31dbc42409SLawrence Stewart  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32dbc42409SLawrence Stewart  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33dbc42409SLawrence Stewart  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34dbc42409SLawrence Stewart  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35dbc42409SLawrence Stewart  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36dbc42409SLawrence Stewart  * SUCH DAMAGE.
37dbc42409SLawrence Stewart  */
38dbc42409SLawrence Stewart 
39dbc42409SLawrence Stewart /*
40dbc42409SLawrence Stewart  * This software was first released in 2007 by James Healy and Lawrence Stewart
41891b8ed4SLawrence Stewart  * whilst working on the NewTCP research project at Swinburne University of
42891b8ed4SLawrence Stewart  * Technology's Centre for Advanced Internet Architectures, Melbourne,
43891b8ed4SLawrence Stewart  * Australia, which was made possible in part by a grant from the Cisco
44891b8ed4SLawrence Stewart  * University Research Program Fund at Community Foundation Silicon Valley.
45891b8ed4SLawrence Stewart  * More details are available at:
46dbc42409SLawrence Stewart  *   http://caia.swin.edu.au/urp/newtcp/
47dbc42409SLawrence Stewart  */
48dbc42409SLawrence Stewart 
49dbc42409SLawrence Stewart #include <sys/cdefs.h>
50dbc42409SLawrence Stewart __FBSDID("$FreeBSD$");
51dbc42409SLawrence Stewart 
52dbc42409SLawrence Stewart #include <sys/param.h>
53dbc42409SLawrence Stewart #include <sys/kernel.h>
54dbc42409SLawrence Stewart #include <sys/libkern.h>
55dbc42409SLawrence Stewart #include <sys/lock.h>
56dbc42409SLawrence Stewart #include <sys/malloc.h>
57dbc42409SLawrence Stewart #include <sys/module.h>
58dbc42409SLawrence Stewart #include <sys/mutex.h>
59dbc42409SLawrence Stewart #include <sys/queue.h>
60dbc42409SLawrence Stewart #include <sys/rwlock.h>
61dbc42409SLawrence Stewart #include <sys/sbuf.h>
62dbc42409SLawrence Stewart #include <sys/socket.h>
63dbc42409SLawrence Stewart #include <sys/socketvar.h>
64dbc42409SLawrence Stewart #include <sys/sysctl.h>
65dbc42409SLawrence Stewart 
66b66d74c1SGleb Smirnoff #include <net/vnet.h>
67dbc42409SLawrence Stewart 
68dbc42409SLawrence Stewart #include <netinet/in.h>
69dbc42409SLawrence Stewart #include <netinet/in_pcb.h>
702de3e790SGleb Smirnoff #include <netinet/tcp.h>
71dbc42409SLawrence Stewart #include <netinet/tcp_var.h>
72*4644fda3SGleb Smirnoff #include <netinet/cc/cc.h>
73dbc42409SLawrence Stewart 
74dbc42409SLawrence Stewart #include <netinet/cc/cc_module.h>
75dbc42409SLawrence Stewart 
76dbc42409SLawrence Stewart /*
77dbc42409SLawrence Stewart  * List of available cc algorithms on the current system. First element
78dbc42409SLawrence Stewart  * is used as the system default CC algorithm.
79dbc42409SLawrence Stewart  */
80dbc42409SLawrence Stewart struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
81dbc42409SLawrence Stewart 
82dbc42409SLawrence Stewart /* Protects the cc_list TAILQ. */
83dbc42409SLawrence Stewart struct rwlock cc_list_lock;
84dbc42409SLawrence Stewart 
8578b01840SLawrence Stewart VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo;
86dbc42409SLawrence Stewart 
87dbc42409SLawrence Stewart /*
88dbc42409SLawrence Stewart  * Sysctl handler to show and change the default CC algorithm.
89dbc42409SLawrence Stewart  */
90dbc42409SLawrence Stewart static int
91dbc42409SLawrence Stewart cc_default_algo(SYSCTL_HANDLER_ARGS)
92dbc42409SLawrence Stewart {
93ebf92e86SLawrence Stewart 	char default_cc[TCP_CA_NAME_MAX];
94dbc42409SLawrence Stewart 	struct cc_algo *funcs;
950e1152fcSHans Petter Selasky 	int error;
96dbc42409SLawrence Stewart 
970e1152fcSHans Petter Selasky 	/* Get the current default: */
98dbc42409SLawrence Stewart 	CC_LIST_RLOCK();
990e1152fcSHans Petter Selasky 	strlcpy(default_cc, CC_DEFAULT()->name, sizeof(default_cc));
100dbc42409SLawrence Stewart 	CC_LIST_RUNLOCK();
1010e1152fcSHans Petter Selasky 
1020e1152fcSHans Petter Selasky 	error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
1030e1152fcSHans Petter Selasky 
1040e1152fcSHans Petter Selasky 	/* Check for error or no change */
1050e1152fcSHans Petter Selasky 	if (error != 0 || req->newptr == NULL)
1060e1152fcSHans Petter Selasky 		goto done;
1070e1152fcSHans Petter Selasky 
1080e1152fcSHans Petter Selasky 	error = ESRCH;
1090e1152fcSHans Petter Selasky 
110dbc42409SLawrence Stewart 	/* Find algo with specified name and set it to default. */
11178b01840SLawrence Stewart 	CC_LIST_RLOCK();
112dbc42409SLawrence Stewart 	STAILQ_FOREACH(funcs, &cc_list, entries) {
1130e1152fcSHans Petter Selasky 		if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
11460a945f9SHans Petter Selasky 			continue;
11578b01840SLawrence Stewart 		V_default_cc_ptr = funcs;
1160e1152fcSHans Petter Selasky 		error = 0;
1170e1152fcSHans Petter Selasky 		break;
118dbc42409SLawrence Stewart 	}
11978b01840SLawrence Stewart 	CC_LIST_RUNLOCK();
1200e1152fcSHans Petter Selasky done:
1210e1152fcSHans Petter Selasky 	return (error);
122dbc42409SLawrence Stewart }
123dbc42409SLawrence Stewart 
124dbc42409SLawrence Stewart /*
125dbc42409SLawrence Stewart  * Sysctl handler to display the list of available CC algorithms.
126dbc42409SLawrence Stewart  */
127dbc42409SLawrence Stewart static int
128dbc42409SLawrence Stewart cc_list_available(SYSCTL_HANDLER_ARGS)
129dbc42409SLawrence Stewart {
130dbc42409SLawrence Stewart 	struct cc_algo *algo;
131dbc42409SLawrence Stewart 	struct sbuf *s;
132a66ac850SLawrence Stewart 	int err, first, nalgos;
133dbc42409SLawrence Stewart 
134a66ac850SLawrence Stewart 	err = nalgos = 0;
135dbc42409SLawrence Stewart 	first = 1;
136a66ac850SLawrence Stewart 
137a66ac850SLawrence Stewart 	CC_LIST_RLOCK();
138a66ac850SLawrence Stewart 	STAILQ_FOREACH(algo, &cc_list, entries) {
139a66ac850SLawrence Stewart 		nalgos++;
140a66ac850SLawrence Stewart 	}
141a66ac850SLawrence Stewart 	CC_LIST_RUNLOCK();
142a66ac850SLawrence Stewart 
143a66ac850SLawrence Stewart 	s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN);
144dbc42409SLawrence Stewart 
145dbc42409SLawrence Stewart 	if (s == NULL)
146dbc42409SLawrence Stewart 		return (ENOMEM);
147dbc42409SLawrence Stewart 
148a66ac850SLawrence Stewart 	/*
149a66ac850SLawrence Stewart 	 * It is theoretically possible for the CC list to have grown in size
150a66ac850SLawrence Stewart 	 * since the call to sbuf_new() and therefore for the sbuf to be too
151a66ac850SLawrence Stewart 	 * small. If this were to happen (incredibly unlikely), the sbuf will
152a66ac850SLawrence Stewart 	 * reach an overflow condition, sbuf_printf() will return an error and
153a66ac850SLawrence Stewart 	 * the sysctl will fail gracefully.
154a66ac850SLawrence Stewart 	 */
155dbc42409SLawrence Stewart 	CC_LIST_RLOCK();
156dbc42409SLawrence Stewart 	STAILQ_FOREACH(algo, &cc_list, entries) {
157dbc42409SLawrence Stewart 		err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
158a66ac850SLawrence Stewart 		if (err) {
159a66ac850SLawrence Stewart 			/* Sbuf overflow condition. */
160a66ac850SLawrence Stewart 			err = EOVERFLOW;
161dbc42409SLawrence Stewart 			break;
162a66ac850SLawrence Stewart 		}
163dbc42409SLawrence Stewart 		first = 0;
164dbc42409SLawrence Stewart 	}
165dbc42409SLawrence Stewart 	CC_LIST_RUNLOCK();
166dbc42409SLawrence Stewart 
167dbc42409SLawrence Stewart 	if (!err) {
168dbc42409SLawrence Stewart 		sbuf_finish(s);
169e167cb89SHans Petter Selasky 		err = sysctl_handle_string(oidp, sbuf_data(s), 0, req);
170dbc42409SLawrence Stewart 	}
171dbc42409SLawrence Stewart 
172dbc42409SLawrence Stewart 	sbuf_delete(s);
173dbc42409SLawrence Stewart 	return (err);
174dbc42409SLawrence Stewart }
175dbc42409SLawrence Stewart 
176dbc42409SLawrence Stewart /*
17778b01840SLawrence Stewart  * Reset the default CC algo to NewReno for any netstack which is using the algo
17878b01840SLawrence Stewart  * that is about to go away as its default.
17978b01840SLawrence Stewart  */
18078b01840SLawrence Stewart static void
18178b01840SLawrence Stewart cc_checkreset_default(struct cc_algo *remove_cc)
18278b01840SLawrence Stewart {
18378b01840SLawrence Stewart 	VNET_ITERATOR_DECL(vnet_iter);
18478b01840SLawrence Stewart 
18578b01840SLawrence Stewart 	CC_LIST_LOCK_ASSERT();
18678b01840SLawrence Stewart 
18778b01840SLawrence Stewart 	VNET_LIST_RLOCK_NOSLEEP();
18878b01840SLawrence Stewart 	VNET_FOREACH(vnet_iter) {
18978b01840SLawrence Stewart 		CURVNET_SET(vnet_iter);
19078b01840SLawrence Stewart 		if (strncmp(CC_DEFAULT()->name, remove_cc->name,
19178b01840SLawrence Stewart 		    TCP_CA_NAME_MAX) == 0)
19278b01840SLawrence Stewart 			V_default_cc_ptr = &newreno_cc_algo;
19378b01840SLawrence Stewart 		CURVNET_RESTORE();
19478b01840SLawrence Stewart 	}
19578b01840SLawrence Stewart 	VNET_LIST_RUNLOCK_NOSLEEP();
19678b01840SLawrence Stewart }
19778b01840SLawrence Stewart 
19878b01840SLawrence Stewart /*
199dbc42409SLawrence Stewart  * Initialise CC subsystem on system boot.
200dbc42409SLawrence Stewart  */
20114f57a8bSLawrence Stewart static void
20214f57a8bSLawrence Stewart cc_init(void)
203dbc42409SLawrence Stewart {
204dbc42409SLawrence Stewart 	CC_LIST_LOCK_INIT();
205dbc42409SLawrence Stewart 	STAILQ_INIT(&cc_list);
206dbc42409SLawrence Stewart }
207dbc42409SLawrence Stewart 
208dbc42409SLawrence Stewart /*
209dbc42409SLawrence Stewart  * Returns non-zero on success, 0 on failure.
210dbc42409SLawrence Stewart  */
211dbc42409SLawrence Stewart int
212dbc42409SLawrence Stewart cc_deregister_algo(struct cc_algo *remove_cc)
213dbc42409SLawrence Stewart {
214dbc42409SLawrence Stewart 	struct cc_algo *funcs, *tmpfuncs;
215dbc42409SLawrence Stewart 	int err;
216dbc42409SLawrence Stewart 
217dbc42409SLawrence Stewart 	err = ENOENT;
218dbc42409SLawrence Stewart 
219dbc42409SLawrence Stewart 	/* Never allow newreno to be deregistered. */
220dbc42409SLawrence Stewart 	if (&newreno_cc_algo == remove_cc)
221dbc42409SLawrence Stewart 		return (EPERM);
222dbc42409SLawrence Stewart 
223dbc42409SLawrence Stewart 	/* Remove algo from cc_list so that new connections can't use it. */
224dbc42409SLawrence Stewart 	CC_LIST_WLOCK();
225dbc42409SLawrence Stewart 	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
226dbc42409SLawrence Stewart 		if (funcs == remove_cc) {
22778b01840SLawrence Stewart 			cc_checkreset_default(remove_cc);
228dbc42409SLawrence Stewart 			STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
229dbc42409SLawrence Stewart 			err = 0;
230dbc42409SLawrence Stewart 			break;
231dbc42409SLawrence Stewart 		}
232dbc42409SLawrence Stewart 	}
233dbc42409SLawrence Stewart 	CC_LIST_WUNLOCK();
234dbc42409SLawrence Stewart 
23599065ae6SLawrence Stewart 	if (!err)
236dbc42409SLawrence Stewart 		/*
23799065ae6SLawrence Stewart 		 * XXXLAS:
23899065ae6SLawrence Stewart 		 * - We may need to handle non-zero return values in future.
23999065ae6SLawrence Stewart 		 * - If we add CC framework support for protocols other than
24099065ae6SLawrence Stewart 		 *   TCP, we may want a more generic way to handle this step.
241dbc42409SLawrence Stewart 		 */
24299065ae6SLawrence Stewart 		tcp_ccalgounload(remove_cc);
243dbc42409SLawrence Stewart 
244dbc42409SLawrence Stewart 	return (err);
245dbc42409SLawrence Stewart }
246dbc42409SLawrence Stewart 
247dbc42409SLawrence Stewart /*
248dbc42409SLawrence Stewart  * Returns 0 on success, non-zero on failure.
249dbc42409SLawrence Stewart  */
250dbc42409SLawrence Stewart int
251dbc42409SLawrence Stewart cc_register_algo(struct cc_algo *add_cc)
252dbc42409SLawrence Stewart {
253dbc42409SLawrence Stewart 	struct cc_algo *funcs;
254dbc42409SLawrence Stewart 	int err;
255dbc42409SLawrence Stewart 
256dbc42409SLawrence Stewart 	err = 0;
257dbc42409SLawrence Stewart 
258dbc42409SLawrence Stewart 	/*
259dbc42409SLawrence Stewart 	 * Iterate over list of registered CC algorithms and make sure
260dbc42409SLawrence Stewart 	 * we're not trying to add a duplicate.
261dbc42409SLawrence Stewart 	 */
262dbc42409SLawrence Stewart 	CC_LIST_WLOCK();
263dbc42409SLawrence Stewart 	STAILQ_FOREACH(funcs, &cc_list, entries) {
264dbc42409SLawrence Stewart 		if (funcs == add_cc || strncmp(funcs->name, add_cc->name,
265dbc42409SLawrence Stewart 		    TCP_CA_NAME_MAX) == 0)
266dbc42409SLawrence Stewart 			err = EEXIST;
267dbc42409SLawrence Stewart 	}
268dbc42409SLawrence Stewart 
269dbc42409SLawrence Stewart 	if (!err)
270dbc42409SLawrence Stewart 		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
271dbc42409SLawrence Stewart 
272dbc42409SLawrence Stewart 	CC_LIST_WUNLOCK();
273dbc42409SLawrence Stewart 
274dbc42409SLawrence Stewart 	return (err);
275dbc42409SLawrence Stewart }
276dbc42409SLawrence Stewart 
277dbc42409SLawrence Stewart /*
278dbc42409SLawrence Stewart  * Handles kld related events. Returns 0 on success, non-zero on failure.
279dbc42409SLawrence Stewart  */
280dbc42409SLawrence Stewart int
281dbc42409SLawrence Stewart cc_modevent(module_t mod, int event_type, void *data)
282dbc42409SLawrence Stewart {
283dbc42409SLawrence Stewart 	struct cc_algo *algo;
284dbc42409SLawrence Stewart 	int err;
285dbc42409SLawrence Stewart 
286dbc42409SLawrence Stewart 	err = 0;
287dbc42409SLawrence Stewart 	algo = (struct cc_algo *)data;
288dbc42409SLawrence Stewart 
289dbc42409SLawrence Stewart 	switch(event_type) {
290dbc42409SLawrence Stewart 	case MOD_LOAD:
291dbc42409SLawrence Stewart 		if (algo->mod_init != NULL)
292dbc42409SLawrence Stewart 			err = algo->mod_init();
293dbc42409SLawrence Stewart 		if (!err)
294dbc42409SLawrence Stewart 			err = cc_register_algo(algo);
295dbc42409SLawrence Stewart 		break;
296dbc42409SLawrence Stewart 
297dbc42409SLawrence Stewart 	case MOD_QUIESCE:
298dbc42409SLawrence Stewart 	case MOD_SHUTDOWN:
299dbc42409SLawrence Stewart 	case MOD_UNLOAD:
300dbc42409SLawrence Stewart 		err = cc_deregister_algo(algo);
301dbc42409SLawrence Stewart 		if (!err && algo->mod_destroy != NULL)
302dbc42409SLawrence Stewart 			algo->mod_destroy();
303dbc42409SLawrence Stewart 		if (err == ENOENT)
304dbc42409SLawrence Stewart 			err = 0;
305dbc42409SLawrence Stewart 		break;
306dbc42409SLawrence Stewart 
307dbc42409SLawrence Stewart 	default:
308dbc42409SLawrence Stewart 		err = EINVAL;
309dbc42409SLawrence Stewart 		break;
310dbc42409SLawrence Stewart 	}
311dbc42409SLawrence Stewart 
312dbc42409SLawrence Stewart 	return (err);
313dbc42409SLawrence Stewart }
314dbc42409SLawrence Stewart 
31514f57a8bSLawrence Stewart SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);
31614f57a8bSLawrence Stewart 
317dbc42409SLawrence Stewart /* Declare sysctl tree and populate it. */
318dbc42409SLawrence Stewart SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
319dbc42409SLawrence Stewart     "congestion control related settings");
320dbc42409SLawrence Stewart 
3216df8a710SGleb Smirnoff SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm,
3226df8a710SGleb Smirnoff     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW,
323dbc42409SLawrence Stewart     NULL, 0, cc_default_algo, "A", "default congestion control algorithm");
324dbc42409SLawrence Stewart 
325dbc42409SLawrence Stewart SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,
326dbc42409SLawrence Stewart     NULL, 0, cc_list_available, "A",
327dbc42409SLawrence Stewart     "list available congestion control algorithms");
328