xref: /freebsd/sys/netinet/cc/cc.c (revision 3d265fce43746c293ae826e9603adbfe09f93cf6)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2007-2008
5  *	Swinburne University of Technology, Melbourne, Australia.
6  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7  * Copyright (c) 2010 The FreeBSD Foundation
8  * All rights reserved.
9  *
10  * This software was developed at the Centre for Advanced Internet
11  * Architectures, Swinburne University of Technology, by Lawrence Stewart and
12  * James Healy, made possible in part by a grant from the Cisco University
13  * Research Program Fund at Community Foundation Silicon Valley.
14  *
15  * Portions of this software were developed at the Centre for Advanced
16  * Internet Architectures, Swinburne University of Technology, Melbourne,
17  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
18  *
19  * Redistribution and use in source and binary forms, with or without
20  * modification, are permitted provided that the following conditions
21  * are met:
22  * 1. Redistributions of source code must retain the above copyright
23  *    notice, this list of conditions and the following disclaimer.
24  * 2. Redistributions in binary form must reproduce the above copyright
25  *    notice, this list of conditions and the following disclaimer in the
26  *    documentation and/or other materials provided with the distribution.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 /*
42  * This software was first released in 2007 by James Healy and Lawrence Stewart
43  * whilst working on the NewTCP research project at Swinburne University of
44  * Technology's Centre for Advanced Internet Architectures, Melbourne,
45  * Australia, which was made possible in part by a grant from the Cisco
46  * University Research Program Fund at Community Foundation Silicon Valley.
47  * More details are available at:
48  *   http://caia.swin.edu.au/urp/newtcp/
49  */
50 
51 #include <sys/cdefs.h>
52 __FBSDID("$FreeBSD$");
53 
54 #include <sys/param.h>
55 #include <sys/kernel.h>
56 #include <sys/libkern.h>
57 #include <sys/lock.h>
58 #include <sys/malloc.h>
59 #include <sys/module.h>
60 #include <sys/mutex.h>
61 #include <sys/queue.h>
62 #include <sys/rwlock.h>
63 #include <sys/sbuf.h>
64 #include <sys/socket.h>
65 #include <sys/socketvar.h>
66 #include <sys/sysctl.h>
67 
68 #include <net/vnet.h>
69 
70 #include <netinet/in.h>
71 #include <netinet/in_pcb.h>
72 #include <netinet/tcp.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/cc/cc.h>
75 
76 #include <netinet/cc/cc_module.h>
77 
/*
 * List of available cc algorithms on the current system. The default
 * algorithm is tracked separately, per vnet, in default_cc_ptr below.
 */
82 struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
83 
/* Protects the cc_list STAILQ. */
85 struct rwlock cc_list_lock;
86 
87 VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo;
88 
89 /*
90  * Sysctl handler to show and change the default CC algorithm.
91  */
92 static int
93 cc_default_algo(SYSCTL_HANDLER_ARGS)
94 {
95 	char default_cc[TCP_CA_NAME_MAX];
96 	struct cc_algo *funcs;
97 	int error;
98 
99 	/* Get the current default: */
100 	CC_LIST_RLOCK();
101 	strlcpy(default_cc, CC_DEFAULT()->name, sizeof(default_cc));
102 	CC_LIST_RUNLOCK();
103 
104 	error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
105 
106 	/* Check for error or no change */
107 	if (error != 0 || req->newptr == NULL)
108 		goto done;
109 
110 	error = ESRCH;
111 
112 	/* Find algo with specified name and set it to default. */
113 	CC_LIST_RLOCK();
114 	STAILQ_FOREACH(funcs, &cc_list, entries) {
115 		if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
116 			continue;
117 		V_default_cc_ptr = funcs;
118 		error = 0;
119 		break;
120 	}
121 	CC_LIST_RUNLOCK();
122 done:
123 	return (error);
124 }
125 
126 /*
127  * Sysctl handler to display the list of available CC algorithms.
128  */
129 static int
130 cc_list_available(SYSCTL_HANDLER_ARGS)
131 {
132 	struct cc_algo *algo;
133 	struct sbuf *s;
134 	int err, first, nalgos;
135 
136 	err = nalgos = 0;
137 	first = 1;
138 
139 	CC_LIST_RLOCK();
140 	STAILQ_FOREACH(algo, &cc_list, entries) {
141 		nalgos++;
142 	}
143 	CC_LIST_RUNLOCK();
144 
145 	s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN);
146 
147 	if (s == NULL)
148 		return (ENOMEM);
149 
150 	/*
151 	 * It is theoretically possible for the CC list to have grown in size
152 	 * since the call to sbuf_new() and therefore for the sbuf to be too
153 	 * small. If this were to happen (incredibly unlikely), the sbuf will
154 	 * reach an overflow condition, sbuf_printf() will return an error and
155 	 * the sysctl will fail gracefully.
156 	 */
157 	CC_LIST_RLOCK();
158 	STAILQ_FOREACH(algo, &cc_list, entries) {
159 		err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
160 		if (err) {
161 			/* Sbuf overflow condition. */
162 			err = EOVERFLOW;
163 			break;
164 		}
165 		first = 0;
166 	}
167 	CC_LIST_RUNLOCK();
168 
169 	if (!err) {
170 		sbuf_finish(s);
171 		err = sysctl_handle_string(oidp, sbuf_data(s), 0, req);
172 	}
173 
174 	sbuf_delete(s);
175 	return (err);
176 }
177 
178 /*
179  * Reset the default CC algo to NewReno for any netstack which is using the algo
180  * that is about to go away as its default.
181  */
182 static void
183 cc_checkreset_default(struct cc_algo *remove_cc)
184 {
185 	VNET_ITERATOR_DECL(vnet_iter);
186 
187 	CC_LIST_LOCK_ASSERT();
188 
189 	VNET_LIST_RLOCK_NOSLEEP();
190 	VNET_FOREACH(vnet_iter) {
191 		CURVNET_SET(vnet_iter);
192 		if (strncmp(CC_DEFAULT()->name, remove_cc->name,
193 		    TCP_CA_NAME_MAX) == 0)
194 			V_default_cc_ptr = &newreno_cc_algo;
195 		CURVNET_RESTORE();
196 	}
197 	VNET_LIST_RUNLOCK_NOSLEEP();
198 }
199 
200 /*
201  * Initialise CC subsystem on system boot.
202  */
203 static void
204 cc_init(void)
205 {
206 	CC_LIST_LOCK_INIT();
207 	STAILQ_INIT(&cc_list);
208 }
209 
210 /*
211  * Returns non-zero on success, 0 on failure.
212  */
213 int
214 cc_deregister_algo(struct cc_algo *remove_cc)
215 {
216 	struct cc_algo *funcs, *tmpfuncs;
217 	int err;
218 
219 	err = ENOENT;
220 
221 	/* Never allow newreno to be deregistered. */
222 	if (&newreno_cc_algo == remove_cc)
223 		return (EPERM);
224 
225 	/* Remove algo from cc_list so that new connections can't use it. */
226 	CC_LIST_WLOCK();
227 	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
228 		if (funcs == remove_cc) {
229 			cc_checkreset_default(remove_cc);
230 			STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
231 			err = 0;
232 			break;
233 		}
234 	}
235 	CC_LIST_WUNLOCK();
236 
237 	if (!err)
238 		/*
239 		 * XXXLAS:
240 		 * - We may need to handle non-zero return values in future.
241 		 * - If we add CC framework support for protocols other than
242 		 *   TCP, we may want a more generic way to handle this step.
243 		 */
244 		tcp_ccalgounload(remove_cc);
245 
246 	return (err);
247 }
248 
249 /*
250  * Returns 0 on success, non-zero on failure.
251  */
252 int
253 cc_register_algo(struct cc_algo *add_cc)
254 {
255 	struct cc_algo *funcs;
256 	int err;
257 
258 	err = 0;
259 
260 	/*
261 	 * Iterate over list of registered CC algorithms and make sure
262 	 * we're not trying to add a duplicate.
263 	 */
264 	CC_LIST_WLOCK();
265 	STAILQ_FOREACH(funcs, &cc_list, entries) {
266 		if (funcs == add_cc || strncmp(funcs->name, add_cc->name,
267 		    TCP_CA_NAME_MAX) == 0)
268 			err = EEXIST;
269 	}
270 
271 	if (!err)
272 		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
273 
274 	CC_LIST_WUNLOCK();
275 
276 	return (err);
277 }
278 
279 /*
280  * Handles kld related events. Returns 0 on success, non-zero on failure.
281  */
282 int
283 cc_modevent(module_t mod, int event_type, void *data)
284 {
285 	struct cc_algo *algo;
286 	int err;
287 
288 	err = 0;
289 	algo = (struct cc_algo *)data;
290 
291 	switch(event_type) {
292 	case MOD_LOAD:
293 		if (algo->mod_init != NULL)
294 			err = algo->mod_init();
295 		if (!err)
296 			err = cc_register_algo(algo);
297 		break;
298 
299 	case MOD_QUIESCE:
300 	case MOD_SHUTDOWN:
301 	case MOD_UNLOAD:
302 		err = cc_deregister_algo(algo);
303 		if (!err && algo->mod_destroy != NULL)
304 			algo->mod_destroy();
305 		if (err == ENOENT)
306 			err = 0;
307 		break;
308 
309 	default:
310 		err = EINVAL;
311 		break;
312 	}
313 
314 	return (err);
315 }
316 
317 SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);
318 
319 /* Declare sysctl tree and populate it. */
320 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
321     "Congestion control related settings");
322 
323 SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm,
324     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
325     NULL, 0, cc_default_algo, "A",
326     "Default congestion control algorithm");
327 
328 SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available,
329     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
330     NULL, 0, cc_list_available, "A",
331     "List available congestion control algorithms");
332 
333 VNET_DEFINE(int, cc_do_abe) = 0;
334 SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW,
335     &VNET_NAME(cc_do_abe), 0,
336     "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)");
337 
338 VNET_DEFINE(int, cc_abe_frlossreduce) = 0;
339 SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW,
340     &VNET_NAME(cc_abe_frlossreduce), 0,
341     "Apply standard beta instead of ABE-beta during ECN-signalled congestion "
342     "recovery episodes if loss also needs to be repaired");
343