xref: /freebsd/sys/netinet/cc/cc.c (revision 3332f1b444d4a73238e9f59cca27bfc95fe936bd)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2007-2008
5  *	Swinburne University of Technology, Melbourne, Australia.
6  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7  * Copyright (c) 2010 The FreeBSD Foundation
8  * All rights reserved.
9  *
10  * This software was developed at the Centre for Advanced Internet
11  * Architectures, Swinburne University of Technology, by Lawrence Stewart and
12  * James Healy, made possible in part by a grant from the Cisco University
13  * Research Program Fund at Community Foundation Silicon Valley.
14  *
15  * Portions of this software were developed at the Centre for Advanced
16  * Internet Architectures, Swinburne University of Technology, Melbourne,
17  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
18  *
19  * Redistribution and use in source and binary forms, with or without
20  * modification, are permitted provided that the following conditions
21  * are met:
22  * 1. Redistributions of source code must retain the above copyright
23  *    notice, this list of conditions and the following disclaimer.
24  * 2. Redistributions in binary form must reproduce the above copyright
25  *    notice, this list of conditions and the following disclaimer in the
26  *    documentation and/or other materials provided with the distribution.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 /*
42  * This software was first released in 2007 by James Healy and Lawrence Stewart
43  * whilst working on the NewTCP research project at Swinburne University of
44  * Technology's Centre for Advanced Internet Architectures, Melbourne,
45  * Australia, which was made possible in part by a grant from the Cisco
46  * University Research Program Fund at Community Foundation Silicon Valley.
47  * More details are available at:
48  *   http://caia.swin.edu.au/urp/newtcp/
49  */
50 
51 #include <sys/cdefs.h>
52 __FBSDID("$FreeBSD$");
53 #include <opt_cc.h>
54 #include <sys/param.h>
55 #include <sys/kernel.h>
56 #include <sys/libkern.h>
57 #include <sys/lock.h>
58 #include <sys/malloc.h>
59 #include <sys/module.h>
60 #include <sys/mutex.h>
61 #include <sys/queue.h>
62 #include <sys/rwlock.h>
63 #include <sys/sbuf.h>
64 #include <sys/socket.h>
65 #include <sys/socketvar.h>
66 #include <sys/sysctl.h>
67 
68 #include <net/vnet.h>
69 
70 #include <netinet/in.h>
71 #include <netinet/in_pcb.h>
72 #include <netinet/tcp.h>
73 #include <netinet/tcp_seq.h>
74 #include <netinet/tcp_var.h>
75 #include <netinet/tcp_log_buf.h>
76 #include <netinet/tcp_hpts.h>
77 #include <netinet/cc/cc.h>
78 #include <netinet/cc/cc_module.h>
79 
80 MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory");
81 
82 /*
83  * List of available cc algorithms on the current system. First element
84  * is used as the system default CC algorithm.
85  */
86 struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
87 
88 /* Protects the cc_list TAILQ. */
89 struct rwlock cc_list_lock;
90 
91 VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL;
92 
93 VNET_DEFINE(uint32_t, newreno_beta) = 50;
94 #define V_newreno_beta VNET(newreno_beta)
95 
96 /*
97  * Sysctl handler to show and change the default CC algorithm.
98  */
99 static int
100 cc_default_algo(SYSCTL_HANDLER_ARGS)
101 {
102 	char default_cc[TCP_CA_NAME_MAX];
103 	struct cc_algo *funcs;
104 	int error;
105 
106 	/* Get the current default: */
107 	CC_LIST_RLOCK();
108 	if (CC_DEFAULT_ALGO() != NULL)
109 		strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc));
110 	else
111 		memset(default_cc, 0, TCP_CA_NAME_MAX);
112 	CC_LIST_RUNLOCK();
113 
114 	error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
115 
116 	/* Check for error or no change */
117 	if (error != 0 || req->newptr == NULL)
118 		goto done;
119 
120 	error = ESRCH;
121 	/* Find algo with specified name and set it to default. */
122 	CC_LIST_RLOCK();
123 	STAILQ_FOREACH(funcs, &cc_list, entries) {
124 		if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
125 			continue;
126 		V_default_cc_ptr = funcs;
127 		error = 0;
128 		break;
129 	}
130 	CC_LIST_RUNLOCK();
131 done:
132 	return (error);
133 }
134 
135 /*
136  * Sysctl handler to display the list of available CC algorithms.
137  */
138 static int
139 cc_list_available(SYSCTL_HANDLER_ARGS)
140 {
141 	struct cc_algo *algo;
142 	struct sbuf *s;
143 	int err, first, nalgos;
144 
145 	err = nalgos = 0;
146 	first = 1;
147 
148 	CC_LIST_RLOCK();
149 	STAILQ_FOREACH(algo, &cc_list, entries) {
150 		nalgos++;
151 	}
152 	CC_LIST_RUNLOCK();
153 	if (nalgos == 0) {
154 		return (ENOENT);
155 	}
156 	s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN);
157 
158 	if (s == NULL)
159 		return (ENOMEM);
160 
161 	/*
162 	 * It is theoretically possible for the CC list to have grown in size
163 	 * since the call to sbuf_new() and therefore for the sbuf to be too
164 	 * small. If this were to happen (incredibly unlikely), the sbuf will
165 	 * reach an overflow condition, sbuf_printf() will return an error and
166 	 * the sysctl will fail gracefully.
167 	 */
168 	CC_LIST_RLOCK();
169 	STAILQ_FOREACH(algo, &cc_list, entries) {
170 		err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
171 		if (err) {
172 			/* Sbuf overflow condition. */
173 			err = EOVERFLOW;
174 			break;
175 		}
176 		first = 0;
177 	}
178 	CC_LIST_RUNLOCK();
179 
180 	if (!err) {
181 		sbuf_finish(s);
182 		err = sysctl_handle_string(oidp, sbuf_data(s), 0, req);
183 	}
184 
185 	sbuf_delete(s);
186 	return (err);
187 }
188 
189 /*
190  * Return the number of times a proposed removal_cc is
191  * being used as the default.
192  */
193 static int
194 cc_check_default(struct cc_algo *remove_cc)
195 {
196 	int cnt = 0;
197 	VNET_ITERATOR_DECL(vnet_iter);
198 
199 	CC_LIST_LOCK_ASSERT();
200 
201 	VNET_LIST_RLOCK_NOSLEEP();
202 	VNET_FOREACH(vnet_iter) {
203 		CURVNET_SET(vnet_iter);
204 		if ((CC_DEFAULT_ALGO() != NULL) &&
205 		    strncmp(CC_DEFAULT_ALGO()->name,
206 			    remove_cc->name,
207 			    TCP_CA_NAME_MAX) == 0) {
208 			cnt++;
209 		}
210 		CURVNET_RESTORE();
211 	}
212 	VNET_LIST_RUNLOCK_NOSLEEP();
213 	return (cnt);
214 }
215 
216 /*
217  * Initialise CC subsystem on system boot.
218  */
219 static void
220 cc_init(void)
221 {
222 	CC_LIST_LOCK_INIT();
223 	STAILQ_INIT(&cc_list);
224 }
225 
226 /*
227  * Returns non-zero on success, 0 on failure.
228  */
229 int
230 cc_deregister_algo(struct cc_algo *remove_cc)
231 {
232 	struct cc_algo *funcs, *tmpfuncs;
233 	int err;
234 
235 	err = ENOENT;
236 
237 	/* Remove algo from cc_list so that new connections can't use it. */
238 	CC_LIST_WLOCK();
239 	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
240 		if (funcs == remove_cc) {
241 			if (cc_check_default(remove_cc)) {
242 				err = EBUSY;
243 				break;
244 			}
245 			/* Add a temp flag to stop new adds to it */
246 			funcs->flags |= CC_MODULE_BEING_REMOVED;
247 			break;
248 		}
249 	}
250 	CC_LIST_WUNLOCK();
251 	err = tcp_ccalgounload(remove_cc);
252 	/*
253 	 * Now back through and we either remove the temp flag
254 	 * or pull the registration.
255 	 */
256 	CC_LIST_WLOCK();
257 	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
258 		if (funcs == remove_cc) {
259 			if (err == 0)
260 				STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
261 			else
262 				funcs->flags &= ~CC_MODULE_BEING_REMOVED;
263 			break;
264 		}
265 	}
266 	CC_LIST_WUNLOCK();
267 	return (err);
268 }
269 
270 /*
271  * Returns 0 on success, non-zero on failure.
272  */
273 int
274 cc_register_algo(struct cc_algo *add_cc)
275 {
276 	struct cc_algo *funcs;
277 	int err;
278 
279 	err = 0;
280 
281 	/*
282 	 * Iterate over list of registered CC algorithms and make sure
283 	 * we're not trying to add a duplicate.
284 	 */
285 	CC_LIST_WLOCK();
286 	STAILQ_FOREACH(funcs, &cc_list, entries) {
287 		if (funcs == add_cc ||
288 		    strncmp(funcs->name, add_cc->name,
289 			    TCP_CA_NAME_MAX) == 0) {
290 			err = EEXIST;
291 			break;
292 		}
293 	}
294 	/*
295 	 * The first loaded congestion control module will become
296 	 * the default until we find the "CC_DEFAULT" defined in
297 	 * the config (if we do).
298 	 */
299 	if (!err) {
300 		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
301 		if (strcmp(add_cc->name, CC_DEFAULT) == 0) {
302 			V_default_cc_ptr = add_cc;
303 		} else if (V_default_cc_ptr == NULL) {
304 			V_default_cc_ptr = add_cc;
305 		}
306 	}
307 	CC_LIST_WUNLOCK();
308 
309 	return (err);
310 }
311 
312 /*
313  * Perform any necessary tasks before we exit congestion recovery.
314  */
315 void
316 newreno_cc_post_recovery(struct cc_var *ccv)
317 {
318 	int pipe;
319 
320 	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
321 		/*
322 		 * Fast recovery will conclude after returning from this
323 		 * function. Window inflation should have left us with
324 		 * approximately snd_ssthresh outstanding data. But in case we
325 		 * would be inclined to send a burst, better to do it via the
326 		 * slow start mechanism.
327 		 *
328 		 * XXXLAS: Find a way to do this without needing curack
329 		 */
330 		if (V_tcp_do_newsack)
331 			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
332 		else
333 			pipe = CCV(ccv, snd_max) - ccv->curack;
334 		if (pipe < CCV(ccv, snd_ssthresh))
335 			/*
336 			 * Ensure that cwnd does not collapse to 1 MSS under
337 			 * adverse conditons. Implements RFC6582
338 			 */
339 			CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
340 			    CCV(ccv, t_maxseg);
341 		else
342 			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
343 	}
344 }
345 
346 void
347 newreno_cc_after_idle(struct cc_var *ccv)
348 {
349 	uint32_t rw;
350 	/*
351 	 * If we've been idle for more than one retransmit timeout the old
352 	 * congestion window is no longer current and we have to reduce it to
353 	 * the restart window before we can transmit again.
354 	 *
355 	 * The restart window is the initial window or the last CWND, whichever
356 	 * is smaller.
357 	 *
358 	 * This is done to prevent us from flooding the path with a full CWND at
359 	 * wirespeed, overloading router and switch buffers along the way.
360 	 *
361 	 * See RFC5681 Section 4.1. "Restarting Idle Connections".
362 	 *
363 	 * In addition, per RFC2861 Section 2, the ssthresh is set to the
364 	 * maximum of the former ssthresh or 3/4 of the old cwnd, to
365 	 * not exit slow-start prematurely.
366 	 */
367 	rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp));
368 
369 	CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh),
370 	    CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2));
371 
372 	CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd));
373 }
374 
375 /*
376  * Perform any necessary tasks before we enter congestion recovery.
377  */
378 void
379 newreno_cc_cong_signal(struct cc_var *ccv, uint32_t type)
380 {
381 	uint32_t cwin, factor;
382 	u_int mss;
383 
384 	cwin = CCV(ccv, snd_cwnd);
385 	mss = tcp_fixed_maxseg(ccv->ccvc.tcp);
386 	/*
387 	 * Other TCP congestion controls use newreno_cong_signal(), but
388 	 * with their own private cc_data. Make sure the cc_data is used
389 	 * correctly.
390 	 */
391 	factor = V_newreno_beta;
392 
393 	/* Catch algos which mistakenly leak private signal types. */
394 	KASSERT((type & CC_SIGPRIVMASK) == 0,
395 	    ("%s: congestion signal type 0x%08x is private\n", __func__, type));
396 
397 	cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss),
398 	    2) * mss;
399 
400 	switch (type) {
401 	case CC_NDUPACK:
402 		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
403 			if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
404 				CCV(ccv, snd_ssthresh) = cwin;
405 			ENTER_RECOVERY(CCV(ccv, t_flags));
406 		}
407 		break;
408 	case CC_ECN:
409 		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
410 			CCV(ccv, snd_ssthresh) = cwin;
411 			CCV(ccv, snd_cwnd) = cwin;
412 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
413 		}
414 		break;
415 	case CC_RTO:
416 		CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd),
417 						 CCV(ccv, snd_cwnd)) / 2 / mss,
418 					     2) * mss;
419 		CCV(ccv, snd_cwnd) = mss;
420 		break;
421 	}
422 }
423 
424 void
425 newreno_cc_ack_received(struct cc_var *ccv, uint16_t type)
426 {
427 	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
428 	    (ccv->flags & CCF_CWND_LIMITED)) {
429 		u_int cw = CCV(ccv, snd_cwnd);
430 		u_int incr = CCV(ccv, t_maxseg);
431 
432 		/*
433 		 * Regular in-order ACK, open the congestion window.
434 		 * Method depends on which congestion control state we're
435 		 * in (slow start or cong avoid) and if ABC (RFC 3465) is
436 		 * enabled.
437 		 *
438 		 * slow start: cwnd <= ssthresh
439 		 * cong avoid: cwnd > ssthresh
440 		 *
441 		 * slow start and ABC (RFC 3465):
442 		 *   Grow cwnd exponentially by the amount of data
443 		 *   ACKed capping the max increment per ACK to
444 		 *   (abc_l_var * maxseg) bytes.
445 		 *
446 		 * slow start without ABC (RFC 5681):
447 		 *   Grow cwnd exponentially by maxseg per ACK.
448 		 *
449 		 * cong avoid and ABC (RFC 3465):
450 		 *   Grow cwnd linearly by maxseg per RTT for each
451 		 *   cwnd worth of ACKed data.
452 		 *
453 		 * cong avoid without ABC (RFC 5681):
454 		 *   Grow cwnd linearly by approximately maxseg per RTT using
455 		 *   maxseg^2 / cwnd per ACK as the increment.
456 		 *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
457 		 *   avoid capping cwnd.
458 		 */
459 		if (cw > CCV(ccv, snd_ssthresh)) {
460 			if (V_tcp_do_rfc3465) {
461 				if (ccv->flags & CCF_ABC_SENTAWND)
462 					ccv->flags &= ~CCF_ABC_SENTAWND;
463 				else
464 					incr = 0;
465 			} else
466 				incr = max((incr * incr / cw), 1);
467 		} else if (V_tcp_do_rfc3465) {
468 			/*
469 			 * In slow-start with ABC enabled and no RTO in sight?
470 			 * (Must not use abc_l_var > 1 if slow starting after
471 			 * an RTO. On RTO, snd_nxt = snd_una, so the
472 			 * snd_nxt == snd_max check is sufficient to
473 			 * handle this).
474 			 *
475 			 * XXXLAS: Find a way to signal SS after RTO that
476 			 * doesn't rely on tcpcb vars.
477 			 */
478 			uint16_t abc_val;
479 
480 			if (ccv->flags & CCF_USE_LOCAL_ABC)
481 				abc_val = ccv->labc;
482 			else
483 				abc_val = V_tcp_abc_l_var;
484 			if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
485 				incr = min(ccv->bytes_this_ack,
486 				    ccv->nsegs * abc_val *
487 				    CCV(ccv, t_maxseg));
488 			else
489 				incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
490 
491 		}
492 		/* ABC is on by default, so incr equals 0 frequently. */
493 		if (incr > 0)
494 			CCV(ccv, snd_cwnd) = min(cw + incr,
495 			    TCP_MAXWIN << CCV(ccv, snd_scale));
496 	}
497 }
498 
499 /*
500  * Handles kld related events. Returns 0 on success, non-zero on failure.
501  */
502 int
503 cc_modevent(module_t mod, int event_type, void *data)
504 {
505 	struct cc_algo *algo;
506 	int err;
507 
508 	err = 0;
509 	algo = (struct cc_algo *)data;
510 
511 	switch(event_type) {
512 	case MOD_LOAD:
513 		if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) {
514 			/*
515 			 * A module must have a cc_data_sz function
516 			 * even if it has no data it should return 0.
517 			 */
518 			printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n");
519 			err = EINVAL;
520 			break;
521 		}
522 		if (algo->mod_init != NULL)
523 			err = algo->mod_init();
524 		if (!err)
525 			err = cc_register_algo(algo);
526 		break;
527 
528 	case MOD_QUIESCE:
529 	case MOD_SHUTDOWN:
530 	case MOD_UNLOAD:
531 		err = cc_deregister_algo(algo);
532 		if (!err && algo->mod_destroy != NULL)
533 			algo->mod_destroy();
534 		if (err == ENOENT)
535 			err = 0;
536 		break;
537 
538 	default:
539 		err = EINVAL;
540 		break;
541 	}
542 
543 	return (err);
544 }
545 
546 SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);
547 
548 /* Declare sysctl tree and populate it. */
549 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
550     "Congestion control related settings");
551 
552 SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm,
553     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
554     NULL, 0, cc_default_algo, "A",
555     "Default congestion control algorithm");
556 
557 SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available,
558     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
559     NULL, 0, cc_list_available, "A",
560     "List available congestion control algorithms");
561 
562 VNET_DEFINE(int, cc_do_abe) = 0;
563 SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW,
564     &VNET_NAME(cc_do_abe), 0,
565     "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)");
566 
567 VNET_DEFINE(int, cc_abe_frlossreduce) = 0;
568 SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW,
569     &VNET_NAME(cc_abe_frlossreduce), 0,
570     "Apply standard beta instead of ABE-beta during ECN-signalled congestion "
571     "recovery episodes if loss also needs to be repaired");
572