xref: /freebsd/sys/netinet/cc/cc.c (revision 1af3908ce6121eb091923b3932fe56ab54656093)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2007-2008
5  *	Swinburne University of Technology, Melbourne, Australia.
6  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7  * Copyright (c) 2010 The FreeBSD Foundation
8  * All rights reserved.
9  *
10  * This software was developed at the Centre for Advanced Internet
11  * Architectures, Swinburne University of Technology, by Lawrence Stewart and
12  * James Healy, made possible in part by a grant from the Cisco University
13  * Research Program Fund at Community Foundation Silicon Valley.
14  *
15  * Portions of this software were developed at the Centre for Advanced
16  * Internet Architectures, Swinburne University of Technology, Melbourne,
17  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
18  *
19  * Redistribution and use in source and binary forms, with or without
20  * modification, are permitted provided that the following conditions
21  * are met:
22  * 1. Redistributions of source code must retain the above copyright
23  *    notice, this list of conditions and the following disclaimer.
24  * 2. Redistributions in binary form must reproduce the above copyright
25  *    notice, this list of conditions and the following disclaimer in the
26  *    documentation and/or other materials provided with the distribution.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 /*
42  * This software was first released in 2007 by James Healy and Lawrence Stewart
43  * whilst working on the NewTCP research project at Swinburne University of
44  * Technology's Centre for Advanced Internet Architectures, Melbourne,
45  * Australia, which was made possible in part by a grant from the Cisco
46  * University Research Program Fund at Community Foundation Silicon Valley.
47  * More details are available at:
48  *   http://caia.swin.edu.au/urp/newtcp/
49  */
50 
51 #include <sys/cdefs.h>
52 __FBSDID("$FreeBSD$");
53 #include <opt_cc.h>
54 #include <sys/param.h>
55 #include <sys/kernel.h>
56 #include <sys/libkern.h>
57 #include <sys/lock.h>
58 #include <sys/malloc.h>
59 #include <sys/module.h>
60 #include <sys/mutex.h>
61 #include <sys/queue.h>
62 #include <sys/rwlock.h>
63 #include <sys/sbuf.h>
64 #include <sys/socket.h>
65 #include <sys/socketvar.h>
66 #include <sys/sysctl.h>
67 
68 #include <net/vnet.h>
69 
70 #include <netinet/in.h>
71 #include <netinet/in_pcb.h>
72 #include <netinet/tcp.h>
73 #include <netinet/tcp_seq.h>
74 #include <netinet/tcp_var.h>
75 #include <netinet/tcp_log_buf.h>
76 #include <netinet/tcp_hpts.h>
77 #include <netinet/cc/cc.h>
78 #include <netinet/cc/cc_module.h>
79 
/*
 * Have a sane default if no CC_DEFAULT is specified in the kernel config file.
 */
#ifndef CC_DEFAULT
#define CC_DEFAULT "cubic"
#endif

/*
 * HyStart++ tunables. Each one is exported read/write under the
 * net.inet.tcp.cc.hystartplusplus sysctl node declared later in this file;
 * the initial values here match the defaults given in those declarations.
 */
/* Minimum RTT threshold used in the clamp, in microseconds. */
uint32_t hystart_minrtt_thresh = 4000;
/* Maximum RTT threshold used in the clamp, in microseconds. */
uint32_t hystart_maxrtt_thresh = 16000;
/* Number of RTT samples that must be seen to consider HyStart++. */
uint32_t hystart_n_rttsamples = 8;
/* Divisor applied to the growth while in HyStart++ CSS. */
uint32_t hystart_css_growth_div = 4;
/* Number of rounds HyStart++ stays in CSS before falling to CA. */
uint32_t hystart_css_rounds = 5;
/* Non-zero enables HyStart++ black box logs when BB logging is on. */
uint32_t hystart_bblogs = 0;
93 
/* Malloc type for congestion control state memory. */
MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory");

/*
 * List of available cc algorithms on the current system. New algorithms
 * are appended by cc_register_algo(); the default algorithm is tracked
 * separately, per vnet, through default_cc_ptr below.
 */
struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);

/* Protects the cc_list STAILQ. */
struct rwlock cc_list_lock;

/* Per-vnet pointer to the default algorithm; NULL until one is registered. */
VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL;

/*
 * Per-vnet NewReno backoff factor. newreno_cc_cong_signal() applies it as
 * a percentage of the current congestion window.
 */
VNET_DEFINE(uint32_t, newreno_beta) = 50;
#define V_newreno_beta VNET(newreno_beta)
/*
 * Per-vnet ECN backoff factor. Not referenced in this file.
 * NOTE(review): presumably consumed by the NewReno module -- confirm.
 */
VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80;
110 
111 void
112 cc_refer(struct cc_algo *algo)
113 {
114 	CC_LIST_LOCK_ASSERT();
115 	refcount_acquire(&algo->cc_refcount);
116 }
117 
118 void
119 cc_release(struct cc_algo *algo)
120 {
121 	CC_LIST_LOCK_ASSERT();
122 	refcount_release(&algo->cc_refcount);
123 }
124 
125 
126 void
127 cc_attach(struct tcpcb *tp, struct cc_algo *algo)
128 {
129 	/*
130 	 * Attach the tcpcb to the algorithm.
131 	 */
132 	CC_LIST_RLOCK();
133 	CC_ALGO(tp) = algo;
134 	cc_refer(algo);
135 	CC_LIST_RUNLOCK();
136 }
137 
138 void
139 cc_detach(struct tcpcb *tp)
140 {
141 	struct cc_algo *algo;
142 
143 	CC_LIST_RLOCK();
144 	algo = CC_ALGO(tp);
145 	CC_ALGO(tp) = NULL;
146 	cc_release(algo);
147 	CC_LIST_RUNLOCK();
148 }
149 
150 /*
151  * Sysctl handler to show and change the default CC algorithm.
152  */
153 static int
154 cc_default_algo(SYSCTL_HANDLER_ARGS)
155 {
156 	char default_cc[TCP_CA_NAME_MAX];
157 	struct cc_algo *funcs;
158 	int error;
159 
160 	/* Get the current default: */
161 	CC_LIST_RLOCK();
162 	if (CC_DEFAULT_ALGO() != NULL)
163 		strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc));
164 	else
165 		memset(default_cc, 0, TCP_CA_NAME_MAX);
166 	CC_LIST_RUNLOCK();
167 
168 	error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
169 
170 	/* Check for error or no change */
171 	if (error != 0 || req->newptr == NULL)
172 		goto done;
173 
174 	error = ESRCH;
175 	/* Find algo with specified name and set it to default. */
176 	CC_LIST_RLOCK();
177 	STAILQ_FOREACH(funcs, &cc_list, entries) {
178 		if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
179 			continue;
180 		if (funcs->flags & CC_MODULE_BEING_REMOVED) {
181 			/* Its being removed, its not eligible */
182 			continue;
183 		}
184 		V_default_cc_ptr = funcs;
185 		error = 0;
186 		break;
187 	}
188 	CC_LIST_RUNLOCK();
189 done:
190 	return (error);
191 }
192 
193 /*
194  * Sysctl handler to display the list of available CC algorithms.
195  */
196 static int
197 cc_list_available(SYSCTL_HANDLER_ARGS)
198 {
199 	struct cc_algo *algo;
200 	int error, nalgos;
201 	int linesz;
202 	char *buffer, *cp;
203 	size_t bufsz, outsz;
204 
205 	error = nalgos = 0;
206 	CC_LIST_RLOCK();
207 	STAILQ_FOREACH(algo, &cc_list, entries) {
208 		nalgos++;
209 	}
210 	CC_LIST_RUNLOCK();
211 	if (nalgos == 0) {
212 		return (ENOENT);
213 	}
214 	bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1);
215 	buffer = malloc(bufsz, M_TEMP, M_WAITOK);
216 	cp = buffer;
217 
218 	linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D',
219 	    "PCB count");
220 	cp += linesz;
221 	bufsz -= linesz;
222 	outsz = linesz;
223 	CC_LIST_RLOCK();
224 	STAILQ_FOREACH(algo, &cc_list, entries) {
225 		linesz = snprintf(cp, bufsz, "%-16s%c %u\n",
226 		    algo->name,
227 		    (algo == CC_DEFAULT_ALGO()) ? '*' : ' ',
228 		    algo->cc_refcount);
229 		if (linesz >= bufsz) {
230 			error = EOVERFLOW;
231 			break;
232 		}
233 		cp += linesz;
234 		bufsz -= linesz;
235 		outsz += linesz;
236 	}
237 	CC_LIST_RUNLOCK();
238 	if (error == 0)
239 		error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
240 	free(buffer, M_TEMP);
241 	return (error);
242 }
243 
244 /*
245  * Return the number of times a proposed removal_cc is
246  * being used as the default.
247  */
248 static int
249 cc_check_default(struct cc_algo *remove_cc)
250 {
251 	int cnt = 0;
252 	VNET_ITERATOR_DECL(vnet_iter);
253 
254 	CC_LIST_LOCK_ASSERT();
255 
256 	VNET_LIST_RLOCK_NOSLEEP();
257 	VNET_FOREACH(vnet_iter) {
258 		CURVNET_SET(vnet_iter);
259 		if ((CC_DEFAULT_ALGO() != NULL) &&
260 		    strncmp(CC_DEFAULT_ALGO()->name,
261 			    remove_cc->name,
262 			    TCP_CA_NAME_MAX) == 0) {
263 			cnt++;
264 		}
265 		CURVNET_RESTORE();
266 	}
267 	VNET_LIST_RUNLOCK_NOSLEEP();
268 	return (cnt);
269 }
270 
271 /*
272  * Initialise CC subsystem on system boot.
273  */
274 static void
275 cc_init(void)
276 {
277 	CC_LIST_LOCK_INIT();
278 	STAILQ_INIT(&cc_list);
279 }
280 
281 /*
282  * Returns non-zero on success, 0 on failure.
283  */
284 static int
285 cc_deregister_algo_locked(struct cc_algo *remove_cc)
286 {
287 	struct cc_algo *funcs;
288 	int found = 0;
289 
290 	/* This is unlikely to fail */
291 	STAILQ_FOREACH(funcs, &cc_list, entries) {
292 		if (funcs == remove_cc)
293 			found = 1;
294 	}
295 	if (found == 0) {
296 		/* Nothing to remove? */
297 		return (ENOENT);
298 	}
299 	/* We assert it should have been MOD_QUIESCE'd */
300 	KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED),
301 		("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc));
302 	if (cc_check_default(remove_cc)) {
303 		return(EBUSY);
304 	}
305 	if (remove_cc->cc_refcount != 0) {
306 		return (EBUSY);
307 	}
308 	/* Remove algo from cc_list so that new connections can't use it. */
309 	STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries);
310 	return (0);
311 }
312 
/*
 * Deregister a CC algorithm, taking the cc_list write lock around
 * cc_deregister_algo_locked().
 *
 * Returns 0 on success, non-zero (an errno value) on failure.
 */
int
cc_deregister_algo(struct cc_algo *remove_cc)
{
	int ret;

	CC_LIST_WLOCK();
	ret = cc_deregister_algo_locked(remove_cc);
	CC_LIST_WUNLOCK();
	return (ret);
}
326 
327 /*
328  * Returns 0 on success, non-zero on failure.
329  */
330 int
331 cc_register_algo(struct cc_algo *add_cc)
332 {
333 	struct cc_algo *funcs;
334 	int err;
335 
336 	err = 0;
337 
338 	/*
339 	 * Iterate over list of registered CC algorithms and make sure
340 	 * we're not trying to add a duplicate.
341 	 */
342 	CC_LIST_WLOCK();
343 	STAILQ_FOREACH(funcs, &cc_list, entries) {
344 		if (funcs == add_cc ||
345 		    strncmp(funcs->name, add_cc->name,
346 			    TCP_CA_NAME_MAX) == 0) {
347 			err = EEXIST;
348 			break;
349 		}
350 	}
351 	/* Init its reference count */
352 	if (err == 0)
353 		refcount_init(&add_cc->cc_refcount, 0);
354 	/*
355 	 * The first loaded congestion control module will become
356 	 * the default until we find the "CC_DEFAULT" defined in
357 	 * the config (if we do).
358 	 */
359 	if (!err) {
360 		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
361 		if (strcmp(add_cc->name, CC_DEFAULT) == 0) {
362 			V_default_cc_ptr = add_cc;
363 		} else if (V_default_cc_ptr == NULL) {
364 			V_default_cc_ptr = add_cc;
365 		}
366 	}
367 	CC_LIST_WUNLOCK();
368 
369 	return (err);
370 }
371 
372 static void
373 vnet_cc_sysinit(void *arg)
374 {
375 	struct cc_algo *cc;
376 
377 	if (IS_DEFAULT_VNET(curvnet))
378 		return;
379 
380 	CURVNET_SET(vnet0);
381 	cc = V_default_cc_ptr;
382 	CURVNET_RESTORE();
383 
384 	V_default_cc_ptr = cc;
385 }
386 VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
387     vnet_cc_sysinit, NULL);
388 
389 /*
390  * Perform any necessary tasks before we exit congestion recovery.
391  */
392 void
393 newreno_cc_post_recovery(struct cc_var *ccv)
394 {
395 	int pipe;
396 
397 	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
398 		/*
399 		 * Fast recovery will conclude after returning from this
400 		 * function. Window inflation should have left us with
401 		 * approximately snd_ssthresh outstanding data. But in case we
402 		 * would be inclined to send a burst, better to do it via the
403 		 * slow start mechanism.
404 		 *
405 		 * XXXLAS: Find a way to do this without needing curack
406 		 */
407 		if (V_tcp_do_newsack)
408 			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
409 		else
410 			pipe = CCV(ccv, snd_max) - ccv->curack;
411 		if (pipe < CCV(ccv, snd_ssthresh))
412 			/*
413 			 * Ensure that cwnd does not collapse to 1 MSS under
414 			 * adverse conditions. Implements RFC6582
415 			 */
416 			CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
417 			    CCV(ccv, t_maxseg);
418 		else
419 			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
420 	}
421 }
422 
423 void
424 newreno_cc_after_idle(struct cc_var *ccv)
425 {
426 	uint32_t rw;
427 	/*
428 	 * If we've been idle for more than one retransmit timeout the old
429 	 * congestion window is no longer current and we have to reduce it to
430 	 * the restart window before we can transmit again.
431 	 *
432 	 * The restart window is the initial window or the last CWND, whichever
433 	 * is smaller.
434 	 *
435 	 * This is done to prevent us from flooding the path with a full CWND at
436 	 * wirespeed, overloading router and switch buffers along the way.
437 	 *
438 	 * See RFC5681 Section 4.1. "Restarting Idle Connections".
439 	 *
440 	 * In addition, per RFC2861 Section 2, the ssthresh is set to the
441 	 * maximum of the former ssthresh or 3/4 of the old cwnd, to
442 	 * not exit slow-start prematurely.
443 	 */
444 	rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp));
445 
446 	CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh),
447 	    CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2));
448 
449 	CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd));
450 }
451 
452 /*
453  * Perform any necessary tasks before we enter congestion recovery.
454  */
455 void
456 newreno_cc_cong_signal(struct cc_var *ccv, uint32_t type)
457 {
458 	uint32_t cwin, factor;
459 	u_int mss;
460 
461 	cwin = CCV(ccv, snd_cwnd);
462 	mss = tcp_fixed_maxseg(ccv->ccvc.tcp);
463 	/*
464 	 * Other TCP congestion controls use newreno_cong_signal(), but
465 	 * with their own private cc_data. Make sure the cc_data is used
466 	 * correctly.
467 	 */
468 	factor = V_newreno_beta;
469 
470 	/* Catch algos which mistakenly leak private signal types. */
471 	KASSERT((type & CC_SIGPRIVMASK) == 0,
472 	    ("%s: congestion signal type 0x%08x is private\n", __func__, type));
473 
474 	cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss),
475 	    2) * mss;
476 
477 	switch (type) {
478 	case CC_NDUPACK:
479 		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
480 			if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
481 				CCV(ccv, snd_ssthresh) = cwin;
482 			ENTER_RECOVERY(CCV(ccv, t_flags));
483 		}
484 		break;
485 	case CC_ECN:
486 		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
487 			CCV(ccv, snd_ssthresh) = cwin;
488 			CCV(ccv, snd_cwnd) = cwin;
489 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
490 		}
491 		break;
492 	case CC_RTO:
493 		CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd),
494 						 CCV(ccv, snd_cwnd)) / 2 / mss,
495 					     2) * mss;
496 		CCV(ccv, snd_cwnd) = mss;
497 		break;
498 	}
499 }
500 
501 void
502 newreno_cc_ack_received(struct cc_var *ccv, uint16_t type)
503 {
504 	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
505 	    (ccv->flags & CCF_CWND_LIMITED)) {
506 		u_int cw = CCV(ccv, snd_cwnd);
507 		u_int incr = CCV(ccv, t_maxseg);
508 
509 		/*
510 		 * Regular in-order ACK, open the congestion window.
511 		 * Method depends on which congestion control state we're
512 		 * in (slow start or cong avoid) and if ABC (RFC 3465) is
513 		 * enabled.
514 		 *
515 		 * slow start: cwnd <= ssthresh
516 		 * cong avoid: cwnd > ssthresh
517 		 *
518 		 * slow start and ABC (RFC 3465):
519 		 *   Grow cwnd exponentially by the amount of data
520 		 *   ACKed capping the max increment per ACK to
521 		 *   (abc_l_var * maxseg) bytes.
522 		 *
523 		 * slow start without ABC (RFC 5681):
524 		 *   Grow cwnd exponentially by maxseg per ACK.
525 		 *
526 		 * cong avoid and ABC (RFC 3465):
527 		 *   Grow cwnd linearly by maxseg per RTT for each
528 		 *   cwnd worth of ACKed data.
529 		 *
530 		 * cong avoid without ABC (RFC 5681):
531 		 *   Grow cwnd linearly by approximately maxseg per RTT using
532 		 *   maxseg^2 / cwnd per ACK as the increment.
533 		 *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
534 		 *   avoid capping cwnd.
535 		 */
536 		if (cw > CCV(ccv, snd_ssthresh)) {
537 			if (V_tcp_do_rfc3465) {
538 				if (ccv->flags & CCF_ABC_SENTAWND)
539 					ccv->flags &= ~CCF_ABC_SENTAWND;
540 				else
541 					incr = 0;
542 			} else
543 				incr = max((incr * incr / cw), 1);
544 		} else if (V_tcp_do_rfc3465) {
545 			/*
546 			 * In slow-start with ABC enabled and no RTO in sight?
547 			 * (Must not use abc_l_var > 1 if slow starting after
548 			 * an RTO. On RTO, snd_nxt = snd_una, so the
549 			 * snd_nxt == snd_max check is sufficient to
550 			 * handle this).
551 			 *
552 			 * XXXLAS: Find a way to signal SS after RTO that
553 			 * doesn't rely on tcpcb vars.
554 			 */
555 			uint16_t abc_val;
556 
557 			if (ccv->flags & CCF_USE_LOCAL_ABC)
558 				abc_val = ccv->labc;
559 			else
560 				abc_val = V_tcp_abc_l_var;
561 			if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
562 				incr = min(ccv->bytes_this_ack,
563 				    ccv->nsegs * abc_val *
564 				    CCV(ccv, t_maxseg));
565 			else
566 				incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
567 
568 		}
569 		/* ABC is on by default, so incr equals 0 frequently. */
570 		if (incr > 0)
571 			CCV(ccv, snd_cwnd) = min(cw + incr,
572 			    TCP_MAXWIN << CCV(ccv, snd_scale));
573 	}
574 }
575 
576 static int
577 cc_stop_new_assignments(struct cc_algo *algo)
578 {
579 	CC_LIST_WLOCK();
580 	if (cc_check_default(algo)) {
581 		/* A default cannot be removed */
582 		CC_LIST_WUNLOCK();
583 		return (EBUSY);
584 	}
585 	algo->flags |= CC_MODULE_BEING_REMOVED;
586 	CC_LIST_WUNLOCK();
587 	return (0);
588 }
589 
590 /*
591  * Handles kld related events. Returns 0 on success, non-zero on failure.
592  */
593 int
594 cc_modevent(module_t mod, int event_type, void *data)
595 {
596 	struct cc_algo *algo;
597 	int err;
598 
599 	err = 0;
600 	algo = (struct cc_algo *)data;
601 
602 	switch(event_type) {
603 	case MOD_LOAD:
604 		if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) {
605 			/*
606 			 * A module must have a cc_data_sz function
607 			 * even if it has no data it should return 0.
608 			 */
609 			printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n");
610 			err = EINVAL;
611 			break;
612 		}
613 		if (algo->mod_init != NULL)
614 			err = algo->mod_init();
615 		if (!err)
616 			err = cc_register_algo(algo);
617 		break;
618 
619 	case MOD_SHUTDOWN:
620 		break;
621 	case MOD_QUIESCE:
622 		/* Stop any new assigments */
623 		err = cc_stop_new_assignments(algo);
624 		break;
625 	case MOD_UNLOAD:
626 		/*
627 		 * Deregister and remove the module from the list
628 		 */
629 		CC_LIST_WLOCK();
630 		/* Even with -f we can't unload if its the default */
631 		if (cc_check_default(algo)) {
632 			/* A default cannot be removed */
633 			CC_LIST_WUNLOCK();
634 			return (EBUSY);
635 		}
636 		/*
637 		 * If -f was used and users are still attached to
638 		 * the algorithm things are going to go boom.
639 		 */
640 		err = cc_deregister_algo_locked(algo);
641 		CC_LIST_WUNLOCK();
642 		if ((err == 0) && (algo->mod_destroy != NULL)) {
643 			algo->mod_destroy();
644 		}
645 		break;
646 	default:
647 		err = EINVAL;
648 		break;
649 	}
650 
651 	return (err);
652 }
653 
/* Run cc_init() as a SYSINIT so the list lock and list head get set up. */
SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);

/* Declare sysctl tree and populate it. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "Congestion control related settings");

/* net.inet.tcp.cc.algorithm: per-vnet, read/write, via cc_default_algo(). */
SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, 0, cc_default_algo, "A",
    "Default congestion control algorithm");

/* net.inet.tcp.cc.available: read-only table from cc_list_available(). */
SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, cc_list_available, "A",
    "List available congestion control algorithms");

/* net.inet.tcp.cc.hystartplusplus: holds the hystart_* tunables below. */
SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hystartplusplus,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "New Reno related HyStart++ settings");

SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, minrtt_thresh,
    CTLFLAG_RW,
    &hystart_minrtt_thresh, 4000,
   "HyStarts++ minimum RTT thresh used in clamp (in microseconds)");

SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, maxrtt_thresh,
    CTLFLAG_RW,
    &hystart_maxrtt_thresh, 16000,
   "HyStarts++ maximum RTT thresh used in clamp (in microseconds)");

SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, n_rttsamples,
    CTLFLAG_RW,
    &hystart_n_rttsamples, 8,
   "The number of RTT samples that must be seen to consider HyStart++");

SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_growth_div,
    CTLFLAG_RW,
    &hystart_css_growth_div, 4,
   "The divisor to the growth when in Hystart++ CSS");

SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_rounds,
    CTLFLAG_RW,
    &hystart_css_rounds, 5,
   "The number of rounds HyStart++ lasts in CSS before falling to CA");

SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, bblogs,
    CTLFLAG_RW,
    &hystart_bblogs, 0,
   "Do we enable HyStart++ Black Box logs to be generated if BB logging is on");

/* Per-vnet switch for TCP Alternative Backoff with ECN; off by default. */
VNET_DEFINE(int, cc_do_abe) = 0;
SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(cc_do_abe), 0,
    "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)");

/*
 * Per-vnet switch, off by default: use the standard beta rather than the
 * ABE beta when an ECN-signalled recovery episode also has loss to repair.
 */
VNET_DEFINE(int, cc_abe_frlossreduce) = 0;
SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(cc_abe_frlossreduce), 0,
    "Apply standard beta instead of ABE-beta during ECN-signalled congestion "
    "recovery episodes if loss also needs to be repaired");
714