xref: /freebsd/sys/netinet/cc/cc.c (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2007-2008
5  *	Swinburne University of Technology, Melbourne, Australia.
6  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7  * Copyright (c) 2010 The FreeBSD Foundation
8  * All rights reserved.
9  *
10  * This software was developed at the Centre for Advanced Internet
11  * Architectures, Swinburne University of Technology, by Lawrence Stewart and
12  * James Healy, made possible in part by a grant from the Cisco University
13  * Research Program Fund at Community Foundation Silicon Valley.
14  *
15  * Portions of this software were developed at the Centre for Advanced
16  * Internet Architectures, Swinburne University of Technology, Melbourne,
17  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
18  *
19  * Redistribution and use in source and binary forms, with or without
20  * modification, are permitted provided that the following conditions
21  * are met:
22  * 1. Redistributions of source code must retain the above copyright
23  *    notice, this list of conditions and the following disclaimer.
24  * 2. Redistributions in binary form must reproduce the above copyright
25  *    notice, this list of conditions and the following disclaimer in the
26  *    documentation and/or other materials provided with the distribution.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 /*
42  * This software was first released in 2007 by James Healy and Lawrence Stewart
43  * whilst working on the NewTCP research project at Swinburne University of
44  * Technology's Centre for Advanced Internet Architectures, Melbourne,
45  * Australia, which was made possible in part by a grant from the Cisco
46  * University Research Program Fund at Community Foundation Silicon Valley.
47  * More details are available at:
48  *   http://caia.swin.edu.au/urp/newtcp/
49  */
50 
51 #include <sys/cdefs.h>
52 #include <opt_cc.h>
53 #include <sys/param.h>
54 #include <sys/kernel.h>
55 #include <sys/libkern.h>
56 #include <sys/lock.h>
57 #include <sys/malloc.h>
58 #include <sys/module.h>
59 #include <sys/mutex.h>
60 #include <sys/queue.h>
61 #include <sys/rwlock.h>
62 #include <sys/sbuf.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/sysctl.h>
66 
67 #include <net/vnet.h>
68 
69 #include <netinet/in.h>
70 #include <netinet/in_pcb.h>
71 #include <netinet/tcp.h>
72 #include <netinet/tcp_seq.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcp_log_buf.h>
75 #include <netinet/tcp_hpts.h>
76 #include <netinet/cc/cc.h>
77 #include <netinet/cc/cc_module.h>
78 
79 /*
80  * Have a sane default if no CC_DEFAULT is specified in the kernel config file.
81  */
82 #ifndef CC_DEFAULT
83 #define CC_DEFAULT "cubic"
84 #endif
85 
86 uint32_t hystart_minrtt_thresh = 4000;
87 uint32_t hystart_maxrtt_thresh = 16000;
88 uint32_t hystart_n_rttsamples = 8;
89 uint32_t hystart_css_growth_div = 4;
90 uint32_t hystart_css_rounds = 5;
91 uint32_t hystart_bblogs = 0;
92 
93 MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory");
94 
95 /*
96  * List of available cc algorithms on the current system. First element
97  * is used as the system default CC algorithm.
98  */
99 struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
100 
101 /* Protects the cc_list TAILQ. */
102 struct rwlock cc_list_lock;
103 
104 VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL;
105 
106 VNET_DEFINE(uint32_t, newreno_beta) = 50;
107 #define V_newreno_beta VNET(newreno_beta)
108 VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80;
109 
110 void
111 cc_refer(struct cc_algo *algo)
112 {
113 	CC_LIST_LOCK_ASSERT();
114 	refcount_acquire(&algo->cc_refcount);
115 }
116 
117 void
118 cc_release(struct cc_algo *algo)
119 {
120 	CC_LIST_LOCK_ASSERT();
121 	refcount_release(&algo->cc_refcount);
122 }
123 
124 
125 void
126 cc_attach(struct tcpcb *tp, struct cc_algo *algo)
127 {
128 	/*
129 	 * Attach the tcpcb to the algorithm.
130 	 */
131 	CC_LIST_RLOCK();
132 	CC_ALGO(tp) = algo;
133 	cc_refer(algo);
134 	CC_LIST_RUNLOCK();
135 }
136 
137 void
138 cc_detach(struct tcpcb *tp)
139 {
140 	struct cc_algo *algo;
141 
142 	CC_LIST_RLOCK();
143 	algo = CC_ALGO(tp);
144 	CC_ALGO(tp) = NULL;
145 	cc_release(algo);
146 	CC_LIST_RUNLOCK();
147 }
148 
149 /*
150  * Sysctl handler to show and change the default CC algorithm.
151  */
152 static int
153 cc_default_algo(SYSCTL_HANDLER_ARGS)
154 {
155 	char default_cc[TCP_CA_NAME_MAX];
156 	struct cc_algo *funcs;
157 	int error;
158 
159 	/* Get the current default: */
160 	CC_LIST_RLOCK();
161 	if (CC_DEFAULT_ALGO() != NULL)
162 		strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc));
163 	else
164 		memset(default_cc, 0, TCP_CA_NAME_MAX);
165 	CC_LIST_RUNLOCK();
166 
167 	error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
168 
169 	/* Check for error or no change */
170 	if (error != 0 || req->newptr == NULL)
171 		goto done;
172 
173 	error = ESRCH;
174 	/* Find algo with specified name and set it to default. */
175 	CC_LIST_RLOCK();
176 	STAILQ_FOREACH(funcs, &cc_list, entries) {
177 		if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
178 			continue;
179 		if (funcs->flags & CC_MODULE_BEING_REMOVED) {
180 			/* Its being removed, its not eligible */
181 			continue;
182 		}
183 		V_default_cc_ptr = funcs;
184 		error = 0;
185 		break;
186 	}
187 	CC_LIST_RUNLOCK();
188 done:
189 	return (error);
190 }
191 
192 /*
193  * Sysctl handler to display the list of available CC algorithms.
194  */
195 static int
196 cc_list_available(SYSCTL_HANDLER_ARGS)
197 {
198 	struct cc_algo *algo;
199 	int error, nalgos;
200 	int linesz;
201 	char *buffer, *cp;
202 	size_t bufsz, outsz;
203 
204 	error = nalgos = 0;
205 	CC_LIST_RLOCK();
206 	STAILQ_FOREACH(algo, &cc_list, entries) {
207 		nalgos++;
208 	}
209 	CC_LIST_RUNLOCK();
210 	if (nalgos == 0) {
211 		return (ENOENT);
212 	}
213 	bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1);
214 	buffer = malloc(bufsz, M_TEMP, M_WAITOK);
215 	cp = buffer;
216 
217 	linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D',
218 	    "PCB count");
219 	cp += linesz;
220 	bufsz -= linesz;
221 	outsz = linesz;
222 	CC_LIST_RLOCK();
223 	STAILQ_FOREACH(algo, &cc_list, entries) {
224 		linesz = snprintf(cp, bufsz, "%-16s%c %u\n",
225 		    algo->name,
226 		    (algo == CC_DEFAULT_ALGO()) ? '*' : ' ',
227 		    algo->cc_refcount);
228 		if (linesz >= bufsz) {
229 			error = EOVERFLOW;
230 			break;
231 		}
232 		cp += linesz;
233 		bufsz -= linesz;
234 		outsz += linesz;
235 	}
236 	CC_LIST_RUNLOCK();
237 	if (error == 0)
238 		error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
239 	free(buffer, M_TEMP);
240 	return (error);
241 }
242 
243 /*
244  * Return the number of times a proposed removal_cc is
245  * being used as the default.
246  */
247 static int
248 cc_check_default(struct cc_algo *remove_cc)
249 {
250 	int cnt = 0;
251 	VNET_ITERATOR_DECL(vnet_iter);
252 
253 	CC_LIST_LOCK_ASSERT();
254 
255 	VNET_LIST_RLOCK_NOSLEEP();
256 	VNET_FOREACH(vnet_iter) {
257 		CURVNET_SET(vnet_iter);
258 		if ((CC_DEFAULT_ALGO() != NULL) &&
259 		    strncmp(CC_DEFAULT_ALGO()->name,
260 			    remove_cc->name,
261 			    TCP_CA_NAME_MAX) == 0) {
262 			cnt++;
263 		}
264 		CURVNET_RESTORE();
265 	}
266 	VNET_LIST_RUNLOCK_NOSLEEP();
267 	return (cnt);
268 }
269 
270 /*
271  * Initialise CC subsystem on system boot.
272  */
273 static void
274 cc_init(void)
275 {
276 	CC_LIST_LOCK_INIT();
277 	STAILQ_INIT(&cc_list);
278 }
279 
280 /*
281  * Returns non-zero on success, 0 on failure.
282  */
283 static int
284 cc_deregister_algo_locked(struct cc_algo *remove_cc)
285 {
286 	struct cc_algo *funcs;
287 	int found = 0;
288 
289 	/* This is unlikely to fail */
290 	STAILQ_FOREACH(funcs, &cc_list, entries) {
291 		if (funcs == remove_cc)
292 			found = 1;
293 	}
294 	if (found == 0) {
295 		/* Nothing to remove? */
296 		return (ENOENT);
297 	}
298 	/* We assert it should have been MOD_QUIESCE'd */
299 	KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED),
300 		("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc));
301 	if (cc_check_default(remove_cc)) {
302 		return(EBUSY);
303 	}
304 	if (remove_cc->cc_refcount != 0) {
305 		return (EBUSY);
306 	}
307 	/* Remove algo from cc_list so that new connections can't use it. */
308 	STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries);
309 	return (0);
310 }
311 
/*
 * Deregister a CC algorithm: take the cc_list write lock and remove the
 * algorithm from the list via cc_deregister_algo_locked().
 *
 * Returns 0 on success, non-zero (an errno value such as ENOENT or EBUSY)
 * on failure.
 */
int
cc_deregister_algo(struct cc_algo *remove_cc)
{
	int ret;

	CC_LIST_WLOCK();
	ret = cc_deregister_algo_locked(remove_cc);
	CC_LIST_WUNLOCK();
	return (ret);
}
325 
326 /*
327  * Returns 0 on success, non-zero on failure.
328  */
329 int
330 cc_register_algo(struct cc_algo *add_cc)
331 {
332 	struct cc_algo *funcs;
333 	int err;
334 
335 	err = 0;
336 
337 	/*
338 	 * Iterate over list of registered CC algorithms and make sure
339 	 * we're not trying to add a duplicate.
340 	 */
341 	CC_LIST_WLOCK();
342 	STAILQ_FOREACH(funcs, &cc_list, entries) {
343 		if (funcs == add_cc ||
344 		    strncmp(funcs->name, add_cc->name,
345 			    TCP_CA_NAME_MAX) == 0) {
346 			err = EEXIST;
347 			break;
348 		}
349 	}
350 	/* Init its reference count */
351 	if (err == 0)
352 		refcount_init(&add_cc->cc_refcount, 0);
353 	/*
354 	 * The first loaded congestion control module will become
355 	 * the default until we find the "CC_DEFAULT" defined in
356 	 * the config (if we do).
357 	 */
358 	if (!err) {
359 		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
360 		if (strcmp(add_cc->name, CC_DEFAULT) == 0) {
361 			V_default_cc_ptr = add_cc;
362 		} else if (V_default_cc_ptr == NULL) {
363 			V_default_cc_ptr = add_cc;
364 		}
365 	}
366 	CC_LIST_WUNLOCK();
367 
368 	return (err);
369 }
370 
371 static void
372 vnet_cc_sysinit(void *arg)
373 {
374 	struct cc_algo *cc;
375 
376 	if (IS_DEFAULT_VNET(curvnet))
377 		return;
378 
379 	CURVNET_SET(vnet0);
380 	cc = V_default_cc_ptr;
381 	CURVNET_RESTORE();
382 
383 	V_default_cc_ptr = cc;
384 }
385 VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
386     vnet_cc_sysinit, NULL);
387 
388 /*
389  * Perform any necessary tasks before we exit congestion recovery.
390  */
391 void
392 newreno_cc_post_recovery(struct cc_var *ccv)
393 {
394 	int pipe;
395 	uint32_t mss = tcp_fixed_maxseg(ccv->tp);
396 
397 	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
398 		/*
399 		 * Fast recovery will conclude after returning from this
400 		 * function. Window inflation should have left us with
401 		 * approximately snd_ssthresh outstanding data. But in case we
402 		 * would be inclined to send a burst, better to do it via the
403 		 * slow start mechanism.
404 		 *
405 		 * XXXLAS: Find a way to do this without needing curack
406 		 */
407 		if (V_tcp_do_newsack)
408 			pipe = tcp_compute_pipe(ccv->tp);
409 		else
410 			pipe = CCV(ccv, snd_max) - ccv->curack;
411 		if (pipe < CCV(ccv, snd_ssthresh))
412 			/*
413 			 * Ensure that cwnd does not collapse to 1 MSS under
414 			 * adverse conditions. Implements RFC6582
415 			 */
416 			CCV(ccv, snd_cwnd) = max(pipe, mss) + mss;
417 		else
418 			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
419 	}
420 }
421 
422 void
423 newreno_cc_after_idle(struct cc_var *ccv)
424 {
425 	uint32_t rw;
426 	/*
427 	 * If we've been idle for more than one retransmit timeout the old
428 	 * congestion window is no longer current and we have to reduce it to
429 	 * the restart window before we can transmit again.
430 	 *
431 	 * The restart window is the initial window or the last CWND, whichever
432 	 * is smaller.
433 	 *
434 	 * This is done to prevent us from flooding the path with a full CWND at
435 	 * wirespeed, overloading router and switch buffers along the way.
436 	 *
437 	 * See RFC5681 Section 4.1. "Restarting Idle Connections".
438 	 *
439 	 * In addition, per RFC2861 Section 2, the ssthresh is set to the
440 	 * maximum of the former ssthresh or 3/4 of the old cwnd, to
441 	 * not exit slow-start prematurely.
442 	 */
443 	rw = tcp_compute_initwnd(tcp_fixed_maxseg(ccv->tp));
444 
445 	CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh),
446 	    CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2));
447 
448 	CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd));
449 }
450 
451 /*
452  * Get a new congestion window size on a multiplicative decrease event.
453  * */
454 u_int
455 newreno_cc_cwnd_on_multiplicative_decrease(struct cc_var *ccv, uint32_t mss)
456 {
457 	uint32_t cwin, factor;
458 
459 	cwin = CCV(ccv, snd_cwnd);
460 	/*
461 	 * Other TCP congestion controls use newreno_cong_signal(), but
462 	 * with their own private cc_data. Make sure the cc_data is used
463 	 * correctly.
464 	 */
465 	factor = V_newreno_beta;
466 
467 	return max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 2) * mss;
468 }
469 
470 /*
471  * Perform any necessary tasks before we enter congestion recovery.
472  */
473 void
474 newreno_cc_cong_signal(struct cc_var *ccv, ccsignal_t type)
475 {
476 	uint32_t cwin, mss, pipe;
477 
478 	mss = tcp_fixed_maxseg(ccv->tp);
479 
480 	/* Catch algos which mistakenly leak private signal types. */
481 	KASSERT((type & CC_SIGPRIVMASK) == 0,
482 	    ("%s: congestion signal type 0x%08x is private\n", __func__, type));
483 
484 	cwin = newreno_cc_cwnd_on_multiplicative_decrease(ccv, mss);
485 
486 	switch (type) {
487 	case CC_NDUPACK:
488 		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
489 			if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
490 				CCV(ccv, snd_ssthresh) = cwin;
491 			ENTER_RECOVERY(CCV(ccv, t_flags));
492 		}
493 		break;
494 	case CC_ECN:
495 		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
496 			CCV(ccv, snd_ssthresh) = cwin;
497 			CCV(ccv, snd_cwnd) = cwin;
498 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
499 		}
500 		break;
501 	case CC_RTO:
502 		if (CCV(ccv, t_rxtshift) == 1) {
503 			if (V_tcp_do_newsack) {
504 				pipe = tcp_compute_pipe(ccv->tp);
505 			} else {
506 				pipe = CCV(ccv, snd_max) -
507 					CCV(ccv, snd_fack) +
508 					CCV(ccv, sackhint.sack_bytes_rexmit);
509 			}
510 			CCV(ccv, snd_ssthresh) = max(2,
511 				min(CCV(ccv, snd_wnd), pipe) / 2 / mss) * mss;
512 		}
513 		CCV(ccv, snd_cwnd) = mss;
514 		break;
515 	default:
516 		break;
517 	}
518 }
519 
520 u_int
521 newreno_cc_cwnd_in_cong_avoid(struct cc_var *ccv)
522 {
523 	u_int cw = CCV(ccv, snd_cwnd);
524 	u_int incr = tcp_fixed_maxseg(ccv->tp);
525 
526 	KASSERT(cw > CCV(ccv, snd_ssthresh),
527 		("congestion control state not in congestion avoidance\n"));
528 
529 	/*
530 	 * Regular in-order ACK, open the congestion window.
531 	 * The congestion control state we're in is congestion avoidance.
532 	 *
533 	 * Check if ABC (RFC 3465) is enabled.
534 	 * cong avoid: cwnd > ssthresh
535 	 *
536 	 * cong avoid and ABC (RFC 3465):
537 	 *   Grow cwnd linearly by maxseg per RTT for each
538 	 *   cwnd worth of ACKed data.
539 	 *
540 	 * cong avoid without ABC (RFC 5681):
541 	 *   Grow cwnd linearly by approximately maxseg per RTT using
542 	 *   maxseg^2 / cwnd per ACK as the increment.
543 	 *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
544 	 *   avoid capping cwnd.
545 	 */
546 	if (V_tcp_do_rfc3465) {
547 		if (ccv->flags & CCF_ABC_SENTAWND)
548 			ccv->flags &= ~CCF_ABC_SENTAWND;
549 		else
550 			incr = 0;
551 	} else
552 		incr = max((incr * incr / cw), 1);
553 	/* ABC is on by default, so incr equals 0 frequently. */
554 	if (incr > 0)
555 		return min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale));
556 	else
557 		return cw;
558 }
559 
560 u_int
561 newreno_cc_cwnd_in_slow_start(struct cc_var *ccv)
562 {
563 	u_int cw = CCV(ccv, snd_cwnd);
564 	u_int mss = tcp_fixed_maxseg(ccv->tp);
565 	u_int incr = mss;
566 
567 	KASSERT(cw <= CCV(ccv, snd_ssthresh),
568 		("congestion control state not in slow start\n"));
569 
570 	/*
571 	 * Regular in-order ACK, open the congestion window.
572 	 * The congestion control state we're in is slow start.
573 	 *
574 	 * slow start: cwnd <= ssthresh
575 	 *
576 	 * slow start and ABC (RFC 3465):
577 	 *   Grow cwnd exponentially by the amount of data
578 	 *   ACKed capping the max increment per ACK to
579 	 *   (abc_l_var * maxseg) bytes.
580 	 *
581 	 * slow start without ABC (RFC 5681):
582 	 *   Grow cwnd exponentially by maxseg per ACK.
583 	 */
584 	if (V_tcp_do_rfc3465) {
585 		/*
586 		 * In slow-start with ABC enabled and no RTO in sight?
587 		 * (Must not use abc_l_var > 1 if slow starting after
588 		 * an RTO. On RTO, snd_nxt = snd_una, so the
589 		 * snd_nxt == snd_max check is sufficient to
590 		 * handle this).
591 		 *
592 		 * XXXLAS: Find a way to signal SS after RTO that
593 		 * doesn't rely on tcpcb vars.
594 		 */
595 		uint16_t abc_val;
596 
597 		if (ccv->flags & CCF_USE_LOCAL_ABC)
598 			abc_val = ccv->labc;
599 		else
600 			abc_val = V_tcp_abc_l_var;
601 		if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
602 			incr = min(ccv->bytes_this_ack,
603 			           ccv->nsegs * abc_val * mss);
604 		else
605 			incr = min(ccv->bytes_this_ack, mss);
606 	}
607 	/* ABC is on by default, so incr equals 0 frequently. */
608 	if (incr > 0)
609 		return min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale));
610 	else
611 		return cw;
612 }
613 
614 void
615 newreno_cc_ack_received(struct cc_var *ccv, ccsignal_t type)
616 {
617 	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
618 	    (ccv->flags & CCF_CWND_LIMITED)) {
619 		if (CCV(ccv, snd_cwnd) > CCV(ccv, snd_ssthresh)) {
620 			CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_cong_avoid(ccv);
621 		} else {
622 			CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_slow_start(ccv);
623 		}
624 	}
625 }
626 
627 static int
628 cc_stop_new_assignments(struct cc_algo *algo)
629 {
630 	CC_LIST_WLOCK();
631 	if (cc_check_default(algo)) {
632 		/* A default cannot be removed */
633 		CC_LIST_WUNLOCK();
634 		return (EBUSY);
635 	}
636 	algo->flags |= CC_MODULE_BEING_REMOVED;
637 	CC_LIST_WUNLOCK();
638 	return (0);
639 }
640 
641 /*
642  * Handles kld related events. Returns 0 on success, non-zero on failure.
643  */
644 int
645 cc_modevent(module_t mod, int event_type, void *data)
646 {
647 	struct cc_algo *algo;
648 	int err;
649 
650 	err = 0;
651 	algo = (struct cc_algo *)data;
652 
653 	switch(event_type) {
654 	case MOD_LOAD:
655 		if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) {
656 			/*
657 			 * A module must have a cc_data_sz function
658 			 * even if it has no data it should return 0.
659 			 */
660 			printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n");
661 			err = EINVAL;
662 			break;
663 		}
664 		if (algo->mod_init != NULL)
665 			err = algo->mod_init();
666 		if (!err)
667 			err = cc_register_algo(algo);
668 		break;
669 
670 	case MOD_SHUTDOWN:
671 		break;
672 	case MOD_QUIESCE:
673 		/* Stop any new assigments */
674 		err = cc_stop_new_assignments(algo);
675 		break;
676 	case MOD_UNLOAD:
677 		/*
678 		 * Deregister and remove the module from the list
679 		 */
680 		CC_LIST_WLOCK();
681 		/* Even with -f we can't unload if its the default */
682 		if (cc_check_default(algo)) {
683 			/* A default cannot be removed */
684 			CC_LIST_WUNLOCK();
685 			return (EBUSY);
686 		}
687 		/*
688 		 * If -f was used and users are still attached to
689 		 * the algorithm things are going to go boom.
690 		 */
691 		err = cc_deregister_algo_locked(algo);
692 		CC_LIST_WUNLOCK();
693 		if ((err == 0) && (algo->mod_destroy != NULL)) {
694 			algo->mod_destroy();
695 		}
696 		break;
697 	default:
698 		err = EINVAL;
699 		break;
700 	}
701 
702 	return (err);
703 }
704 
705 SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);
706 
707 /* Declare sysctl tree and populate it. */
708 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
709     "Congestion control related settings");
710 
711 SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm,
712     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
713     NULL, 0, cc_default_algo, "A",
714     "Default congestion control algorithm");
715 
716 SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available,
717     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
718     NULL, 0, cc_list_available, "A",
719     "List available congestion control algorithms");
720 
721 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hystartplusplus,
722     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
723     "New Reno related HyStart++ settings");
724 
725 SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, minrtt_thresh,
726     CTLFLAG_RW,
727     &hystart_minrtt_thresh, 4000,
728    "HyStarts++ minimum RTT thresh used in clamp (in microseconds)");
729 
730 SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, maxrtt_thresh,
731     CTLFLAG_RW,
732     &hystart_maxrtt_thresh, 16000,
733    "HyStarts++ maximum RTT thresh used in clamp (in microseconds)");
734 
735 SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, n_rttsamples,
736     CTLFLAG_RW,
737     &hystart_n_rttsamples, 8,
738    "The number of RTT samples that must be seen to consider HyStart++");
739 
740 SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_growth_div,
741     CTLFLAG_RW,
742     &hystart_css_growth_div, 4,
743    "The divisor to the growth when in Hystart++ CSS");
744 
745 SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_rounds,
746     CTLFLAG_RW,
747     &hystart_css_rounds, 5,
748    "The number of rounds HyStart++ lasts in CSS before falling to CA");
749 
750 SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, bblogs,
751     CTLFLAG_RW,
752     &hystart_bblogs, 0,
753    "Do we enable HyStart++ Black Box logs to be generated if BB logging is on");
754 
755 VNET_DEFINE(int, cc_do_abe) = 0;
756 SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW,
757     &VNET_NAME(cc_do_abe), 0,
758     "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)");
759 
760 VNET_DEFINE(int, cc_abe_frlossreduce) = 0;
761 SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW,
762     &VNET_NAME(cc_abe_frlossreduce), 0,
763     "Apply standard beta instead of ABE-beta during ECN-signalled congestion "
764     "recovery episodes if loss also needs to be repaired");
765