xref: /freebsd/sys/netinet/cc/cc.c (revision db0ac6ded61105caab4700aeac255328d4238dc4)
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2007-2008
 *	Swinburne University of Technology, Melbourne, Australia.
 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by Lawrence Stewart and
 * James Healy, made possible in part by a grant from the Cisco University
 * Research Program Fund at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
40dbc42409SLawrence Stewart 
/*
 * This software was first released in 2007 by James Healy and Lawrence Stewart
 * whilst working on the NewTCP research project at Swinburne University of
 * Technology's Centre for Advanced Internet Architectures, Melbourne,
 * Australia, which was made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
 * More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */
50dbc42409SLawrence Stewart 
51dbc42409SLawrence Stewart #include <sys/cdefs.h>
52dbc42409SLawrence Stewart __FBSDID("$FreeBSD$");
53b8d60729SRandall Stewart #include <opt_cc.h>
54dbc42409SLawrence Stewart #include <sys/param.h>
55dbc42409SLawrence Stewart #include <sys/kernel.h>
56dbc42409SLawrence Stewart #include <sys/libkern.h>
57dbc42409SLawrence Stewart #include <sys/lock.h>
58dbc42409SLawrence Stewart #include <sys/malloc.h>
59dbc42409SLawrence Stewart #include <sys/module.h>
60dbc42409SLawrence Stewart #include <sys/mutex.h>
61dbc42409SLawrence Stewart #include <sys/queue.h>
62dbc42409SLawrence Stewart #include <sys/rwlock.h>
63dbc42409SLawrence Stewart #include <sys/sbuf.h>
64dbc42409SLawrence Stewart #include <sys/socket.h>
65dbc42409SLawrence Stewart #include <sys/socketvar.h>
66dbc42409SLawrence Stewart #include <sys/sysctl.h>
67dbc42409SLawrence Stewart 
68b66d74c1SGleb Smirnoff #include <net/vnet.h>
69dbc42409SLawrence Stewart 
70dbc42409SLawrence Stewart #include <netinet/in.h>
71dbc42409SLawrence Stewart #include <netinet/in_pcb.h>
722de3e790SGleb Smirnoff #include <netinet/tcp.h>
73b8d60729SRandall Stewart #include <netinet/tcp_seq.h>
74dbc42409SLawrence Stewart #include <netinet/tcp_var.h>
75b8d60729SRandall Stewart #include <netinet/tcp_log_buf.h>
76b8d60729SRandall Stewart #include <netinet/tcp_hpts.h>
774644fda3SGleb Smirnoff #include <netinet/cc/cc.h>
78dbc42409SLawrence Stewart #include <netinet/cc/cc_module.h>
79dbc42409SLawrence Stewart 
807e3c9ec9SWarner Losh /*
817e3c9ec9SWarner Losh  * Have a sane default if no CC_DEFAULT is specified in the kernel config file.
827e3c9ec9SWarner Losh  */
837e3c9ec9SWarner Losh #ifndef CC_DEFAULT
847e3c9ec9SWarner Losh #define CC_DEFAULT "newreno"
857e3c9ec9SWarner Losh #endif
867e3c9ec9SWarner Losh 
87b8d60729SRandall Stewart MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory");
88b8d60729SRandall Stewart 
89dbc42409SLawrence Stewart /*
90dbc42409SLawrence Stewart  * List of available cc algorithms on the current system. First element
91dbc42409SLawrence Stewart  * is used as the system default CC algorithm.
92dbc42409SLawrence Stewart  */
93dbc42409SLawrence Stewart struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
94dbc42409SLawrence Stewart 
95dbc42409SLawrence Stewart /* Protects the cc_list TAILQ. */
96dbc42409SLawrence Stewart struct rwlock cc_list_lock;
97dbc42409SLawrence Stewart 
98b8d60729SRandall Stewart VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL;
99b8d60729SRandall Stewart 
100b8d60729SRandall Stewart VNET_DEFINE(uint32_t, newreno_beta) = 50;
101b8d60729SRandall Stewart #define V_newreno_beta VNET(newreno_beta)
102dbc42409SLawrence Stewart 
103dbc42409SLawrence Stewart /*
104dbc42409SLawrence Stewart  * Sysctl handler to show and change the default CC algorithm.
105dbc42409SLawrence Stewart  */
106dbc42409SLawrence Stewart static int
107dbc42409SLawrence Stewart cc_default_algo(SYSCTL_HANDLER_ARGS)
108dbc42409SLawrence Stewart {
109ebf92e86SLawrence Stewart 	char default_cc[TCP_CA_NAME_MAX];
110dbc42409SLawrence Stewart 	struct cc_algo *funcs;
1110e1152fcSHans Petter Selasky 	int error;
112dbc42409SLawrence Stewart 
1130e1152fcSHans Petter Selasky 	/* Get the current default: */
114dbc42409SLawrence Stewart 	CC_LIST_RLOCK();
115b8d60729SRandall Stewart 	if (CC_DEFAULT_ALGO() != NULL)
116b8d60729SRandall Stewart 		strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc));
117b8d60729SRandall Stewart 	else
118b8d60729SRandall Stewart 		memset(default_cc, 0, TCP_CA_NAME_MAX);
119dbc42409SLawrence Stewart 	CC_LIST_RUNLOCK();
1200e1152fcSHans Petter Selasky 
1210e1152fcSHans Petter Selasky 	error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
1220e1152fcSHans Petter Selasky 
1230e1152fcSHans Petter Selasky 	/* Check for error or no change */
1240e1152fcSHans Petter Selasky 	if (error != 0 || req->newptr == NULL)
1250e1152fcSHans Petter Selasky 		goto done;
1260e1152fcSHans Petter Selasky 
1270e1152fcSHans Petter Selasky 	error = ESRCH;
128dbc42409SLawrence Stewart 	/* Find algo with specified name and set it to default. */
12978b01840SLawrence Stewart 	CC_LIST_RLOCK();
130dbc42409SLawrence Stewart 	STAILQ_FOREACH(funcs, &cc_list, entries) {
1310e1152fcSHans Petter Selasky 		if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
13260a945f9SHans Petter Selasky 			continue;
13378b01840SLawrence Stewart 		V_default_cc_ptr = funcs;
1340e1152fcSHans Petter Selasky 		error = 0;
1350e1152fcSHans Petter Selasky 		break;
136dbc42409SLawrence Stewart 	}
13778b01840SLawrence Stewart 	CC_LIST_RUNLOCK();
1380e1152fcSHans Petter Selasky done:
1390e1152fcSHans Petter Selasky 	return (error);
140dbc42409SLawrence Stewart }
141dbc42409SLawrence Stewart 
142dbc42409SLawrence Stewart /*
143dbc42409SLawrence Stewart  * Sysctl handler to display the list of available CC algorithms.
144dbc42409SLawrence Stewart  */
145dbc42409SLawrence Stewart static int
146dbc42409SLawrence Stewart cc_list_available(SYSCTL_HANDLER_ARGS)
147dbc42409SLawrence Stewart {
148dbc42409SLawrence Stewart 	struct cc_algo *algo;
149dbc42409SLawrence Stewart 	struct sbuf *s;
150a66ac850SLawrence Stewart 	int err, first, nalgos;
151dbc42409SLawrence Stewart 
152a66ac850SLawrence Stewart 	err = nalgos = 0;
153dbc42409SLawrence Stewart 	first = 1;
154a66ac850SLawrence Stewart 
155a66ac850SLawrence Stewart 	CC_LIST_RLOCK();
156a66ac850SLawrence Stewart 	STAILQ_FOREACH(algo, &cc_list, entries) {
157a66ac850SLawrence Stewart 		nalgos++;
158a66ac850SLawrence Stewart 	}
159a66ac850SLawrence Stewart 	CC_LIST_RUNLOCK();
160b8d60729SRandall Stewart 	if (nalgos == 0) {
161b8d60729SRandall Stewart 		return (ENOENT);
162b8d60729SRandall Stewart 	}
163a66ac850SLawrence Stewart 	s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN);
164dbc42409SLawrence Stewart 
165dbc42409SLawrence Stewart 	if (s == NULL)
166dbc42409SLawrence Stewart 		return (ENOMEM);
167dbc42409SLawrence Stewart 
168a66ac850SLawrence Stewart 	/*
169a66ac850SLawrence Stewart 	 * It is theoretically possible for the CC list to have grown in size
170a66ac850SLawrence Stewart 	 * since the call to sbuf_new() and therefore for the sbuf to be too
171a66ac850SLawrence Stewart 	 * small. If this were to happen (incredibly unlikely), the sbuf will
172a66ac850SLawrence Stewart 	 * reach an overflow condition, sbuf_printf() will return an error and
173a66ac850SLawrence Stewart 	 * the sysctl will fail gracefully.
174a66ac850SLawrence Stewart 	 */
175dbc42409SLawrence Stewart 	CC_LIST_RLOCK();
176dbc42409SLawrence Stewart 	STAILQ_FOREACH(algo, &cc_list, entries) {
177dbc42409SLawrence Stewart 		err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
178a66ac850SLawrence Stewart 		if (err) {
179a66ac850SLawrence Stewart 			/* Sbuf overflow condition. */
180a66ac850SLawrence Stewart 			err = EOVERFLOW;
181dbc42409SLawrence Stewart 			break;
182a66ac850SLawrence Stewart 		}
183dbc42409SLawrence Stewart 		first = 0;
184dbc42409SLawrence Stewart 	}
185dbc42409SLawrence Stewart 	CC_LIST_RUNLOCK();
186dbc42409SLawrence Stewart 
187dbc42409SLawrence Stewart 	if (!err) {
188dbc42409SLawrence Stewart 		sbuf_finish(s);
189e167cb89SHans Petter Selasky 		err = sysctl_handle_string(oidp, sbuf_data(s), 0, req);
190dbc42409SLawrence Stewart 	}
191dbc42409SLawrence Stewart 
192dbc42409SLawrence Stewart 	sbuf_delete(s);
193dbc42409SLawrence Stewart 	return (err);
194dbc42409SLawrence Stewart }
195dbc42409SLawrence Stewart 
196dbc42409SLawrence Stewart /*
197b8d60729SRandall Stewart  * Return the number of times a proposed removal_cc is
198b8d60729SRandall Stewart  * being used as the default.
19978b01840SLawrence Stewart  */
200b8d60729SRandall Stewart static int
201b8d60729SRandall Stewart cc_check_default(struct cc_algo *remove_cc)
20278b01840SLawrence Stewart {
203b8d60729SRandall Stewart 	int cnt = 0;
20478b01840SLawrence Stewart 	VNET_ITERATOR_DECL(vnet_iter);
20578b01840SLawrence Stewart 
20678b01840SLawrence Stewart 	CC_LIST_LOCK_ASSERT();
20778b01840SLawrence Stewart 
20878b01840SLawrence Stewart 	VNET_LIST_RLOCK_NOSLEEP();
20978b01840SLawrence Stewart 	VNET_FOREACH(vnet_iter) {
21078b01840SLawrence Stewart 		CURVNET_SET(vnet_iter);
211b8d60729SRandall Stewart 		if ((CC_DEFAULT_ALGO() != NULL) &&
212b8d60729SRandall Stewart 		    strncmp(CC_DEFAULT_ALGO()->name,
213b8d60729SRandall Stewart 			    remove_cc->name,
214b8d60729SRandall Stewart 			    TCP_CA_NAME_MAX) == 0) {
215b8d60729SRandall Stewart 			cnt++;
216b8d60729SRandall Stewart 		}
21778b01840SLawrence Stewart 		CURVNET_RESTORE();
21878b01840SLawrence Stewart 	}
21978b01840SLawrence Stewart 	VNET_LIST_RUNLOCK_NOSLEEP();
220b8d60729SRandall Stewart 	return (cnt);
22178b01840SLawrence Stewart }
22278b01840SLawrence Stewart 
22378b01840SLawrence Stewart /*
224dbc42409SLawrence Stewart  * Initialise CC subsystem on system boot.
225dbc42409SLawrence Stewart  */
22614f57a8bSLawrence Stewart static void
22714f57a8bSLawrence Stewart cc_init(void)
228dbc42409SLawrence Stewart {
229dbc42409SLawrence Stewart 	CC_LIST_LOCK_INIT();
230dbc42409SLawrence Stewart 	STAILQ_INIT(&cc_list);
231dbc42409SLawrence Stewart }
232dbc42409SLawrence Stewart 
233dbc42409SLawrence Stewart /*
234dbc42409SLawrence Stewart  * Returns non-zero on success, 0 on failure.
235dbc42409SLawrence Stewart  */
236dbc42409SLawrence Stewart int
237dbc42409SLawrence Stewart cc_deregister_algo(struct cc_algo *remove_cc)
238dbc42409SLawrence Stewart {
239dbc42409SLawrence Stewart 	struct cc_algo *funcs, *tmpfuncs;
240dbc42409SLawrence Stewart 	int err;
241dbc42409SLawrence Stewart 
242dbc42409SLawrence Stewart 	err = ENOENT;
243dbc42409SLawrence Stewart 
244dbc42409SLawrence Stewart 	/* Remove algo from cc_list so that new connections can't use it. */
245dbc42409SLawrence Stewart 	CC_LIST_WLOCK();
246dbc42409SLawrence Stewart 	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
247dbc42409SLawrence Stewart 		if (funcs == remove_cc) {
248b8d60729SRandall Stewart 			if (cc_check_default(remove_cc)) {
249*db0ac6deSCy Schubert 				CC_LIST_WUNLOCK();
250*db0ac6deSCy Schubert 				return(EBUSY);
251b8d60729SRandall Stewart 			}
252dbc42409SLawrence Stewart 			break;
253dbc42409SLawrence Stewart 		}
254dbc42409SLawrence Stewart 	}
255*db0ac6deSCy Schubert 	remove_cc->flags |= CC_MODULE_BEING_REMOVED;
256dbc42409SLawrence Stewart 	CC_LIST_WUNLOCK();
257b8d60729SRandall Stewart 	err = tcp_ccalgounload(remove_cc);
258dbc42409SLawrence Stewart 	/*
259b8d60729SRandall Stewart 	 * Now back through and we either remove the temp flag
260b8d60729SRandall Stewart 	 * or pull the registration.
261dbc42409SLawrence Stewart 	 */
262b8d60729SRandall Stewart 	CC_LIST_WLOCK();
263b8d60729SRandall Stewart 	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
264b8d60729SRandall Stewart 		if (funcs == remove_cc) {
265b8d60729SRandall Stewart 			if (err == 0)
266b8d60729SRandall Stewart 				STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
267b8d60729SRandall Stewart 			else
268b8d60729SRandall Stewart 				funcs->flags &= ~CC_MODULE_BEING_REMOVED;
269b8d60729SRandall Stewart 			break;
270b8d60729SRandall Stewart 		}
271b8d60729SRandall Stewart 	}
272b8d60729SRandall Stewart 	CC_LIST_WUNLOCK();
273dbc42409SLawrence Stewart 	return (err);
274dbc42409SLawrence Stewart }
275dbc42409SLawrence Stewart 
276dbc42409SLawrence Stewart /*
277dbc42409SLawrence Stewart  * Returns 0 on success, non-zero on failure.
278dbc42409SLawrence Stewart  */
279dbc42409SLawrence Stewart int
280dbc42409SLawrence Stewart cc_register_algo(struct cc_algo *add_cc)
281dbc42409SLawrence Stewart {
282dbc42409SLawrence Stewart 	struct cc_algo *funcs;
283dbc42409SLawrence Stewart 	int err;
284dbc42409SLawrence Stewart 
285dbc42409SLawrence Stewart 	err = 0;
286dbc42409SLawrence Stewart 
287dbc42409SLawrence Stewart 	/*
288dbc42409SLawrence Stewart 	 * Iterate over list of registered CC algorithms and make sure
289dbc42409SLawrence Stewart 	 * we're not trying to add a duplicate.
290dbc42409SLawrence Stewart 	 */
291dbc42409SLawrence Stewart 	CC_LIST_WLOCK();
292dbc42409SLawrence Stewart 	STAILQ_FOREACH(funcs, &cc_list, entries) {
293b8d60729SRandall Stewart 		if (funcs == add_cc ||
294b8d60729SRandall Stewart 		    strncmp(funcs->name, add_cc->name,
295b8d60729SRandall Stewart 			    TCP_CA_NAME_MAX) == 0) {
296dbc42409SLawrence Stewart 			err = EEXIST;
297b8d60729SRandall Stewart 			break;
298dbc42409SLawrence Stewart 		}
299b8d60729SRandall Stewart 	}
300b8d60729SRandall Stewart 	/*
301b8d60729SRandall Stewart 	 * The first loaded congestion control module will become
302b8d60729SRandall Stewart 	 * the default until we find the "CC_DEFAULT" defined in
303b8d60729SRandall Stewart 	 * the config (if we do).
304b8d60729SRandall Stewart 	 */
305b8d60729SRandall Stewart 	if (!err) {
306dbc42409SLawrence Stewart 		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
307b8d60729SRandall Stewart 		if (strcmp(add_cc->name, CC_DEFAULT) == 0) {
308b8d60729SRandall Stewart 			V_default_cc_ptr = add_cc;
309b8d60729SRandall Stewart 		} else if (V_default_cc_ptr == NULL) {
310b8d60729SRandall Stewart 			V_default_cc_ptr = add_cc;
311b8d60729SRandall Stewart 		}
312b8d60729SRandall Stewart 	}
313dbc42409SLawrence Stewart 	CC_LIST_WUNLOCK();
314dbc42409SLawrence Stewart 
315dbc42409SLawrence Stewart 	return (err);
316dbc42409SLawrence Stewart }
317dbc42409SLawrence Stewart 
318034a9240SMark Johnston static void
319034a9240SMark Johnston vnet_cc_sysinit(void *arg)
320034a9240SMark Johnston {
321034a9240SMark Johnston 	struct cc_algo *cc;
322034a9240SMark Johnston 
323034a9240SMark Johnston 	if (IS_DEFAULT_VNET(curvnet))
324034a9240SMark Johnston 		return;
325034a9240SMark Johnston 
326034a9240SMark Johnston 	CURVNET_SET(vnet0);
327034a9240SMark Johnston 	cc = V_default_cc_ptr;
328034a9240SMark Johnston 	CURVNET_RESTORE();
329034a9240SMark Johnston 
330034a9240SMark Johnston 	V_default_cc_ptr = cc;
331034a9240SMark Johnston }
332034a9240SMark Johnston VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
333034a9240SMark Johnston     vnet_cc_sysinit, NULL);
334034a9240SMark Johnston 
335dbc42409SLawrence Stewart /*
336b8d60729SRandall Stewart  * Perform any necessary tasks before we exit congestion recovery.
337b8d60729SRandall Stewart  */
338b8d60729SRandall Stewart void
339b8d60729SRandall Stewart newreno_cc_post_recovery(struct cc_var *ccv)
340b8d60729SRandall Stewart {
341b8d60729SRandall Stewart 	int pipe;
342b8d60729SRandall Stewart 
343b8d60729SRandall Stewart 	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
344b8d60729SRandall Stewart 		/*
345b8d60729SRandall Stewart 		 * Fast recovery will conclude after returning from this
346b8d60729SRandall Stewart 		 * function. Window inflation should have left us with
347b8d60729SRandall Stewart 		 * approximately snd_ssthresh outstanding data. But in case we
348b8d60729SRandall Stewart 		 * would be inclined to send a burst, better to do it via the
349b8d60729SRandall Stewart 		 * slow start mechanism.
350b8d60729SRandall Stewart 		 *
351b8d60729SRandall Stewart 		 * XXXLAS: Find a way to do this without needing curack
352b8d60729SRandall Stewart 		 */
353b8d60729SRandall Stewart 		if (V_tcp_do_newsack)
354b8d60729SRandall Stewart 			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
355b8d60729SRandall Stewart 		else
356b8d60729SRandall Stewart 			pipe = CCV(ccv, snd_max) - ccv->curack;
357b8d60729SRandall Stewart 		if (pipe < CCV(ccv, snd_ssthresh))
358b8d60729SRandall Stewart 			/*
359b8d60729SRandall Stewart 			 * Ensure that cwnd does not collapse to 1 MSS under
360b4fbc855SGordon Bergling 			 * adverse conditions. Implements RFC6582
361b8d60729SRandall Stewart 			 */
362b8d60729SRandall Stewart 			CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
363b8d60729SRandall Stewart 			    CCV(ccv, t_maxseg);
364b8d60729SRandall Stewart 		else
365b8d60729SRandall Stewart 			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
366b8d60729SRandall Stewart 	}
367b8d60729SRandall Stewart }
368b8d60729SRandall Stewart 
369b8d60729SRandall Stewart void
370b8d60729SRandall Stewart newreno_cc_after_idle(struct cc_var *ccv)
371b8d60729SRandall Stewart {
372b8d60729SRandall Stewart 	uint32_t rw;
373b8d60729SRandall Stewart 	/*
374b8d60729SRandall Stewart 	 * If we've been idle for more than one retransmit timeout the old
375b8d60729SRandall Stewart 	 * congestion window is no longer current and we have to reduce it to
376b8d60729SRandall Stewart 	 * the restart window before we can transmit again.
377b8d60729SRandall Stewart 	 *
378b8d60729SRandall Stewart 	 * The restart window is the initial window or the last CWND, whichever
379b8d60729SRandall Stewart 	 * is smaller.
380b8d60729SRandall Stewart 	 *
381b8d60729SRandall Stewart 	 * This is done to prevent us from flooding the path with a full CWND at
382b8d60729SRandall Stewart 	 * wirespeed, overloading router and switch buffers along the way.
383b8d60729SRandall Stewart 	 *
384b8d60729SRandall Stewart 	 * See RFC5681 Section 4.1. "Restarting Idle Connections".
385b8d60729SRandall Stewart 	 *
386b8d60729SRandall Stewart 	 * In addition, per RFC2861 Section 2, the ssthresh is set to the
387b8d60729SRandall Stewart 	 * maximum of the former ssthresh or 3/4 of the old cwnd, to
388b8d60729SRandall Stewart 	 * not exit slow-start prematurely.
389b8d60729SRandall Stewart 	 */
390b8d60729SRandall Stewart 	rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp));
391b8d60729SRandall Stewart 
392b8d60729SRandall Stewart 	CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh),
393b8d60729SRandall Stewart 	    CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2));
394b8d60729SRandall Stewart 
395b8d60729SRandall Stewart 	CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd));
396b8d60729SRandall Stewart }
397b8d60729SRandall Stewart 
398b8d60729SRandall Stewart /*
399b8d60729SRandall Stewart  * Perform any necessary tasks before we enter congestion recovery.
400b8d60729SRandall Stewart  */
401b8d60729SRandall Stewart void
402b8d60729SRandall Stewart newreno_cc_cong_signal(struct cc_var *ccv, uint32_t type)
403b8d60729SRandall Stewart {
404b8d60729SRandall Stewart 	uint32_t cwin, factor;
405b8d60729SRandall Stewart 	u_int mss;
406b8d60729SRandall Stewart 
407b8d60729SRandall Stewart 	cwin = CCV(ccv, snd_cwnd);
408b8d60729SRandall Stewart 	mss = tcp_fixed_maxseg(ccv->ccvc.tcp);
409b8d60729SRandall Stewart 	/*
410b8d60729SRandall Stewart 	 * Other TCP congestion controls use newreno_cong_signal(), but
411b8d60729SRandall Stewart 	 * with their own private cc_data. Make sure the cc_data is used
412b8d60729SRandall Stewart 	 * correctly.
413b8d60729SRandall Stewart 	 */
414b8d60729SRandall Stewart 	factor = V_newreno_beta;
415b8d60729SRandall Stewart 
416b8d60729SRandall Stewart 	/* Catch algos which mistakenly leak private signal types. */
417b8d60729SRandall Stewart 	KASSERT((type & CC_SIGPRIVMASK) == 0,
418b8d60729SRandall Stewart 	    ("%s: congestion signal type 0x%08x is private\n", __func__, type));
419b8d60729SRandall Stewart 
420b8d60729SRandall Stewart 	cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss),
421b8d60729SRandall Stewart 	    2) * mss;
422b8d60729SRandall Stewart 
423b8d60729SRandall Stewart 	switch (type) {
424b8d60729SRandall Stewart 	case CC_NDUPACK:
425b8d60729SRandall Stewart 		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
426b8d60729SRandall Stewart 			if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
427b8d60729SRandall Stewart 				CCV(ccv, snd_ssthresh) = cwin;
428b8d60729SRandall Stewart 			ENTER_RECOVERY(CCV(ccv, t_flags));
429b8d60729SRandall Stewart 		}
430b8d60729SRandall Stewart 		break;
431b8d60729SRandall Stewart 	case CC_ECN:
432b8d60729SRandall Stewart 		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
433b8d60729SRandall Stewart 			CCV(ccv, snd_ssthresh) = cwin;
434b8d60729SRandall Stewart 			CCV(ccv, snd_cwnd) = cwin;
435b8d60729SRandall Stewart 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
436b8d60729SRandall Stewart 		}
437b8d60729SRandall Stewart 		break;
438b8d60729SRandall Stewart 	case CC_RTO:
439b8d60729SRandall Stewart 		CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd),
440b8d60729SRandall Stewart 						 CCV(ccv, snd_cwnd)) / 2 / mss,
441b8d60729SRandall Stewart 					     2) * mss;
442b8d60729SRandall Stewart 		CCV(ccv, snd_cwnd) = mss;
443b8d60729SRandall Stewart 		break;
444b8d60729SRandall Stewart 	}
445b8d60729SRandall Stewart }
446b8d60729SRandall Stewart 
447b8d60729SRandall Stewart void
448b8d60729SRandall Stewart newreno_cc_ack_received(struct cc_var *ccv, uint16_t type)
449b8d60729SRandall Stewart {
450b8d60729SRandall Stewart 	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
451b8d60729SRandall Stewart 	    (ccv->flags & CCF_CWND_LIMITED)) {
452b8d60729SRandall Stewart 		u_int cw = CCV(ccv, snd_cwnd);
453b8d60729SRandall Stewart 		u_int incr = CCV(ccv, t_maxseg);
454b8d60729SRandall Stewart 
455b8d60729SRandall Stewart 		/*
456b8d60729SRandall Stewart 		 * Regular in-order ACK, open the congestion window.
457b8d60729SRandall Stewart 		 * Method depends on which congestion control state we're
458b8d60729SRandall Stewart 		 * in (slow start or cong avoid) and if ABC (RFC 3465) is
459b8d60729SRandall Stewart 		 * enabled.
460b8d60729SRandall Stewart 		 *
461b8d60729SRandall Stewart 		 * slow start: cwnd <= ssthresh
462b8d60729SRandall Stewart 		 * cong avoid: cwnd > ssthresh
463b8d60729SRandall Stewart 		 *
464b8d60729SRandall Stewart 		 * slow start and ABC (RFC 3465):
465b8d60729SRandall Stewart 		 *   Grow cwnd exponentially by the amount of data
466b8d60729SRandall Stewart 		 *   ACKed capping the max increment per ACK to
467b8d60729SRandall Stewart 		 *   (abc_l_var * maxseg) bytes.
468b8d60729SRandall Stewart 		 *
469b8d60729SRandall Stewart 		 * slow start without ABC (RFC 5681):
470b8d60729SRandall Stewart 		 *   Grow cwnd exponentially by maxseg per ACK.
471b8d60729SRandall Stewart 		 *
472b8d60729SRandall Stewart 		 * cong avoid and ABC (RFC 3465):
473b8d60729SRandall Stewart 		 *   Grow cwnd linearly by maxseg per RTT for each
474b8d60729SRandall Stewart 		 *   cwnd worth of ACKed data.
475b8d60729SRandall Stewart 		 *
476b8d60729SRandall Stewart 		 * cong avoid without ABC (RFC 5681):
477b8d60729SRandall Stewart 		 *   Grow cwnd linearly by approximately maxseg per RTT using
478b8d60729SRandall Stewart 		 *   maxseg^2 / cwnd per ACK as the increment.
479b8d60729SRandall Stewart 		 *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
480b8d60729SRandall Stewart 		 *   avoid capping cwnd.
481b8d60729SRandall Stewart 		 */
482b8d60729SRandall Stewart 		if (cw > CCV(ccv, snd_ssthresh)) {
483b8d60729SRandall Stewart 			if (V_tcp_do_rfc3465) {
484b8d60729SRandall Stewart 				if (ccv->flags & CCF_ABC_SENTAWND)
485b8d60729SRandall Stewart 					ccv->flags &= ~CCF_ABC_SENTAWND;
486b8d60729SRandall Stewart 				else
487b8d60729SRandall Stewart 					incr = 0;
488b8d60729SRandall Stewart 			} else
489b8d60729SRandall Stewart 				incr = max((incr * incr / cw), 1);
490b8d60729SRandall Stewart 		} else if (V_tcp_do_rfc3465) {
491b8d60729SRandall Stewart 			/*
492b8d60729SRandall Stewart 			 * In slow-start with ABC enabled and no RTO in sight?
493b8d60729SRandall Stewart 			 * (Must not use abc_l_var > 1 if slow starting after
494b8d60729SRandall Stewart 			 * an RTO. On RTO, snd_nxt = snd_una, so the
495b8d60729SRandall Stewart 			 * snd_nxt == snd_max check is sufficient to
496b8d60729SRandall Stewart 			 * handle this).
497b8d60729SRandall Stewart 			 *
498b8d60729SRandall Stewart 			 * XXXLAS: Find a way to signal SS after RTO that
499b8d60729SRandall Stewart 			 * doesn't rely on tcpcb vars.
500b8d60729SRandall Stewart 			 */
501b8d60729SRandall Stewart 			uint16_t abc_val;
502b8d60729SRandall Stewart 
503b8d60729SRandall Stewart 			if (ccv->flags & CCF_USE_LOCAL_ABC)
504b8d60729SRandall Stewart 				abc_val = ccv->labc;
505b8d60729SRandall Stewart 			else
506b8d60729SRandall Stewart 				abc_val = V_tcp_abc_l_var;
507b8d60729SRandall Stewart 			if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
508b8d60729SRandall Stewart 				incr = min(ccv->bytes_this_ack,
509b8d60729SRandall Stewart 				    ccv->nsegs * abc_val *
510b8d60729SRandall Stewart 				    CCV(ccv, t_maxseg));
511b8d60729SRandall Stewart 			else
512b8d60729SRandall Stewart 				incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
513b8d60729SRandall Stewart 
514b8d60729SRandall Stewart 		}
515b8d60729SRandall Stewart 		/* ABC is on by default, so incr equals 0 frequently. */
516b8d60729SRandall Stewart 		if (incr > 0)
517b8d60729SRandall Stewart 			CCV(ccv, snd_cwnd) = min(cw + incr,
518b8d60729SRandall Stewart 			    TCP_MAXWIN << CCV(ccv, snd_scale));
519b8d60729SRandall Stewart 	}
520b8d60729SRandall Stewart }
521b8d60729SRandall Stewart 
522b8d60729SRandall Stewart /*
523dbc42409SLawrence Stewart  * Handles kld related events. Returns 0 on success, non-zero on failure.
524dbc42409SLawrence Stewart  */
525dbc42409SLawrence Stewart int
526dbc42409SLawrence Stewart cc_modevent(module_t mod, int event_type, void *data)
527dbc42409SLawrence Stewart {
528dbc42409SLawrence Stewart 	struct cc_algo *algo;
529dbc42409SLawrence Stewart 	int err;
530dbc42409SLawrence Stewart 
531dbc42409SLawrence Stewart 	err = 0;
532dbc42409SLawrence Stewart 	algo = (struct cc_algo *)data;
533dbc42409SLawrence Stewart 
534dbc42409SLawrence Stewart 	switch(event_type) {
535dbc42409SLawrence Stewart 	case MOD_LOAD:
536b8d60729SRandall Stewart 		if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) {
537b8d60729SRandall Stewart 			/*
538b8d60729SRandall Stewart 			 * A module must have a cc_data_sz function
539b8d60729SRandall Stewart 			 * even if it has no data it should return 0.
540b8d60729SRandall Stewart 			 */
541b8d60729SRandall Stewart 			printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n");
542b8d60729SRandall Stewart 			err = EINVAL;
543b8d60729SRandall Stewart 			break;
544b8d60729SRandall Stewart 		}
545dbc42409SLawrence Stewart 		if (algo->mod_init != NULL)
546dbc42409SLawrence Stewart 			err = algo->mod_init();
547dbc42409SLawrence Stewart 		if (!err)
548dbc42409SLawrence Stewart 			err = cc_register_algo(algo);
549dbc42409SLawrence Stewart 		break;
550dbc42409SLawrence Stewart 
551dbc42409SLawrence Stewart 	case MOD_QUIESCE:
552dbc42409SLawrence Stewart 	case MOD_SHUTDOWN:
553dbc42409SLawrence Stewart 	case MOD_UNLOAD:
554dbc42409SLawrence Stewart 		err = cc_deregister_algo(algo);
555dbc42409SLawrence Stewart 		if (!err && algo->mod_destroy != NULL)
556dbc42409SLawrence Stewart 			algo->mod_destroy();
557dbc42409SLawrence Stewart 		if (err == ENOENT)
558dbc42409SLawrence Stewart 			err = 0;
559dbc42409SLawrence Stewart 		break;
560dbc42409SLawrence Stewart 
561dbc42409SLawrence Stewart 	default:
562dbc42409SLawrence Stewart 		err = EINVAL;
563dbc42409SLawrence Stewart 		break;
564dbc42409SLawrence Stewart 	}
565dbc42409SLawrence Stewart 
566dbc42409SLawrence Stewart 	return (err);
567dbc42409SLawrence Stewart }
568dbc42409SLawrence Stewart 
56914f57a8bSLawrence Stewart SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);
57014f57a8bSLawrence Stewart 
571dbc42409SLawrence Stewart /* Declare sysctl tree and populate it. */
5727029da5cSPawel Biernacki SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
573439e76ecSBrad Davis     "Congestion control related settings");
574dbc42409SLawrence Stewart 
5756df8a710SGleb Smirnoff SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm,
5767029da5cSPawel Biernacki     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
5777029da5cSPawel Biernacki     NULL, 0, cc_default_algo, "A",
5787029da5cSPawel Biernacki     "Default congestion control algorithm");
579dbc42409SLawrence Stewart 
5807029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available,
5817029da5cSPawel Biernacki     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
582dbc42409SLawrence Stewart     NULL, 0, cc_list_available, "A",
583439e76ecSBrad Davis     "List available congestion control algorithms");
584370efe5aSLawrence Stewart 
585370efe5aSLawrence Stewart VNET_DEFINE(int, cc_do_abe) = 0;
586370efe5aSLawrence Stewart SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW,
587370efe5aSLawrence Stewart     &VNET_NAME(cc_do_abe), 0,
588370efe5aSLawrence Stewart     "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)");
589370efe5aSLawrence Stewart 
590370efe5aSLawrence Stewart VNET_DEFINE(int, cc_abe_frlossreduce) = 0;
591370efe5aSLawrence Stewart SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW,
592370efe5aSLawrence Stewart     &VNET_NAME(cc_abe_frlossreduce), 0,
593370efe5aSLawrence Stewart     "Apply standard beta instead of ABE-beta during ECN-signalled congestion "
594370efe5aSLawrence Stewart     "recovery episodes if loss also needs to be repaired");
595