/*
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley. More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/time.h>

#include <inet/tcp_impl.h>
#include <inet/cc.h>
#include <inet/cc/cc_cubic.h>
#include <inet/cc/cc_module.h>

static struct modlmisc cc_cubic_modlmisc = {
	&mod_miscops,
	"Cubic Congestion Control"
};

static struct modlinkage cc_cubic_modlinkage = {
	MODREV_1,
	&cc_cubic_modlmisc,
	NULL
};

/*
 * Cubic delegates to the NewReno implementation for after_idle handling
 * and for ack_received processing during slow start.
 */
static struct cc_algo *newreno_cc_algo;

static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void	cubic_cb_destroy(struct cc_var *ccv);
static int	cubic_cb_init(struct cc_var *ccv);
static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void	cubic_conn_init(struct cc_var *ccv);
static void	cubic_post_recovery(struct cc_var *ccv);
static void	cubic_record_rtt(struct cc_var *ccv);
static void	cubic_ssthresh_update(struct cc_var *ccv);
static void	cubic_after_idle(struct cc_var *ccv);

struct cubic {
	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in nanoseconds. */
	hrtime_t	sum_rtt_nsecs;
	/* cwnd at the most recent congestion event. */
	uint32_t	max_cwnd;
	/* cwnd at the previous congestion event. */
	uint32_t	prev_max_cwnd;
	/* Number of congestion events. */
	uint32_t	num_cong_events;
	/* Minimum observed rtt in nanoseconds. */
	hrtime_t	min_rtt_nsecs;
	/* Mean observed rtt between congestion epochs. */
	hrtime_t	mean_rtt_nsecs;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Time of last congestion event in nanoseconds. */
	hrtime_t	t_last_cong;
};

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.post_recovery = cubic_post_recovery,
	.after_idle = cubic_after_idle,
};

int
_init(void)
{
	int err;

	if ((newreno_cc_algo = cc_load_algo("newreno")) == NULL)
		return (EINVAL);

	if ((err = cc_register_algo(&cubic_cc_algo)) == 0) {
		if ((err = mod_install(&cc_cubic_modlinkage)) != 0)
			(void) cc_deregister_algo(&cubic_cc_algo);
	}

	return (err);
}

int
_fini(void)
{
	/* XXX Not unloadable for now */
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&cc_cubic_modlinkage, modinfop));
}

static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	uint32_t w_tf, w_cubic_next;
	hrtime_t nsecs_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	/*
	 * Grow the window only for a regular ACK when we are not in
	 * congestion/fast recovery, we are cwnd limited, and one of the
	 * following holds: ABC (Appropriate Byte Counting, RFC 3465) is
	 * disabled, we are in slow start, or ABC is enabled and a cwnd's
	 * worth of bytes has been ACKed.
	 */
	if (type == CC_ACK && !IN_RECOVERY(ccv->flags) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!CC_ABC(ccv) ||
	    CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
	    (CC_ABC(ccv) && (ccv->flags & CCF_ABC_SENTAWND)))) {
		/* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)
			newreno_cc_algo->ack_received(ccv, type);
		else {
			nsecs_since_cong = gethrtime() -
			    cubic_data->t_last_cong;

			/*
			 * The mean RTT is used to best reflect the equations in
			 * the I-D. Using min_rtt in the tf_cwnd calculation
			 * causes w_tf to grow much faster than it should if the
			 * RTT is dominated by network buffering rather than
			 * propagation delay.
			 */
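			/*
			 * w_tf estimates the cwnd a standard AIMD
			 * (NewReno-style) flow would have reached by
			 * now, while w_cubic_next evaluates the cubic
			 * window function from the I-D,
			 * w_cubic(t) = C * (t - K)^3 + w_max,
			 * one mean RTT ahead of the current time.
			 */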
			w_tf = tf_cwnd(nsecs_since_cong,
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss));

			w_cubic_next = cubic_cwnd(nsecs_since_cong +
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

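			/*
			 * Track whichever estimate is larger: stay on
			 * the TCP-friendly estimate while it leads,
			 * otherwise step cwnd toward the cubic target
			 * (without ABC, by roughly (w_cubic_next -
			 * cwnd) * mss / cwnd bytes per ACK).
			 */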
			if (w_cubic_next < w_tf) {
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				if (CCV(ccv, tcp_cwnd) < w_tf)
					CCV(ccv, tcp_cwnd) = w_tf;
			} else if (CCV(ccv, tcp_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 */
				if (CC_ABC(ccv))
					CCV(ccv, tcp_cwnd) = MIN(w_cubic_next,
					    INT_MAX);
				else
					CCV(ccv, tcp_cwnd) += MAX(1,
					    ((MIN(w_cubic_next, INT_MAX) -
					    CCV(ccv, tcp_cwnd)) *
					    CCV(ccv, tcp_mss)) /
					    CCV(ccv, tcp_cwnd));
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (which happens when the hostcache has a relevant
			 * entry), keep updating our current estimate of
			 * max_cwnd.
			 */
			if (cubic_data->num_cong_events == 0 &&
			    cubic_data->max_cwnd < CCV(ccv, tcp_cwnd)) {
				cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
				cubic_data->K = cubic_k(cubic_data->max_cwnd /
				    CCV(ccv, tcp_mss));
			}
		}
	}
}

/*
 * This is a Cubic-specific implementation of after_idle.
 *   - Reset cwnd by calling the NewReno implementation of after_idle.
 *   - Reset t_last_cong.
 */
static void
cubic_after_idle(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

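	/*
	 * Fold the pre-idle cwnd into max_cwnd and recompute K; together
	 * with the t_last_cong reset below, this starts a fresh cubic epoch
	 * that grows back toward the window held before going idle.
	 */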
	cubic_data->max_cwnd = max(cubic_data->max_cwnd, CCV(ccv, tcp_cwnd));
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss));

	newreno_cc_algo->after_idle(ccv);
	cubic_data->t_last_cong = gethrtime();
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{

	if (ccv->cc_data != NULL)
		kmem_free(ccv->cc_data, sizeof (struct cubic));
}

static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = kmem_zalloc(sizeof (struct cubic), KM_NOSLEEP);

	if (cubic_data == NULL)
		return (ENOMEM);

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = gethrtime();
	cubic_data->min_rtt_nsecs = TCPTV_SRTTBASE;
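	/*
	 * The 1ns mean RTT is only a placeholder; cubic_record_rtt()
	 * replaces it once enough RTT samples have been taken.
	 */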
	cubic_data->mean_rtt_nsecs = 1;

	ccv->cc_data = cubic_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;
	uint32_t cwin;
	uint32_t mss;

	cubic_data = ccv->cc_data;
	cwin = CCV(ccv, tcp_cwnd);
	mss = CCV(ccv, tcp_mss);

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(ccv->flags)) {
			if (!IN_CONGRECOVERY(ccv->flags)) {
				cubic_ssthresh_update(ccv);
				cubic_data->num_cong_events++;
				cubic_data->prev_max_cwnd =
				    cubic_data->max_cwnd;
				cubic_data->max_cwnd = cwin;
				CCV(ccv, tcp_cwnd) =
				    CCV(ccv, tcp_cwnd_ssthresh);
			}
			ENTER_RECOVERY(ccv->flags);
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(ccv->flags)) {
			cubic_ssthresh_update(ccv);
			cubic_data->num_cong_events++;
			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
			cubic_data->max_cwnd = cwin;
			cubic_data->t_last_cong = gethrtime();
			CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
			ENTER_CONGRECOVERY(ccv->flags);
		}
		break;

	case CC_RTO:
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was.
		 */
		cubic_data->num_cong_events++;
		cubic_data->t_last_cong = gethrtime();
		cubic_ssthresh_update(ccv);
		cubic_data->max_cwnd = cwin;
		CCV(ccv, tcp_cwnd) = mss;
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded. Without
	 * it, bad things happen when entries from the TCP hostcache are used.
	 */
	cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	uint32_t mss, pipe;

	cubic_data = ccv->cc_data;

	/*
	 * Fast convergence heuristic: if this congestion event occurred at
	 * a lower window than the previous one, another flow is likely
	 * competing for bandwidth, so scale max_cwnd down further (by
	 * CUBIC_FC_FACTOR) to release capacity more quickly.
	 */
	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) {
		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
		    >> CUBIC_SHIFT;
	}

	/*
	 * There is a risk that if the cwnd becomes less than mss, and
	 * we do not get enough acks to drive it back up beyond mss,
	 * we will stop transmitting data altogether.
	 *
	 * The Cubic RFC defines values in terms of units of mss. Therefore
	 * we must make sure we have at least 1 mss to make progress, since
	 * the algorithm is written that way.
	 */
	mss = CCV(ccv, tcp_mss);

	if (IN_FASTRECOVERY(ccv->flags)) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in
		 * the NewReno RFC. Otherwise, use the CUBIC method.
		 */
		pipe = CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna);
		if (pipe < CCV(ccv, tcp_cwnd_ssthresh)) {
			/*
			 * Ensure that cwnd does not collapse to 1 MSS under
			 * adverse conditions. Implements RFC 6582.
			 */
			CCV(ccv, tcp_cwnd) = MAX(pipe, mss) + mss;
		} else {
			/* Update cwnd based on beta and adjusted max_cwnd. */
			CCV(ccv, tcp_cwnd) = max(mss, ((CUBIC_BETA *
			    cubic_data->max_cwnd) >> CUBIC_SHIFT));
		}
	} else {
		CCV(ccv, tcp_cwnd) = max(mss, CCV(ccv, tcp_cwnd));
	}

	cubic_data->t_last_cong = gethrtime();

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_nsecs >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_nsecs =
		    (cubic_data->sum_rtt_nsecs / cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_nsecs = 0;
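	/*
	 * Recompute K for the new congestion epoch; K is the time offset at
	 * which the cubic window function climbs back up to max_cwnd
	 * (expressed in units of mss).
	 */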
	cubic_data->K = cubic_k(cubic_data->max_cwnd / mss);
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_nsecs;

	/* Ignore srtt until a min number of samples have been taken. */
	if (CCV(ccv, tcp_rtt_update) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		/* tcp_rtt_sa is 8 * smoothed RTT in nanoseconds */
		t_srtt_nsecs = CCV(ccv, tcp_rtt_sa) >> 3;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialized
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_nsecs < cubic_data->min_rtt_nsecs ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)) {
			cubic_data->min_rtt_nsecs = max(1, t_srtt_nsecs);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_nsecs with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_nsecs >
			    cubic_data->mean_rtt_nsecs)
				cubic_data->mean_rtt_nsecs =
				    cubic_data->min_rtt_nsecs;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_nsecs += t_srtt_nsecs;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5; on
	 * subsequent congestion events, set it to cwnd * beta (CUBIC_BETA).
	 */
	if (cubic_data->num_cong_events == 0)
		CCV(ccv, tcp_cwnd_ssthresh) = CCV(ccv, tcp_cwnd) >> 1;
	else
		CCV(ccv, tcp_cwnd_ssthresh) =
		    (CCV(ccv, tcp_cwnd) * CUBIC_BETA) >> CUBIC_SHIFT;
}