/*
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley. More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/time.h>

#include <inet/tcp_impl.h>
#include <inet/cc.h>
#include <inet/cc/cc_cubic.h>
#include <inet/cc/cc_module.h>

static struct modlmisc cc_cubic_modlmisc = {
	&mod_miscops,
	"Cubic Congestion Control"
};

static struct modlinkage cc_cubic_modlinkage = {
	MODREV_1,
	&cc_cubic_modlmisc,
	NULL
};

/*
 * cubic uses the NewReno implementation of after_idle and uses NewReno's
 * ack_received callback during slow start.
 */
static struct cc_algo *newreno_cc_algo;

static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void	cubic_cb_destroy(struct cc_var *ccv);
static int	cubic_cb_init(struct cc_var *ccv);
static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void	cubic_conn_init(struct cc_var *ccv);
static void	cubic_post_recovery(struct cc_var *ccv);
static void	cubic_record_rtt(struct cc_var *ccv);
static void	cubic_ssthresh_update(struct cc_var *ccv);

struct cubic {
	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in nanoseconds. */
	hrtime_t	sum_rtt_nsecs;
	/* cwnd at the most recent congestion event. */
	uint32_t	max_cwnd;
	/* cwnd at the previous congestion event. */
	uint32_t	prev_max_cwnd;
	/* Number of congestion events. */
	uint32_t	num_cong_events;
	/* Minimum observed rtt in nanoseconds. */
	hrtime_t	min_rtt_nsecs;
	/* Mean observed rtt between congestion epochs. */
	hrtime_t	mean_rtt_nsecs;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Time of last congestion event in nanoseconds. */
	hrtime_t	t_last_cong;
};
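
/*
 * For reference, the window growth function from the I-D is
 *
 *	W_cubic(t) = C * (t - K)^3 + W_max
 *	K = cubic_root(W_max * (1 - beta) / C)
 *
 * where t is the time since the last congestion event, W_max is the window
 * size when that event occurred, beta is the multiplicative decrease factor
 * and C is a scaling constant. The helpers in cc_cubic.h (cubic_cwnd(),
 * tf_cwnd() and cubic_k()) are expected to evaluate these equations (and the
 * I-D's TCP-friendly window estimate) in CUBIC_SHIFT fixed point, converting
 * between bytes and segments via the MSS; the fields above hold the state
 * they need between ACKs.
 */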

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.post_recovery = cubic_post_recovery,
};

int
_init(void)
{
	int err;

	if ((newreno_cc_algo = cc_load_algo("newreno")) == NULL)
		return (EINVAL);

	if ((err = cc_register_algo(&cubic_cc_algo)) == 0) {
		if ((err = mod_install(&cc_cubic_modlinkage)) != 0)
			(void) cc_deregister_algo(&cubic_cc_algo);
	}
	cubic_cc_algo.after_idle = newreno_cc_algo->after_idle;
	return (err);
}

int
_fini(void)
{
	/* XXX Not unloadable for now */
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&cc_cubic_modlinkage, modinfop));
}

static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	uint32_t w_tf, w_cubic_next;
	hrtime_t nsecs_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	/*
	 * Only grow cwnd on a regular ACK when we're not in congestion/fast
	 * recovery, cwnd is the limiting factor, and either ABC is disabled,
	 * we're still in slow start, or ABC is enabled and a full cwnd's
	 * worth of bytes has been ACKed.
	 */
	if (type == CC_ACK && !IN_RECOVERY(ccv->flags) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!CC_ABC(ccv) ||
	    CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
	    (CC_ABC(ccv) && (ccv->flags & CCF_ABC_SENTAWND)))) {
		/* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)
			newreno_cc_algo->ack_received(ccv, type);
		else {
			nsecs_since_cong = gethrtime() -
			    cubic_data->t_last_cong;

			/*
			 * The mean RTT is used to best reflect the equations in
			 * the I-D. Using min_rtt in the tf_cwnd calculation
			 * causes w_tf to grow much faster than it should if the
			 * RTT is dominated by network buffering rather than
			 * propagation delay.
			 */
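			/*
			 * Roughly, w_tf below is the I-D's TCP-friendly
			 * window estimate (what standard TCP would have
			 * grown to by now) and w_cubic_next is W_cubic(t)
			 * evaluated one mean RTT into the future, i.e.
			 * C * (t + RTT - K)^3 + W_max, with both results
			 * scaled from segments to bytes via the MSS.
			 */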
			w_tf = tf_cwnd(nsecs_since_cong,
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss));

			w_cubic_next = cubic_cwnd(nsecs_since_cong +
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

			if (w_cubic_next < w_tf) {
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				CCV(ccv, tcp_cwnd) = w_tf;
			} else if (CCV(ccv, tcp_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 */
				if (CC_ABC(ccv))
					CCV(ccv, tcp_cwnd) = w_cubic_next;
				else
					CCV(ccv, tcp_cwnd) += ((w_cubic_next -
					    CCV(ccv, tcp_cwnd)) *
					    CCV(ccv, tcp_mss)) /
					    CCV(ccv, tcp_cwnd);
			}
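
			/*
			 * In the non-ABC case above, each ACK adds
			 * (w_cubic_next - cwnd) * mss / cwnd bytes, so cwnd
			 * converges on w_cubic_next over roughly one window's
			 * worth of ACKs (about one RTT).
			 */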

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when hostcache has a relevant entry),
			 * keep updating our current estimate of the
			 * max_cwnd.
			 */
			if (cubic_data->num_cong_events == 0 &&
			    cubic_data->max_cwnd < CCV(ccv, tcp_cwnd))
				cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
		}
	}
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{

	if (ccv->cc_data != NULL)
		kmem_free(ccv->cc_data, sizeof (struct cubic));
}

static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = kmem_zalloc(sizeof (struct cubic), KM_NOSLEEP);

	if (cubic_data == NULL)
		return (ENOMEM);

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = gethrtime();
	cubic_data->min_rtt_nsecs = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_nsecs = 1;

	ccv->cc_data = cubic_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;
	uint32_t cwin;
	uint32_t mss;

	cubic_data = ccv->cc_data;
	cwin = CCV(ccv, tcp_cwnd);
	mss = CCV(ccv, tcp_mss);

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(ccv->flags)) {
			if (!IN_CONGRECOVERY(ccv->flags)) {
				cubic_ssthresh_update(ccv);
				cubic_data->num_cong_events++;
				cubic_data->prev_max_cwnd =
				    cubic_data->max_cwnd;
				cubic_data->max_cwnd = cwin;
				CCV(ccv, tcp_cwnd) =
				    CCV(ccv, tcp_cwnd_ssthresh);
			}
			ENTER_RECOVERY(ccv->flags);
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(ccv->flags)) {
			cubic_ssthresh_update(ccv);
			cubic_data->num_cong_events++;
			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
			cubic_data->max_cwnd = cwin;
			cubic_data->t_last_cong = gethrtime();
			CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
			ENTER_CONGRECOVERY(ccv->flags);
		}
		break;

	case CC_RTO:
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was. Only record it when the
		 * timeout has fired more than once, as there is a reasonable
		 * chance the first one is a false alarm and may not indicate
		 * congestion.
		 */
		if (CCV(ccv, tcp_timer_backoff) >= 2) {
			cubic_data->num_cong_events++;
			cubic_data->t_last_cong = gethrtime();
			cubic_ssthresh_update(ccv);
			cubic_data->max_cwnd = cwin;
			CCV(ccv, tcp_cwnd) = mss;
		}
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded. Without
	 * it, the cubic window calculations would start from a meaningless
	 * max_cwnd when entries from the TCP hostcache seed the initial
	 * cwnd.
	 */
	cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/* Fast convergence heuristic. */
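	/*
	 * Per the I-D, when the new max_cwnd is lower than the previous one,
	 * the flow is likely sharing the path with newer flows, so max_cwnd
	 * is reduced further to (1 + beta) / 2 of its value to release
	 * bandwidth sooner; CUBIC_FC_FACTOR is expected to encode that factor
	 * in CUBIC_SHIFT fixed point.
	 */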
	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) {
		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
		    >> CUBIC_SHIFT;
	}

	if (IN_FASTRECOVERY(ccv->flags)) {
		/* Update cwnd based on beta and adjusted max_cwnd. */
		CCV(ccv, tcp_cwnd) = max(1, ((CUBIC_BETA *
		    cubic_data->max_cwnd) >> CUBIC_SHIFT));
	}
	cubic_data->t_last_cong = gethrtime();

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_nsecs >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_nsecs =
		    (cubic_data->sum_rtt_nsecs / cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_nsecs = 0;
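
	/*
	 * Recompute K for the new epoch. cubic_k() is expected to return
	 * cubic_root(W_max * (1 - beta) / C) in CUBIC_SHIFT fixed point,
	 * with W_max expressed in MSS-sized segments, hence the division of
	 * max_cwnd by the MSS below.
	 */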
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss));
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_nsecs;

	/* Ignore srtt until a min number of samples have been taken. */
	if (CCV(ccv, tcp_rtt_update) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		/* tcp_rtt_sa is 8 * smoothed RTT in nanoseconds */
		t_srtt_nsecs = CCV(ccv, tcp_rtt_sa) >> 3;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialized
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_nsecs < cubic_data->min_rtt_nsecs ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)) {
			cubic_data->min_rtt_nsecs = max(1, t_srtt_nsecs);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_nsecs with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_nsecs >
			    cubic_data->mean_rtt_nsecs)
				cubic_data->mean_rtt_nsecs =
				    cubic_data->min_rtt_nsecs;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_nsecs += t_srtt_nsecs;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5, on
	 * subsequent congestion events, set it to cwnd * beta.
	 */
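	/*
	 * For example, assuming CUBIC_BETA encodes roughly 0.8 in CUBIC_SHIFT
	 * fixed point, (cwnd * CUBIC_BETA) >> CUBIC_SHIFT below works out to
	 * about 0.8 * cwnd, versus the exact cwnd / 2 used for the first
	 * event.
	 */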
	if (cubic_data->num_cong_events == 0)
		CCV(ccv, tcp_cwnd_ssthresh) = CCV(ccv, tcp_cwnd) >> 1;
	else
		CCV(ccv, tcp_cwnd_ssthresh) =
		    (CCV(ccv, tcp_cwnd) * CUBIC_BETA) >> CUBIC_SHIFT;
}