/*
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley. More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */
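
/*
 * For reference, a sketch of the two window estimates the I-D works with
 * (notation follows the draft; the fixed-point helpers in cc_cubic.h are
 * assumed to implement scaled versions of these):
 *
 *	W_cubic(t) = C * (t - K)^3 + W_max
 *	K          = cbrt(W_max * (1 - beta) / C)
 *	W_tf(t)    = W_max * beta + (3 * (1 - beta) / (1 + beta)) * t / RTT
 *
 * where W_max is the window at the last congestion event (in MSS units in
 * the draft), t is the time since that event, beta is the multiplicative
 * decrease factor and C is a scaling constant. cwnd follows whichever of
 * W_cubic and W_tf the draft prescribes for the current region.
 */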

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/time.h>

#include <inet/tcp_impl.h>
#include <inet/cc.h>
#include <inet/cc/cc_cubic.h>
#include <inet/cc/cc_module.h>

static struct modlmisc cc_cubic_modlmisc = {
	&mod_miscops,
	"Cubic Congestion Control"
};

static struct modlinkage cc_cubic_modlinkage = {
	MODREV_1,
	&cc_cubic_modlmisc,
	NULL
};

/*
 * CUBIC leans on the NewReno implementation: its after_idle callback
 * delegates to NewReno's, and it uses NewReno's ack_received callback
 * during slow start.
 */
static struct cc_algo *newreno_cc_algo;

static void cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void cubic_cb_destroy(struct cc_var *ccv);
static int cubic_cb_init(struct cc_var *ccv);
static void cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void cubic_conn_init(struct cc_var *ccv);
static void cubic_post_recovery(struct cc_var *ccv);
static void cubic_record_rtt(struct cc_var *ccv);
static void cubic_ssthresh_update(struct cc_var *ccv);
static void cubic_after_idle(struct cc_var *ccv);

struct cubic {
	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in nanoseconds. */
	hrtime_t	sum_rtt_nsecs;
	/* cwnd at the most recent congestion event. */
	uint32_t	max_cwnd;
	/* cwnd at the previous congestion event. */
	uint32_t	prev_max_cwnd;
	/* Number of congestion events. */
	uint32_t	num_cong_events;
	/* Minimum observed rtt in nanoseconds. */
	hrtime_t	min_rtt_nsecs;
	/* Mean observed rtt between congestion epochs. */
	hrtime_t	mean_rtt_nsecs;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Time of last congestion event in nanoseconds. */
	hrtime_t	t_last_cong;
};
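
/*
 * A note on the fixed-point arithmetic used throughout: a quantity x is
 * stored as (x << CUBIC_SHIFT), i.e. x * 2^CUBIC_SHIFT, so the product of
 * a plain integer and a fixed-point constant (such as CUBIC_BETA or
 * CUBIC_FC_FACTOR below) must be rescaled with >> CUBIC_SHIFT to recover
 * an ordinary value. K above is kept in this form as well; it is computed
 * by cubic_k() from max_cwnd expressed in MSS units.
 */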

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.post_recovery = cubic_post_recovery,
	.after_idle = cubic_after_idle,
};

int
_init(void)
{
	int err;

	if ((newreno_cc_algo = cc_load_algo("newreno")) == NULL)
		return (EINVAL);

	if ((err = cc_register_algo(&cubic_cc_algo)) == 0) {
		if ((err = mod_install(&cc_cubic_modlinkage)) != 0)
			(void) cc_deregister_algo(&cubic_cc_algo);
	}

	return (err);
}

int
_fini(void)
{
	/* XXX Not unloadable for now */
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&cc_cubic_modlinkage, modinfop));
}

static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	uint32_t w_tf, w_cubic_next;
	hrtime_t nsecs_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	/*
	 * Proceed only for a regular ACK while we're not in congestion or
	 * fast recovery and are cwnd limited, and additionally either ABC
	 * is disabled, we're in slow start, or ABC is enabled and we've
	 * sent a cwnd's worth of bytes.
	 */
	if (type == CC_ACK && !IN_RECOVERY(ccv->flags) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!CC_ABC(ccv) ||
	    CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
	    (CC_ABC(ccv) && (ccv->flags & CCF_ABC_SENTAWND)))) {
		/* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)
			newreno_cc_algo->ack_received(ccv, type);
		else {
			nsecs_since_cong = gethrtime() -
			    cubic_data->t_last_cong;

			/*
			 * The mean RTT is used to best reflect the equations
			 * in the I-D. Using min_rtt in the tf_cwnd
			 * calculation causes w_tf to grow much faster than
			 * it should if the RTT is dominated by network
			 * buffering rather than propagation delay.
			 */
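			/*
			 * A sketch of what tf_cwnd() is assumed to compute,
			 * per the I-D (scaled from MSS units to bytes by the
			 * helper):
			 *
			 *	W_tf(t) = W_max * beta +
			 *	    (3 * (1 - beta) / (1 + beta)) * t / RTT
			 *
			 * i.e. the window a standard AIMD flow would have
			 * grown to in time t since the last congestion event.
			 */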
			w_tf = tf_cwnd(nsecs_since_cong,
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss));

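			/*
			 * cubic_cwnd() is assumed to evaluate the cubic
			 * function W_cubic sketched at the top of this file.
			 * Passing (t + mean RTT) asks for the window one
			 * mean RTT into the future, i.e. the target we want
			 * cwnd to reach over the next RTT.
			 */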
			w_cubic_next = cubic_cwnd(nsecs_since_cong +
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

			if (w_cubic_next < w_tf) {
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				if (CCV(ccv, tcp_cwnd) < w_tf)
					CCV(ccv, tcp_cwnd) = w_tf;
			} else if (CCV(ccv, tcp_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 */
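				/*
				 * With ABC, cwnd is set directly to the
				 * (clamped) target below; without ABC the
				 * per-ACK increment is (target - cwnd)
				 * scaled by mss/cwnd, so cwnd approaches the
				 * target over roughly a window's worth of
				 * ACKs rather than jumping there at once.
				 */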
				if (CC_ABC(ccv))
					CCV(ccv, tcp_cwnd) = MIN(w_cubic_next,
					    INT_MAX);
				else
					CCV(ccv, tcp_cwnd) += MAX(1,
					    ((MIN(w_cubic_next, INT_MAX) -
					    CCV(ccv, tcp_cwnd)) *
					    CCV(ccv, tcp_mss)) /
					    CCV(ccv, tcp_cwnd));
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when hostcache has a relevant entry),
			 * keep updating our current estimate of max_cwnd.
			 */
			if (cubic_data->num_cong_events == 0 &&
			    cubic_data->max_cwnd < CCV(ccv, tcp_cwnd)) {
				cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
				cubic_data->K = cubic_k(cubic_data->max_cwnd /
				    CCV(ccv, tcp_mss));
			}
		}
	}
}

/*
 * CUBIC-specific implementation of after_idle:
 * - Update max_cwnd (and hence K) against the current cwnd.
 * - Reset cwnd by calling the NewReno implementation of after_idle.
 * - Reset t_last_cong.
 */
static void
cubic_after_idle(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	cubic_data->max_cwnd = max(cubic_data->max_cwnd, CCV(ccv, tcp_cwnd));
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss));

	newreno_cc_algo->after_idle(ccv);
	cubic_data->t_last_cong = gethrtime();
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{

	if (ccv->cc_data != NULL)
		kmem_free(ccv->cc_data, sizeof (struct cubic));
}

static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = kmem_zalloc(sizeof (struct cubic), KM_NOSLEEP);

	if (cubic_data == NULL)
		return (ENOMEM);

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = gethrtime();
	cubic_data->min_rtt_nsecs = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_nsecs = 1;

	ccv->cc_data = cubic_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;
	uint32_t cwin;
	uint32_t mss;

	cubic_data = ccv->cc_data;
	cwin = CCV(ccv, tcp_cwnd);
	mss = CCV(ccv, tcp_mss);

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(ccv->flags)) {
			if (!IN_CONGRECOVERY(ccv->flags)) {
				cubic_ssthresh_update(ccv);
				cubic_data->num_cong_events++;
				cubic_data->prev_max_cwnd =
				    cubic_data->max_cwnd;
				cubic_data->max_cwnd = cwin;
				CCV(ccv, tcp_cwnd) =
				    CCV(ccv, tcp_cwnd_ssthresh);
			}
			ENTER_RECOVERY(ccv->flags);
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(ccv->flags)) {
			cubic_ssthresh_update(ccv);
			cubic_data->num_cong_events++;
			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
			cubic_data->max_cwnd = cwin;
			cubic_data->t_last_cong = gethrtime();
			CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
			ENTER_CONGRECOVERY(ccv->flags);
		}
		break;

	case CC_RTO:
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was.
		 */
		cubic_data->num_cong_events++;
		cubic_data->t_last_cong = gethrtime();
		cubic_ssthresh_update(ccv);
		cubic_data->max_cwnd = cwin;
		CCV(ccv, tcp_cwnd) = mss;
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded.
	 * Without it, bad things happen when entries from the TCP
	 * hostcache get used.
	 */
	cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	uint32_t mss, pipe;

	cubic_data = ccv->cc_data;

	/* Fast convergence heuristic. */
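	/*
	 * Per the I-D, a congestion event at a window smaller than the
	 * previous one suggests new flows are competing for bandwidth, so
	 * W_max is reduced further to release bandwidth sooner.
	 * CUBIC_FC_FACTOR (see cc_cubic.h) is assumed to encode the
	 * draft's (1 + beta) / 2 scaling in fixed point.
	 */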
	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) {
		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
		    >> CUBIC_SHIFT;
	}

	/*
	 * There is a risk that if the cwnd becomes less than mss, and
	 * we do not get enough acks to drive it back up beyond mss,
	 * we will stop transmitting data altogether.
	 *
	 * The CUBIC RFC defines values in terms of units of mss. Therefore
	 * we must make sure we have at least 1 mss to make progress,
	 * since the algorithm is written that way.
	 */
	mss = CCV(ccv, tcp_mss);

	if (IN_FASTRECOVERY(ccv->flags)) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in
		 * the NewReno RFC. Otherwise, use the CUBIC method.
		 */
		pipe = CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna);
		if (pipe < CCV(ccv, tcp_cwnd_ssthresh)) {
			/*
			 * Ensure that cwnd does not collapse to 1 MSS under
			 * adverse conditions. Implements RFC 6582.
			 */
			CCV(ccv, tcp_cwnd) = MAX(pipe, mss) + mss;
		} else {
			/* Update cwnd based on beta and adjusted max_cwnd. */
			CCV(ccv, tcp_cwnd) = max(mss, ((CUBIC_BETA *
			    cubic_data->max_cwnd) >> CUBIC_SHIFT));
		}
	} else {
		CCV(ccv, tcp_cwnd) = max(mss, CCV(ccv, tcp_cwnd));
	}

	cubic_data->t_last_cong = gethrtime();

	/* Calculate the average RTT between congestion epochs. */
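	/*
	 * The sum >= count check below keeps the integer division from
	 * producing a zero mean when only a few very small samples were
	 * collected this epoch; mean_rtt_nsecs must stay at least 1, its
	 * initial value from cubic_cb_init().
	 */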
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_nsecs >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_nsecs =
		    (cubic_data->sum_rtt_nsecs / cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_nsecs = 0;
	cubic_data->K = cubic_k(cubic_data->max_cwnd / mss);
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_nsecs;

	/* Ignore srtt until a minimum number of samples have been taken. */
	if (CCV(ccv, tcp_rtt_update) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		/* tcp_rtt_sa is 8 * smoothed RTT in nanoseconds. */
		t_srtt_nsecs = CCV(ccv, tcp_rtt_sa) >> 3;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialized
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_nsecs < cubic_data->min_rtt_nsecs ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)) {
			cubic_data->min_rtt_nsecs = max(1, t_srtt_nsecs);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_nsecs with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_nsecs >
			    cubic_data->mean_rtt_nsecs)
				cubic_data->mean_rtt_nsecs =
				    cubic_data->min_rtt_nsecs;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_nsecs += t_srtt_nsecs;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5; on
	 * subsequent congestion events, set it to cwnd * beta.
	 */
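	/*
	 * For example, with a cwnd of 100 MSS, the first congestion event
	 * drops ssthresh to 50 MSS; later events drop it to beta * 100 MSS,
	 * where beta is the fixed-point decrease factor encoded in
	 * CUBIC_BETA (roughly 0.7-0.8, depending on which revision of the
	 * draft cc_cubic.h follows).
	 */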
	if (cubic_data->num_cong_events == 0)
		CCV(ccv, tcp_cwnd_ssthresh) = CCV(ccv, tcp_cwnd) >> 1;
	else
		CCV(ccv, tcp_cwnd_ssthresh) =
		    (CCV(ccv, tcp_cwnd) * CUBIC_BETA) >> CUBIC_SHIFT;
}