/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley. More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/vnet.h>

#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_cubic.h>
#include <netinet/cc/cc_module.h>

static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void	cubic_cb_destroy(struct cc_var *ccv);
static int	cubic_cb_init(struct cc_var *ccv);
static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void	cubic_conn_init(struct cc_var *ccv);
static int	cubic_mod_init(void);
static void	cubic_post_recovery(struct cc_var *ccv);
static void	cubic_record_rtt(struct cc_var *ccv);
static void	cubic_ssthresh_update(struct cc_var *ccv);

struct cubic {
	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in ticks. */
	int64_t		sum_rtt_ticks;
	/* cwnd at the most recent congestion event. */
	unsigned long	max_cwnd;
	/* cwnd at the previous congestion event. */
	unsigned long	prev_max_cwnd;
	/* Cached value for t_maxseg when K was computed. */
	uint32_t	k_maxseg;
	/* Number of congestion events. */
	uint32_t	num_cong_events;
	/* Minimum observed rtt in ticks. */
	int		min_rtt_ticks;
	/* Mean observed rtt between congestion epochs. */
	int		mean_rtt_ticks;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Time of last congestion event in ticks. */
	int		t_last_cong;
};

static MALLOC_DEFINE(M_CUBIC, "cubic data",
    "Per connection data required for the CUBIC congestion control algorithm");

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.mod_init = cubic_mod_init,
	.post_recovery = cubic_post_recovery,
};

static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	unsigned long w_tf, w_cubic_next;
	int ticks_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	if (ccv->flags & CCF_MAX_CWND)
		return;

	/*
	 * Regular ACK and we're not in cong/fast recovery and we're cwnd
	 * limited and we're either not doing ABC or are slow starting or are
	 * doing ABC and we've sent a cwnd's worth of bytes.
	 */
	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!V_tcp_do_rfc3465 ||
	    CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
	    (V_tcp_do_rfc3465 && ccv->flags & CCF_ABC_SENTAWND))) {
		/* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE)
			newreno_cc_algo.ack_received(ccv, type);
		else {
			ticks_since_cong = ticks - cubic_data->t_last_cong;
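
			/*
			 * For reference, a non-normative summary of the
			 * window functions from the I-D that the fixed-point
			 * helpers tf_cwnd(), cubic_cwnd() and cubic_k() in
			 * cc_cubic.h approximate (W is in segments; beta is
			 * the factor the window is reduced to after a loss):
			 *
			 *   W_cubic(t) = C * (t - K)^3 + W_max
			 *   K          = cbrt(W_max * (1 - beta) / C)
			 *   W_tf(t)    = W_max * beta +
			 *                (3 * (1 - beta) / (1 + beta)) * t / RTT
			 */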
			/*
			 * The mean RTT is used to best reflect the equations in
			 * the I-D. Using min_rtt in the tf_cwnd calculation
			 * causes w_tf to grow much faster than it should if the
			 * RTT is dominated by network buffering rather than
			 * propagation delay.
			 */
			w_tf = tf_cwnd(ticks_since_cong,
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg));

			if (ccv->flags & CCF_CHG_MAX_CWND || cubic_data->k_maxseg != CCV(ccv, t_maxseg)) {
				cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg));
				cubic_data->k_maxseg = CCV(ccv, t_maxseg);
				ccv->flags &= ~(CCF_MAX_CWND|CCF_CHG_MAX_CWND);
			}

			w_cubic_next = cubic_cwnd(ticks_since_cong +
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

			if (w_cubic_next < w_tf)
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				CCV(ccv, snd_cwnd) = ulmin(w_tf, TCP_MAXWIN << CCV(ccv, snd_scale));

			else if (CCV(ccv, snd_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 */
				if (w_cubic_next >= TCP_MAXWIN << CCV(ccv, snd_scale)) {
					w_cubic_next = TCP_MAXWIN << CCV(ccv, snd_scale);
					ccv->flags |= CCF_MAX_CWND;
				}
				w_cubic_next = ulmin(w_cubic_next, TCP_MAXWIN << CCV(ccv, snd_scale));
				if (V_tcp_do_rfc3465)
					CCV(ccv, snd_cwnd) = w_cubic_next;
				else
					CCV(ccv, snd_cwnd) += ((w_cubic_next -
					    CCV(ccv, snd_cwnd)) *
					    CCV(ccv, t_maxseg)) /
					    CCV(ccv, snd_cwnd);
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when hostcache has a relevant entry),
			 * keep updating our current estimate of the
			 * max_cwnd.
			 */
			if (cubic_data->num_cong_events == 0 &&
			    cubic_data->max_cwnd < CCV(ccv, snd_cwnd)) {
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
				ccv->flags |= CCF_CHG_MAX_CWND;
			}
		}
	}
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{
	free(ccv->cc_data, M_CUBIC);
}

static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = malloc(sizeof(struct cubic), M_CUBIC, M_NOWAIT|M_ZERO);

	if (cubic_data == NULL)
		return (ENOMEM);

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = ticks;
	cubic_data->min_rtt_ticks = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_ticks = 1;

	ccv->cc_data = cubic_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
				cubic_ssthresh_update(ccv);
				cubic_data->num_cong_events++;
				cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
				ccv->flags |= CCF_CHG_MAX_CWND;
			}
			ENTER_RECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
			cubic_ssthresh_update(ccv);
			cubic_data->num_cong_events++;
			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
			cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
			cubic_data->t_last_cong = ticks;
			ccv->flags |= CCF_CHG_MAX_CWND;
			ccv->flags &= ~CCF_MAX_CWND;
			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_RTO:
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was. Only record it when the
		 * timeout has fired more than once, as there is a reasonable
		 * chance the first one is a false alarm and may not indicate
		 * congestion.
		 */
		if (CCV(ccv, t_rxtshift) >= 2) {
			cubic_data->num_cong_events++;
			cubic_data->t_last_cong = ticks;
			ccv->flags &= ~CCF_MAX_CWND;
		}
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded. Without
	 * this here bad things happen when entries from the TCP hostcache
	 * get used.
	 */
	cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
	ccv->flags |= CCF_CHG_MAX_CWND;
}

static int
cubic_mod_init(void)
{

	cubic_cc_algo.after_idle = newreno_cc_algo.after_idle;

	return (0);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int pipe;

	cubic_data = ccv->cc_data;
	pipe = 0;

	/* Fast convergence heuristic. */
	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) {
		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
		    >> CUBIC_SHIFT;
		ccv->flags |= CCF_CHG_MAX_CWND;
	}

	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in
		 * the NewReno RFC. Otherwise, use the CUBIC method.
		 *
		 * XXXLAS: Find a way to do this without needing curack
		 */
		if (V_tcp_do_rfc6675_pipe)
			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
		else
			pipe = CCV(ccv, snd_max) - ccv->curack;

		if (pipe < CCV(ccv, snd_ssthresh))
			CCV(ccv, snd_cwnd) = pipe + CCV(ccv, t_maxseg);
		else
			/* Update cwnd based on beta and adjusted max_cwnd. */
			CCV(ccv, snd_cwnd) = max(1, ((CUBIC_BETA *
			    cubic_data->max_cwnd) >> CUBIC_SHIFT));
	}
	cubic_data->t_last_cong = ticks;
	ccv->flags &= ~CCF_MAX_CWND;

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_ticks >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_ticks = (int)(cubic_data->sum_rtt_ticks /
		    cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_ticks = 0;
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_ticks;

	/* Ignore srtt until a min number of samples have been taken. */
	if (CCV(ccv, t_rttupdated) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		t_srtt_ticks = CCV(ccv, t_srtt) / TCP_RTT_SCALE;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialised
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_ticks < cubic_data->min_rtt_ticks ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE)) {
			cubic_data->min_rtt_ticks = max(1, t_srtt_ticks);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_ticks with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_ticks >
			    cubic_data->mean_rtt_ticks)
				cubic_data->mean_rtt_ticks =
				    cubic_data->min_rtt_ticks;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_ticks += t_srtt_ticks;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5, on
	 * subsequent congestion events, set it to cwnd * beta.
	 */
	if (cubic_data->num_cong_events == 0)
		CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd) >> 1;
	else
		CCV(ccv, snd_ssthresh) = ((u_long)CCV(ccv, snd_cwnd) *
		    CUBIC_BETA) >> CUBIC_SHIFT;
}


DECLARE_CC_MODULE(cubic, &cubic_cc_algo);
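
/*
 * Example usage (a sketch; this assumes the algorithm is built as the
 * cc_cubic kernel module and that the standard net.inet.tcp.cc sysctl
 * nodes are available in the running kernel):
 *
 *   kldload cc_cubic
 *   sysctl net.inet.tcp.cc.algorithm=cubic
 */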