/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley.
 * More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/vnet.h>

#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_cubic.h>
#include <netinet/cc/cc_module.h>

static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void	cubic_cb_destroy(struct cc_var *ccv);
static int	cubic_cb_init(struct cc_var *ccv);
static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void	cubic_conn_init(struct cc_var *ccv);
static int	cubic_mod_init(void);
static void	cubic_post_recovery(struct cc_var *ccv);
static void	cubic_record_rtt(struct cc_var *ccv);
static void	cubic_ssthresh_update(struct cc_var *ccv);

struct cubic {
	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in ticks. */
	int64_t		sum_rtt_ticks;
	/* cwnd at the most recent congestion event. */
	unsigned long	max_cwnd;
	/* cwnd at the previous congestion event. */
	unsigned long	prev_max_cwnd;
	/* Cached value of t_maxseg from when K was last computed. */
	uint32_t	k_maxseg;
	/* Number of congestion events. */
	uint32_t	num_cong_events;
	/* Minimum observed rtt in ticks. */
	int		min_rtt_ticks;
	/* Mean observed rtt between congestion epochs. */
	int		mean_rtt_ticks;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Time of last congestion event in ticks. */
	int		t_last_cong;
};

static MALLOC_DEFINE(M_CUBIC, "cubic data",
    "Per connection data required for the CUBIC congestion control algorithm");

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.mod_init = cubic_mod_init,
	.post_recovery = cubic_post_recovery,
};

static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	unsigned long w_tf, w_cubic_next;
	int ticks_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	if (ccv->flags & CCF_MAX_CWND)
		return;

	/*
	 * Regular ACK and we're not in cong/fast recovery and we're cwnd
	 * limited and we're either not doing ABC or are slow starting or are
	 * doing ABC and we've sent a cwnd's worth of bytes.
	 */
	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!V_tcp_do_rfc3465 ||
	    CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
	    (V_tcp_do_rfc3465 && (ccv->flags & CCF_ABC_SENTAWND)))) {
		/* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE)
			newreno_cc_algo.ack_received(ccv, type);
		else {
			ticks_since_cong = ticks - cubic_data->t_last_cong;

			/*
			 * The mean RTT is used to best reflect the equations
			 * in the I-D. Using min_rtt in the tf_cwnd calculation
			 * causes w_tf to grow much faster than it should if
			 * the RTT is dominated by network buffering rather
			 * than propagation delay.
			 */
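			/*
			 * For reference, a sketch of the I-D's window
			 * equations as implemented by the helpers in
			 * cc_cubic.h (beta and C being the fixed-point
			 * constants CUBIC_BETA and CUBIC_C_FACTOR):
			 *
			 *   w_cubic(t) = C * (t - K)^3 + max_cwnd
			 *   w_tf(t)    = max_cwnd * beta +
			 *       3 * ((1 - beta) / (1 + beta)) * t / RTT
			 *
			 * where t is the time since the last congestion
			 * event and K is the period the cubic function
			 * takes to grow back to max_cwnd.
			 */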
			w_tf = tf_cwnd(ticks_since_cong,
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg));

			/*
			 * K depends on max_cwnd and t_maxseg; recompute it
			 * if either has changed since it was last cached.
			 */
			if ((ccv->flags & CCF_CHG_MAX_CWND) ||
			    cubic_data->k_maxseg != CCV(ccv, t_maxseg)) {
				cubic_data->K = cubic_k(cubic_data->max_cwnd /
				    CCV(ccv, t_maxseg));
				cubic_data->k_maxseg = CCV(ccv, t_maxseg);
				ccv->flags &= ~(CCF_MAX_CWND|CCF_CHG_MAX_CWND);
			}

			w_cubic_next = cubic_cwnd(ticks_since_cong +
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

			if (w_cubic_next < w_tf)
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				CCV(ccv, snd_cwnd) = ulmin(w_tf,
				    TCP_MAXWIN << CCV(ccv, snd_scale));
			else if (CCV(ccv, snd_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 */
				if (w_cubic_next >=
				    TCP_MAXWIN << CCV(ccv, snd_scale)) {
					w_cubic_next =
					    TCP_MAXWIN << CCV(ccv, snd_scale);
					ccv->flags |= CCF_MAX_CWND;
				}
				if (V_tcp_do_rfc3465)
					CCV(ccv, snd_cwnd) = w_cubic_next;
				else
					CCV(ccv, snd_cwnd) += ((w_cubic_next -
					    CCV(ccv, snd_cwnd)) *
					    CCV(ccv, t_maxseg)) /
					    CCV(ccv, snd_cwnd);
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when hostcache has a relevant entry),
			 * keep updating our current estimate of the
			 * max_cwnd.
			 */
			if (cubic_data->num_cong_events == 0 &&
			    cubic_data->max_cwnd < CCV(ccv, snd_cwnd)) {
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
				ccv->flags |= CCF_CHG_MAX_CWND;
			}
		}
	}
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{

	if (ccv->cc_data != NULL)
		free(ccv->cc_data, M_CUBIC);
}

static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = malloc(sizeof(struct cubic), M_CUBIC, M_NOWAIT|M_ZERO);

	if (cubic_data == NULL)
		return (ENOMEM);

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = ticks;
	cubic_data->min_rtt_ticks = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_ticks = 1;

	ccv->cc_data = cubic_data;

	return (0);
}
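
/*
 * A note on cubic_cb_init()'s defaults: t_last_cong starts the first
 * congestion epoch "now"; min_rtt_ticks == TCPTV_SRTTBASE is treated
 * throughout this file as "no RTT sample seen yet"; and mean_rtt_ticks is
 * primed to a non-zero value so that it is always safe to use as a divisor
 * in tf_cwnd().
 */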

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
				cubic_ssthresh_update(ccv);
				cubic_data->num_cong_events++;
				cubic_data->prev_max_cwnd =
				    cubic_data->max_cwnd;
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
				ccv->flags |= CCF_CHG_MAX_CWND;
			}
			ENTER_RECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
			cubic_ssthresh_update(ccv);
			cubic_data->num_cong_events++;
			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
			cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
			cubic_data->t_last_cong = ticks;
			ccv->flags |= CCF_CHG_MAX_CWND;
			ccv->flags &= ~CCF_MAX_CWND;
			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_RTO:
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was. Only record it when the
		 * timeout has fired more than once, as there is a reasonable
		 * chance the first one is a false alarm and may not indicate
		 * congestion.
		 */
		if (CCV(ccv, t_rxtshift) >= 2) {
			cubic_data->num_cong_events++;
			cubic_data->t_last_cong = ticks;
			ccv->flags &= ~CCF_MAX_CWND;
		}
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded. Without
	 * it, bad things happen when entries from the TCP hostcache get used.
	 */
	cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
	ccv->flags |= CCF_CHG_MAX_CWND;
}

static int
cubic_mod_init(void)
{

	cubic_cc_algo.after_idle = newreno_cc_algo.after_idle;

	return (0);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int pipe;

	cubic_data = ccv->cc_data;
	pipe = 0;

	/* Fast convergence heuristic. */
	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) {
		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
		    >> CUBIC_SHIFT;
		ccv->flags |= CCF_CHG_MAX_CWND;
	}

	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in
		 * the NewReno RFC. Otherwise, use the CUBIC method.
		 *
		 * XXXLAS: Find a way to do this without needing curack
		 */
		if (V_tcp_do_rfc6675_pipe)
			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
		else
			pipe = CCV(ccv, snd_max) - ccv->curack;

		if (pipe < CCV(ccv, snd_ssthresh))
			CCV(ccv, snd_cwnd) = pipe + CCV(ccv, t_maxseg);
		else
			/* Update cwnd based on beta and adjusted max_cwnd. */
			CCV(ccv, snd_cwnd) = max(1, ((CUBIC_BETA *
			    cubic_data->max_cwnd) >> CUBIC_SHIFT));
	}
	cubic_data->t_last_cong = ticks;
	ccv->flags &= ~CCF_MAX_CWND;
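
	/*
	 * The guard on the computation below is worth noting: requiring
	 * sum_rtt_ticks >= epoch_ack_count ensures the integer division
	 * yields a mean of at least 1 tick, so mean_rtt_ticks never decays
	 * to 0 between epochs.
	 */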

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_ticks >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_ticks = (int)(cubic_data->sum_rtt_ticks /
		    cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_ticks = 0;
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_ticks;

	/* Ignore srtt until a min number of samples have been taken. */
	if (CCV(ccv, t_rttupdated) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		t_srtt_ticks = CCV(ccv, t_srtt) / TCP_RTT_SCALE;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialised
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if (t_srtt_ticks < cubic_data->min_rtt_ticks ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE) {
			cubic_data->min_rtt_ticks = max(1, t_srtt_ticks);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_ticks with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_ticks >
			    cubic_data->mean_rtt_ticks)
				cubic_data->mean_rtt_ticks =
				    cubic_data->min_rtt_ticks;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_ticks += t_srtt_ticks;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5; on
	 * subsequent congestion events, set it to cwnd * beta.
	 */
	if (cubic_data->num_cong_events == 0)
		CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd) >> 1;
	else
		CCV(ccv, snd_ssthresh) = ((u_long)CCV(ccv, snd_cwnd) *
		    CUBIC_BETA) >> CUBIC_SHIFT;
}

DECLARE_CC_MODULE(cubic, &cubic_cc_algo);
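
/*
 * Usage sketch: once compiled as cc_cubic.ko and loaded, the algorithm can
 * be selected system-wide, e.g.:
 *
 *   kldload cc_cubic
 *   sysctl net.inet.tcp.cc.algorithm=cubic
 *
 * The currently loaded algorithms are listed by the read-only sysctl
 * net.inet.tcp.cc.available.
 */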