/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley. More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/vnet.h>

#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_cubic.h>
#include <netinet/cc/cc_module.h>

static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void	cubic_cb_destroy(struct cc_var *ccv);
static int	cubic_cb_init(struct cc_var *ccv);
static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void	cubic_conn_init(struct cc_var *ccv);
static int	cubic_mod_init(void);
static void	cubic_post_recovery(struct cc_var *ccv);
static void	cubic_record_rtt(struct cc_var *ccv);
static void	cubic_ssthresh_update(struct cc_var *ccv);

struct cubic {
	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in ticks. */
	int64_t		sum_rtt_ticks;
	/* cwnd at the most recent congestion event. */
	unsigned long	max_cwnd;
	/* cwnd at the previous congestion event. */
	unsigned long	prev_max_cwnd;
	/* Number of congestion events. */
	uint32_t	num_cong_events;
	/* Minimum observed rtt in ticks. */
	int		min_rtt_ticks;
	/* Mean observed rtt between congestion epochs. */
	int		mean_rtt_ticks;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Time of last congestion event in ticks. */
	int		t_last_cong;
};

static MALLOC_DEFINE(M_CUBIC, "cubic data",
    "Per connection data required for the CUBIC congestion control algorithm");

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.mod_init = cubic_mod_init,
	.post_recovery = cubic_post_recovery,
};
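/*
 * Sketch of the growth model implemented by cubic_ack_received() below,
 * following the I-D's formulation (the decimal constants here are the
 * draft's suggested values; the authoritative fixed point encodings live in
 * cc_cubic.h):
 *
 *	W_cubic(t) = C * (t - K)^3 + W_max
 *	K          = cbrt(W_max * (1 - beta) / C)
 *
 * where W_max is the cwnd at the last congestion event, t is the time since
 * that event, beta (~0.8) is the multiplicative decrease factor applied at
 * the event and C (~0.4) scales the regrowth rate. On each ACK the cubic
 * window is compared with a TCP-friendly (NewReno-equivalent) estimate, and
 * cwnd tracks the larger of the two so CUBIC never grows more slowly than
 * standard TCP.
 */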
static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	unsigned long w_tf, w_cubic_next;
	int ticks_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	/*
	 * Regular ACK and we're not in cong/fast recovery and we're cwnd
	 * limited and we're either not doing ABC or are slow starting or are
	 * doing ABC and we've sent a cwnd's worth of bytes.
	 */
	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!V_tcp_do_rfc3465 ||
	    CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
	    (V_tcp_do_rfc3465 && ccv->flags & CCF_ABC_SENTAWND))) {
		/* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE)
			newreno_cc_algo.ack_received(ccv, type);
		else {
			ticks_since_cong = ticks - cubic_data->t_last_cong;

			/*
			 * The mean RTT is used to best reflect the equations
			 * in the I-D. Using min_rtt in the tf_cwnd
			 * calculation causes w_tf to grow much faster than it
			 * should if the RTT is dominated by network buffering
			 * rather than propagation delay.
			 */
			w_tf = tf_cwnd(ticks_since_cong,
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg));

			w_cubic_next = cubic_cwnd(ticks_since_cong +
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

			if (w_cubic_next < w_tf)
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				CCV(ccv, snd_cwnd) = w_tf;
			else if (CCV(ccv, snd_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 */
				if (V_tcp_do_rfc3465)
					CCV(ccv, snd_cwnd) = w_cubic_next;
				else
					CCV(ccv, snd_cwnd) += ((w_cubic_next -
					    CCV(ccv, snd_cwnd)) *
					    CCV(ccv, t_maxseg)) /
					    CCV(ccv, snd_cwnd);
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when hostcache has a relevant entry),
			 * keep updating our current estimate of the
			 * max_cwnd.
			 */
			if (cubic_data->num_cong_events == 0 &&
			    cubic_data->max_cwnd < CCV(ccv, snd_cwnd))
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
		}
	}
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{

	if (ccv->cc_data != NULL)
		free(ccv->cc_data, M_CUBIC);
}

static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = malloc(sizeof(struct cubic), M_CUBIC, M_NOWAIT|M_ZERO);

	if (cubic_data == NULL)
		return (ENOMEM);

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = ticks;
	cubic_data->min_rtt_ticks = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_ticks = 1;

	ccv->cc_data = cubic_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
				cubic_ssthresh_update(ccv);
				cubic_data->num_cong_events++;
				cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
			}
			ENTER_RECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
			cubic_ssthresh_update(ccv);
			cubic_data->num_cong_events++;
			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
			cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
			cubic_data->t_last_cong = ticks;
			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_RTO:
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was. Only record it when the
		 * timeout has fired more than once, as there is a reasonable
		 * chance the first one is a false alarm and may not indicate
		 * congestion.
		 */
		if (CCV(ccv, t_rxtshift) >= 2) {
			cubic_data->num_cong_events++;
			cubic_data->t_last_cong = ticks;
		}
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded. Without
	 * this here bad things happen when entries from the TCP hostcache
	 * get used.
	 */
	cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
}
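/*
 * CUBIC provides no after_idle behaviour of its own, so module
 * initialisation simply borrows NewReno's handler for that hook.
 */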
static int
cubic_mod_init(void)
{

	cubic_cc_algo.after_idle = newreno_cc_algo.after_idle;

	return (0);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int pipe;

	cubic_data = ccv->cc_data;
	pipe = 0;

	/* Fast convergence heuristic. */
	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd)
		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
		    >> CUBIC_SHIFT;

	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in
		 * the NewReno RFC. Otherwise, use the CUBIC method.
		 *
		 * XXXLAS: Find a way to do this without needing curack
		 */
		if (V_tcp_do_rfc6675_pipe)
			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
		else
			pipe = CCV(ccv, snd_max) - ccv->curack;

		if (pipe < CCV(ccv, snd_ssthresh))
			CCV(ccv, snd_cwnd) = pipe + CCV(ccv, t_maxseg);
		else
			/* Update cwnd based on beta and adjusted max_cwnd. */
			CCV(ccv, snd_cwnd) = max(1, ((CUBIC_BETA *
			    cubic_data->max_cwnd) >> CUBIC_SHIFT));
	}
	cubic_data->t_last_cong = ticks;

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_ticks >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_ticks = (int)(cubic_data->sum_rtt_ticks /
		    cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_ticks = 0;
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg));
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_ticks;

	/* Ignore srtt until a min number of samples have been taken. */
	if (CCV(ccv, t_rttupdated) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		t_srtt_ticks = CCV(ccv, t_srtt) / TCP_RTT_SCALE;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialised
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_ticks < cubic_data->min_rtt_ticks ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE)) {
			cubic_data->min_rtt_ticks = max(1, t_srtt_ticks);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_ticks with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_ticks >
			    cubic_data->mean_rtt_ticks)
				cubic_data->mean_rtt_ticks =
				    cubic_data->min_rtt_ticks;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_ticks += t_srtt_ticks;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5, on
	 * subsequent congestion events, set it to cwnd * beta.
	 */
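	/*
	 * Fixed point sketch, assuming the cc_cubic.h encodings (CUBIC_SHIFT
	 * of 8, CUBIC_BETA of ~0.8 << CUBIC_SHIFT): for a 100000 byte cwnd,
	 * (100000 * CUBIC_BETA) >> CUBIC_SHIFT yields roughly 80000 bytes,
	 * versus 50000 bytes for the first-event cwnd >> 1 case below.
	 */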
	if (cubic_data->num_cong_events == 0)
		CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd) >> 1;
	else
		CCV(ccv, snd_ssthresh) = ((u_long)CCV(ccv, snd_cwnd) *
		    CUBIC_BETA) >> CUBIC_SHIFT;
}

DECLARE_CC_MODULE(cubic, &cubic_cc_algo);
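/*
 * Usage note (illustrative): the module is typically loaded with
 * "kldload cc_cubic" and made the active algorithm via the
 * net.inet.tcp.cc.algorithm sysctl, e.g.
 * "sysctl net.inet.tcp.cc.algorithm=cubic".
 */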