1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2009-2013 5 * Swinburne University of Technology, Melbourne, Australia 6 * All rights reserved. 7 * 8 * This software was developed at the Centre for Advanced Internet 9 * Architectures, Swinburne University of Technology, by David Hayes, made 10 * possible in part by a gift from The Cisco University Research Program Fund, 11 * a corporate advised fund of Silicon Valley Community Foundation. Development 12 * and testing were further assisted by a grant from the FreeBSD Foundation. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 /* 37 * CAIA Delay-Gradient (CDG) congestion control algorithm 38 * 39 * An implemention of the delay-gradient congestion control algorithm proposed 40 * in the following paper: 41 * 42 * D. A. Hayes and G. Armitage, "Revisiting TCP Congestion Control using Delay 43 * Gradients", in IFIP Networking, Valencia, Spain, 9-13 May 2011. 44 * 45 * Developed as part of the NewTCP research project at Swinburne University of 46 * Technology's Centre for Advanced Internet Architectures, Melbourne, 47 * Australia. More details are available at: 48 * http://caia.swin.edu.au/urp/newtcp/ 49 */ 50 51 #include <sys/cdefs.h> 52 __FBSDID("$FreeBSD$"); 53 54 #include <sys/param.h> 55 #include <sys/hhook.h> 56 #include <sys/kernel.h> 57 #include <sys/khelp.h> 58 #include <sys/limits.h> 59 #include <sys/lock.h> 60 #include <sys/malloc.h> 61 #include <sys/module.h> 62 #include <sys/queue.h> 63 #include <sys/socket.h> 64 #include <sys/socketvar.h> 65 #include <sys/sysctl.h> 66 #include <sys/systm.h> 67 68 #include <net/vnet.h> 69 70 #include <net/route.h> 71 #include <net/route/nhop.h> 72 73 #include <netinet/in_pcb.h> 74 #include <netinet/tcp.h> 75 #include <netinet/tcp_seq.h> 76 #include <netinet/tcp_timer.h> 77 #include <netinet/tcp_var.h> 78 #include <netinet/cc/cc.h> 79 #include <netinet/cc/cc_module.h> 80 81 #include <netinet/khelp/h_ertt.h> 82 83 #include <vm/uma.h> 84 85 #define CDG_VERSION "0.1" 86 87 /* Private delay-gradient induced congestion control signal. */ 88 #define CC_CDG_DELAY 0x01000000 89 90 /* NewReno window deflation factor on loss (as a percentage). */ 91 #define RENO_BETA 50 92 93 /* Queue states. */ 94 #define CDG_Q_EMPTY 1 95 #define CDG_Q_RISING 2 96 #define CDG_Q_FALLING 3 97 #define CDG_Q_FULL 4 98 #define CDG_Q_UNKNOWN 9999 99 100 /* Number of bit shifts used in probexp lookup table. */ 101 #define EXP_PREC 15 102 103 /* Largest gradient represented in probexp lookup table. */ 104 #define MAXGRAD 5 105 106 /* 107 * Delay Precision Enhance - number of bit shifts used for qtrend related 108 * integer arithmetic precision. 109 */ 110 #define D_P_E 7 111 112 struct qdiff_sample { 113 long qdiff; 114 STAILQ_ENTRY(qdiff_sample) qdiff_lnk; 115 }; 116 117 struct cdg { 118 long max_qtrend; 119 long min_qtrend; 120 STAILQ_HEAD(minrtts_head, qdiff_sample) qdiffmin_q; 121 STAILQ_HEAD(maxrtts_head, qdiff_sample) qdiffmax_q; 122 long window_incr; 123 /* rttcount for window increase when in congestion avoidance */ 124 long rtt_count; 125 /* maximum measured rtt within an rtt period */ 126 int maxrtt_in_rtt; 127 /* maximum measured rtt within prev rtt period */ 128 int maxrtt_in_prevrtt; 129 /* minimum measured rtt within an rtt period */ 130 int minrtt_in_rtt; 131 /* minimum measured rtt within prev rtt period */ 132 int minrtt_in_prevrtt; 133 /* consecutive congestion episode counter */ 134 uint32_t consec_cong_cnt; 135 /* when tracking a new reno type loss window */ 136 uint32_t shadow_w; 137 /* maximum number of samples in the moving average queue */ 138 int sample_q_size; 139 /* number of samples in the moving average queue */ 140 int num_samples; 141 /* estimate of the queue state of the path */ 142 int queue_state; 143 }; 144 145 /* 146 * Lookup table for: 147 * (1 - exp(-x)) << EXP_PREC, where x = [0,MAXGRAD] in 2^-7 increments 148 * 149 * Note: probexp[0] is set to 10 (not 0) as a safety for very low increase 150 * gradients. 151 */ 152 static const int probexp[641] = { 153 10,255,508,759,1008,1255,1501,1744,1985,2225,2463,2698,2932,3165,3395,3624, 154 3850,4075,4299,4520,4740,4958,5175,5389,5602,5814,6024,6232,6438,6643,6846, 155 7048,7248,7447,7644,7839,8033,8226,8417,8606,8794,8981,9166,9350,9532,9713, 156 9892,10070,10247,10422,10596,10769,10940,11110,11278,11445,11611,11776,11939, 157 12101,12262,12422,12580,12737,12893,13048,13201,13354,13505,13655,13803,13951, 158 14097,14243,14387,14530,14672,14813,14952,15091,15229,15365,15500,15635,15768, 159 15900,16032,16162,16291,16419,16547,16673,16798,16922,17046,17168,17289,17410, 160 17529,17648,17766,17882,17998,18113,18227,18340,18453,18564,18675,18784,18893, 161 19001,19108,19215,19320,19425,19529,19632,19734,19835,19936,20036,20135,20233, 162 20331,20427,20523,20619,20713,20807,20900,20993,21084,21175,21265,21355,21444, 163 21532,21619,21706,21792,21878,21962,22046,22130,22213,22295,22376,22457,22537, 164 22617,22696,22774,22852,22929,23006,23082,23157,23232,23306,23380,23453,23525, 165 23597,23669,23739,23810,23879,23949,24017,24085,24153,24220,24286,24352,24418, 166 24483,24547,24611,24675,24738,24800,24862,24924,24985,25045,25106,25165,25224, 167 25283,25341,25399,25456,25513,25570,25626,25681,25737,25791,25846,25899,25953, 168 26006,26059,26111,26163,26214,26265,26316,26366,26416,26465,26514,26563,26611, 169 26659,26707,26754,26801,26847,26893,26939,26984,27029,27074,27118,27162,27206, 170 27249,27292,27335,27377,27419,27460,27502,27543,27583,27624,27664,27703,27743, 171 27782,27821,27859,27897,27935,27973,28010,28047,28084,28121,28157,28193,28228, 172 28263,28299,28333,28368,28402,28436,28470,28503,28536,28569,28602,28634,28667, 173 28699,28730,28762,28793,28824,28854,28885,28915,28945,28975,29004,29034,29063, 174 29092,29120,29149,29177,29205,29232,29260,29287,29314,29341,29368,29394,29421, 175 29447,29472,29498,29524,29549,29574,29599,29623,29648,29672,29696,29720,29744, 176 29767,29791,29814,29837,29860,29882,29905,29927,29949,29971,29993,30014,30036, 177 30057,30078,30099,30120,30141,30161,30181,30201,30221,30241,30261,30280,30300, 178 30319,30338,30357,30376,30394,30413,30431,30449,30467,30485,30503,30521,30538, 179 30555,30573,30590,30607,30624,30640,30657,30673,30690,30706,30722,30738,30753, 180 30769,30785,30800,30815,30831,30846,30861,30876,30890,30905,30919,30934,30948, 181 30962,30976,30990,31004,31018,31031,31045,31058,31072,31085,31098,31111,31124, 182 31137,31149,31162,31174,31187,31199,31211,31223,31235,31247,31259,31271,31283, 183 31294,31306,31317,31328,31339,31351,31362,31373,31383,31394,31405,31416,31426, 184 31436,31447,31457,31467,31477,31487,31497,31507,31517,31527,31537,31546,31556, 185 31565,31574,31584,31593,31602,31611,31620,31629,31638,31647,31655,31664,31673, 186 31681,31690,31698,31706,31715,31723,31731,31739,31747,31755,31763,31771,31778, 187 31786,31794,31801,31809,31816,31824,31831,31838,31846,31853,31860,31867,31874, 188 31881,31888,31895,31902,31908,31915,31922,31928,31935,31941,31948,31954,31960, 189 31967,31973,31979,31985,31991,31997,32003,32009,32015,32021,32027,32033,32038, 190 32044,32050,32055,32061,32066,32072,32077,32083,32088,32093,32098,32104,32109, 191 32114,32119,32124,32129,32134,32139,32144,32149,32154,32158,32163,32168,32173, 192 32177,32182,32186,32191,32195,32200,32204,32209,32213,32217,32222,32226,32230, 193 32234,32238,32242,32247,32251,32255,32259,32263,32267,32270,32274,32278,32282, 194 32286,32290,32293,32297,32301,32304,32308,32311,32315,32318,32322,32325,32329, 195 32332,32336,32339,32342,32346,32349,32352,32356,32359,32362,32365,32368,32371, 196 32374,32377,32381,32384,32387,32389,32392,32395,32398,32401,32404,32407,32410, 197 32412,32415,32418,32421,32423,32426,32429,32431,32434,32437,32439,32442,32444, 198 32447,32449,32452,32454,32457,32459,32461,32464,32466,32469,32471,32473,32476, 199 32478,32480,32482,32485,32487,32489,32491,32493,32495,32497,32500,32502,32504, 200 32506,32508,32510,32512,32514,32516,32518,32520,32522,32524,32526,32527,32529, 201 32531,32533,32535,32537,32538,32540,32542,32544,32545,32547}; 202 203 static uma_zone_t qdiffsample_zone; 204 static int ertt_id; 205 206 VNET_DEFINE_STATIC(uint32_t, cdg_alpha_inc); 207 VNET_DEFINE_STATIC(uint32_t, cdg_beta_delay); 208 VNET_DEFINE_STATIC(uint32_t, cdg_beta_loss); 209 VNET_DEFINE_STATIC(uint32_t, cdg_smoothing_factor); 210 VNET_DEFINE_STATIC(uint32_t, cdg_exp_backoff_scale); 211 VNET_DEFINE_STATIC(uint32_t, cdg_consec_cong); 212 VNET_DEFINE_STATIC(uint32_t, cdg_hold_backoff); 213 #define V_cdg_alpha_inc VNET(cdg_alpha_inc) 214 #define V_cdg_beta_delay VNET(cdg_beta_delay) 215 #define V_cdg_beta_loss VNET(cdg_beta_loss) 216 #define V_cdg_smoothing_factor VNET(cdg_smoothing_factor) 217 #define V_cdg_exp_backoff_scale VNET(cdg_exp_backoff_scale) 218 #define V_cdg_consec_cong VNET(cdg_consec_cong) 219 #define V_cdg_hold_backoff VNET(cdg_hold_backoff) 220 221 /* Function prototypes. */ 222 static int cdg_mod_init(void); 223 static int cdg_mod_destroy(void); 224 static void cdg_conn_init(struct cc_var *ccv); 225 static int cdg_cb_init(struct cc_var *ccv, void *ptr); 226 static void cdg_cb_destroy(struct cc_var *ccv); 227 static void cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type); 228 static void cdg_ack_received(struct cc_var *ccv, uint16_t ack_type); 229 static size_t cdg_data_sz(void); 230 231 struct cc_algo cdg_cc_algo = { 232 .name = "cdg", 233 .mod_init = cdg_mod_init, 234 .ack_received = cdg_ack_received, 235 .cb_destroy = cdg_cb_destroy, 236 .cb_init = cdg_cb_init, 237 .conn_init = cdg_conn_init, 238 .cong_signal = cdg_cong_signal, 239 .mod_destroy = cdg_mod_destroy, 240 .cc_data_sz = cdg_data_sz, 241 .post_recovery = newreno_cc_post_recovery, 242 .after_idle = newreno_cc_after_idle, 243 }; 244 245 /* Vnet created and being initialised. */ 246 static void 247 cdg_init_vnet(const void *unused __unused) 248 { 249 250 V_cdg_alpha_inc = 0; 251 V_cdg_beta_delay = 70; 252 V_cdg_beta_loss = 50; 253 V_cdg_smoothing_factor = 8; 254 V_cdg_exp_backoff_scale = 3; 255 V_cdg_consec_cong = 5; 256 V_cdg_hold_backoff = 5; 257 } 258 259 static int 260 cdg_mod_init(void) 261 { 262 VNET_ITERATOR_DECL(v); 263 264 ertt_id = khelp_get_id("ertt"); 265 if (ertt_id <= 0) 266 return (EINVAL); 267 268 qdiffsample_zone = uma_zcreate("cdg_qdiffsample", 269 sizeof(struct qdiff_sample), NULL, NULL, NULL, NULL, 0, 0); 270 271 VNET_LIST_RLOCK(); 272 VNET_FOREACH(v) { 273 CURVNET_SET(v); 274 cdg_init_vnet(NULL); 275 CURVNET_RESTORE(); 276 } 277 VNET_LIST_RUNLOCK(); 278 return (0); 279 } 280 281 static int 282 cdg_mod_destroy(void) 283 { 284 285 uma_zdestroy(qdiffsample_zone); 286 return (0); 287 } 288 289 static size_t 290 cdg_data_sz(void) 291 { 292 return (sizeof(struct cdg)); 293 } 294 295 static int 296 cdg_cb_init(struct cc_var *ccv, void *ptr) 297 { 298 struct cdg *cdg_data; 299 300 INP_WLOCK_ASSERT(tptoinpcb(ccv->ccvc.tcp)); 301 if (ptr == NULL) { 302 cdg_data = malloc(sizeof(struct cdg), M_CC_MEM, M_NOWAIT); 303 if (cdg_data == NULL) 304 return (ENOMEM); 305 } else { 306 cdg_data = ptr; 307 } 308 cdg_data->shadow_w = 0; 309 cdg_data->max_qtrend = 0; 310 cdg_data->min_qtrend = 0; 311 cdg_data->queue_state = CDG_Q_UNKNOWN; 312 cdg_data->maxrtt_in_rtt = 0; 313 cdg_data->maxrtt_in_prevrtt = 0; 314 cdg_data->minrtt_in_rtt = INT_MAX; 315 cdg_data->minrtt_in_prevrtt = 0; 316 cdg_data->window_incr = 0; 317 cdg_data->rtt_count = 0; 318 cdg_data->consec_cong_cnt = 0; 319 cdg_data->sample_q_size = V_cdg_smoothing_factor; 320 cdg_data->num_samples = 0; 321 STAILQ_INIT(&cdg_data->qdiffmin_q); 322 STAILQ_INIT(&cdg_data->qdiffmax_q); 323 324 ccv->cc_data = cdg_data; 325 326 return (0); 327 } 328 329 static void 330 cdg_conn_init(struct cc_var *ccv) 331 { 332 struct cdg *cdg_data = ccv->cc_data; 333 334 /* 335 * Initialise the shadow_cwnd in case we are competing with loss based 336 * flows from the start 337 */ 338 cdg_data->shadow_w = CCV(ccv, snd_cwnd); 339 } 340 341 static void 342 cdg_cb_destroy(struct cc_var *ccv) 343 { 344 struct cdg *cdg_data; 345 struct qdiff_sample *qds, *qds_n; 346 347 cdg_data = ccv->cc_data; 348 349 qds = STAILQ_FIRST(&cdg_data->qdiffmin_q); 350 while (qds != NULL) { 351 qds_n = STAILQ_NEXT(qds, qdiff_lnk); 352 uma_zfree(qdiffsample_zone,qds); 353 qds = qds_n; 354 } 355 356 qds = STAILQ_FIRST(&cdg_data->qdiffmax_q); 357 while (qds != NULL) { 358 qds_n = STAILQ_NEXT(qds, qdiff_lnk); 359 uma_zfree(qdiffsample_zone,qds); 360 qds = qds_n; 361 } 362 363 free(ccv->cc_data, M_CC_MEM); 364 } 365 366 static int 367 cdg_beta_handler(SYSCTL_HANDLER_ARGS) 368 { 369 int error; 370 uint32_t new; 371 372 new = *(uint32_t *)arg1; 373 error = sysctl_handle_int(oidp, &new, 0, req); 374 if (error == 0 && req->newptr != NULL) { 375 if (new == 0 || new > 100) 376 error = EINVAL; 377 else 378 *(uint32_t *)arg1 = new; 379 } 380 381 return (error); 382 } 383 384 static int 385 cdg_exp_backoff_scale_handler(SYSCTL_HANDLER_ARGS) 386 { 387 int error; 388 uint32_t new; 389 390 new = *(uint32_t *)arg1; 391 error = sysctl_handle_int(oidp, &new, 0, req); 392 if (error == 0 && req->newptr != NULL) { 393 if (new < 1) 394 error = EINVAL; 395 else 396 *(uint32_t *)arg1 = new; 397 } 398 399 return (error); 400 } 401 402 static inline uint32_t 403 cdg_window_decrease(struct cc_var *ccv, unsigned long owin, unsigned int beta) 404 { 405 406 return ((ulmin(CCV(ccv, snd_wnd), owin) * beta) / 100); 407 } 408 409 /* 410 * Window increase function 411 * This window increase function is independent of the initial window size 412 * to ensure small window flows are not discriminated against (i.e. fairness). 413 * It increases at 1pkt/rtt like Reno for alpha_inc rtts, and then 2pkts/rtt for 414 * the next alpha_inc rtts, etc. 415 */ 416 static void 417 cdg_window_increase(struct cc_var *ccv, int new_measurement) 418 { 419 struct cdg *cdg_data; 420 int incr, s_w_incr; 421 422 cdg_data = ccv->cc_data; 423 incr = s_w_incr = 0; 424 425 if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) { 426 /* Slow start. */ 427 incr = CCV(ccv, t_maxseg); 428 s_w_incr = incr; 429 cdg_data->window_incr = cdg_data->rtt_count = 0; 430 } else { 431 /* Congestion avoidance. */ 432 if (new_measurement) { 433 s_w_incr = CCV(ccv, t_maxseg); 434 if (V_cdg_alpha_inc == 0) { 435 incr = CCV(ccv, t_maxseg); 436 } else { 437 if (++cdg_data->rtt_count >= V_cdg_alpha_inc) { 438 cdg_data->window_incr++; 439 cdg_data->rtt_count = 0; 440 } 441 incr = CCV(ccv, t_maxseg) * 442 cdg_data->window_incr; 443 } 444 } 445 } 446 447 if (cdg_data->shadow_w > 0) 448 cdg_data->shadow_w = ulmin(cdg_data->shadow_w + s_w_incr, 449 TCP_MAXWIN << CCV(ccv, snd_scale)); 450 451 CCV(ccv, snd_cwnd) = ulmin(CCV(ccv, snd_cwnd) + incr, 452 TCP_MAXWIN << CCV(ccv, snd_scale)); 453 } 454 455 static void 456 cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type) 457 { 458 struct cdg *cdg_data = ccv->cc_data; 459 460 switch(signal_type) { 461 case CC_CDG_DELAY: 462 CCV(ccv, snd_ssthresh) = cdg_window_decrease(ccv, 463 CCV(ccv, snd_cwnd), V_cdg_beta_delay); 464 CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); 465 CCV(ccv, snd_recover) = CCV(ccv, snd_max); 466 cdg_data->window_incr = cdg_data->rtt_count = 0; 467 ENTER_CONGRECOVERY(CCV(ccv, t_flags)); 468 break; 469 case CC_NDUPACK: 470 /* 471 * If already responding to congestion OR we have guessed no 472 * queue in the path is full. 473 */ 474 if (IN_CONGRECOVERY(CCV(ccv, t_flags)) || 475 cdg_data->queue_state < CDG_Q_FULL) { 476 CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd); 477 CCV(ccv, snd_recover) = CCV(ccv, snd_max); 478 } else { 479 /* 480 * Loss is likely to be congestion related. We have 481 * inferred a queue full state, so have shadow window 482 * react to loss as NewReno would. 483 */ 484 if (cdg_data->shadow_w > 0) 485 cdg_data->shadow_w = cdg_window_decrease(ccv, 486 cdg_data->shadow_w, RENO_BETA); 487 488 CCV(ccv, snd_ssthresh) = max(cdg_data->shadow_w, 489 cdg_window_decrease(ccv, CCV(ccv, snd_cwnd), 490 V_cdg_beta_loss)); 491 492 cdg_data->window_incr = cdg_data->rtt_count = 0; 493 } 494 ENTER_RECOVERY(CCV(ccv, t_flags)); 495 break; 496 default: 497 newreno_cc_cong_signal(ccv, signal_type); 498 break; 499 } 500 } 501 502 /* 503 * Using a negative exponential probabilistic backoff so that sources with 504 * varying RTTs which share the same link will, on average, have the same 505 * probability of backoff over time. 506 * 507 * Prob_backoff = 1 - exp(-qtrend / V_cdg_exp_backoff_scale), where 508 * V_cdg_exp_backoff_scale is the average qtrend for the exponential backoff. 509 */ 510 static inline int 511 prob_backoff(long qtrend) 512 { 513 int backoff, idx, p; 514 515 backoff = (qtrend > ((MAXGRAD * V_cdg_exp_backoff_scale) << D_P_E)); 516 517 if (!backoff) { 518 if (V_cdg_exp_backoff_scale > 1) 519 idx = (qtrend + V_cdg_exp_backoff_scale / 2) / 520 V_cdg_exp_backoff_scale; 521 else 522 idx = qtrend; 523 524 /* Backoff probability proportional to rate of queue growth. */ 525 p = (INT_MAX / (1 << EXP_PREC)) * probexp[idx]; 526 backoff = (random() < p); 527 } 528 529 return (backoff); 530 } 531 532 static inline void 533 calc_moving_average(struct cdg *cdg_data, long qdiff_max, long qdiff_min) 534 { 535 struct qdiff_sample *qds; 536 537 ++cdg_data->num_samples; 538 if (cdg_data->num_samples > cdg_data->sample_q_size) { 539 /* Minimum RTT. */ 540 qds = STAILQ_FIRST(&cdg_data->qdiffmin_q); 541 cdg_data->min_qtrend = cdg_data->min_qtrend + 542 (qdiff_min - qds->qdiff) / cdg_data->sample_q_size; 543 STAILQ_REMOVE_HEAD(&cdg_data->qdiffmin_q, qdiff_lnk); 544 qds->qdiff = qdiff_min; 545 STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds, qdiff_lnk); 546 547 /* Maximum RTT. */ 548 qds = STAILQ_FIRST(&cdg_data->qdiffmax_q); 549 cdg_data->max_qtrend = cdg_data->max_qtrend + 550 (qdiff_max - qds->qdiff) / cdg_data->sample_q_size; 551 STAILQ_REMOVE_HEAD(&cdg_data->qdiffmax_q, qdiff_lnk); 552 qds->qdiff = qdiff_max; 553 STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds, qdiff_lnk); 554 --cdg_data->num_samples; 555 } else { 556 qds = uma_zalloc(qdiffsample_zone, M_NOWAIT); 557 if (qds != NULL) { 558 cdg_data->min_qtrend = cdg_data->min_qtrend + 559 qdiff_min / cdg_data->sample_q_size; 560 qds->qdiff = qdiff_min; 561 STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds, 562 qdiff_lnk); 563 } 564 565 qds = uma_zalloc(qdiffsample_zone, M_NOWAIT); 566 if (qds) { 567 cdg_data->max_qtrend = cdg_data->max_qtrend + 568 qdiff_max / cdg_data->sample_q_size; 569 qds->qdiff = qdiff_max; 570 STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds, 571 qdiff_lnk); 572 } 573 } 574 } 575 576 static void 577 cdg_ack_received(struct cc_var *ccv, uint16_t ack_type) 578 { 579 struct cdg *cdg_data; 580 struct ertt *e_t; 581 long qdiff_max, qdiff_min; 582 int congestion, new_measurement, slowstart; 583 584 cdg_data = ccv->cc_data; 585 e_t = (struct ertt *)khelp_get_osd(&CCV(ccv, t_osd), ertt_id); 586 new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT; 587 congestion = 0; 588 cdg_data->maxrtt_in_rtt = imax(e_t->rtt, cdg_data->maxrtt_in_rtt); 589 cdg_data->minrtt_in_rtt = imin(e_t->rtt, cdg_data->minrtt_in_rtt); 590 591 if (new_measurement) { 592 slowstart = (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)); 593 /* 594 * Update smoothed gradient measurements. Since we are only 595 * using one measurement per RTT, use max or min rtt_in_rtt. 596 * This is also less noisy than a sample RTT measurement. Max 597 * RTT measurements can have trouble due to OS issues. 598 */ 599 if (cdg_data->maxrtt_in_prevrtt) { 600 qdiff_max = ((long)(cdg_data->maxrtt_in_rtt - 601 cdg_data->maxrtt_in_prevrtt) << D_P_E ); 602 qdiff_min = ((long)(cdg_data->minrtt_in_rtt - 603 cdg_data->minrtt_in_prevrtt) << D_P_E ); 604 605 if (cdg_data->sample_q_size == 0) { 606 cdg_data->max_qtrend = qdiff_max; 607 cdg_data->min_qtrend = qdiff_min; 608 } else 609 calc_moving_average(cdg_data, qdiff_max, qdiff_min); 610 611 /* Probabilistic backoff with respect to gradient. */ 612 if (slowstart && qdiff_min > 0) 613 congestion = prob_backoff(qdiff_min); 614 else if (cdg_data->min_qtrend > 0) 615 congestion = prob_backoff(cdg_data->min_qtrend); 616 else if (slowstart && qdiff_max > 0) 617 congestion = prob_backoff(qdiff_max); 618 else if (cdg_data->max_qtrend > 0) 619 congestion = prob_backoff(cdg_data->max_qtrend); 620 621 /* Update estimate of queue state. */ 622 if (cdg_data->min_qtrend > 0 && 623 cdg_data->max_qtrend <= 0) { 624 cdg_data->queue_state = CDG_Q_FULL; 625 } else if (cdg_data->min_qtrend >= 0 && 626 cdg_data->max_qtrend < 0) { 627 cdg_data->queue_state = CDG_Q_EMPTY; 628 cdg_data->shadow_w = 0; 629 } else if (cdg_data->min_qtrend > 0 && 630 cdg_data->max_qtrend > 0) { 631 cdg_data->queue_state = CDG_Q_RISING; 632 } else if (cdg_data->min_qtrend < 0 && 633 cdg_data->max_qtrend < 0) { 634 cdg_data->queue_state = CDG_Q_FALLING; 635 } 636 637 if (cdg_data->min_qtrend < 0 || 638 cdg_data->max_qtrend < 0) 639 cdg_data->consec_cong_cnt = 0; 640 } 641 642 cdg_data->minrtt_in_prevrtt = cdg_data->minrtt_in_rtt; 643 cdg_data->minrtt_in_rtt = INT_MAX; 644 cdg_data->maxrtt_in_prevrtt = cdg_data->maxrtt_in_rtt; 645 cdg_data->maxrtt_in_rtt = 0; 646 e_t->flags &= ~ERTT_NEW_MEASUREMENT; 647 } 648 649 if (congestion) { 650 cdg_data->consec_cong_cnt++; 651 if (!IN_RECOVERY(CCV(ccv, t_flags))) { 652 if (cdg_data->consec_cong_cnt <= V_cdg_consec_cong) 653 cdg_cong_signal(ccv, CC_CDG_DELAY); 654 else 655 /* 656 * We have been backing off but the queue is not 657 * falling. Assume we are competing with 658 * loss-based flows and don't back off for the 659 * next V_cdg_hold_backoff RTT periods. 660 */ 661 if (cdg_data->consec_cong_cnt >= 662 V_cdg_consec_cong + V_cdg_hold_backoff) 663 cdg_data->consec_cong_cnt = 0; 664 665 /* Won't see effect until 2nd RTT. */ 666 cdg_data->maxrtt_in_prevrtt = 0; 667 /* 668 * Resync shadow window in case we are competing with a 669 * loss based flow 670 */ 671 cdg_data->shadow_w = ulmax(CCV(ccv, snd_cwnd), 672 cdg_data->shadow_w); 673 } 674 } else if (ack_type == CC_ACK) 675 cdg_window_increase(ccv, new_measurement); 676 } 677 678 /* When a vnet is created and being initialised, init the per-stack CDG vars. */ 679 VNET_SYSINIT(cdg_init_vnet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, 680 cdg_init_vnet, NULL); 681 682 SYSCTL_DECL(_net_inet_tcp_cc_cdg); 683 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, cdg, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 684 "CAIA delay-gradient congestion control related settings"); 685 686 SYSCTL_STRING(_net_inet_tcp_cc_cdg, OID_AUTO, version, 687 CTLFLAG_RD, CDG_VERSION, sizeof(CDG_VERSION) - 1, 688 "Current algorithm/implementation version number"); 689 690 SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, alpha_inc, 691 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_alpha_inc), 0, 692 "Increment the window increase factor alpha by 1 MSS segment every " 693 "alpha_inc RTTs during congestion avoidance mode."); 694 695 SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_delay, 696 CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 697 &VNET_NAME(cdg_beta_delay), 70, &cdg_beta_handler, "IU", 698 "Delay-based window decrease factor as a percentage " 699 "(on delay-based backoff, w = w * beta_delay / 100)"); 700 701 SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_loss, 702 CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 703 &VNET_NAME(cdg_beta_loss), 50, &cdg_beta_handler, "IU", 704 "Loss-based window decrease factor as a percentage " 705 "(on loss-based backoff, w = w * beta_loss / 100)"); 706 707 SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, exp_backoff_scale, 708 CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 709 &VNET_NAME(cdg_exp_backoff_scale), 2, &cdg_exp_backoff_scale_handler, "IU", 710 "Scaling parameter for the probabilistic exponential backoff"); 711 712 SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, smoothing_factor, 713 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_smoothing_factor), 8, 714 "Number of samples used for moving average smoothing (0 = no smoothing)"); 715 716 SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_consec_cong, 717 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_consec_cong), 5, 718 "Number of consecutive delay-gradient based congestion episodes which will " 719 "trigger loss based CC compatibility"); 720 721 SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_hold_backoff, 722 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_hold_backoff), 5, 723 "Number of consecutive delay-gradient based congestion episodes to hold " 724 "the window backoff for loss based CC compatibility"); 725 726 DECLARE_CC_MODULE(cdg, &cdg_cc_algo); 727 MODULE_VERSION(cdg, 2); 728 MODULE_DEPEND(cdg, ertt, 1, 1, 1); 729