/*
 * CAIA Delay-Gradient (CDG) congestion control
 *
 * This implementation is based on the paper:
 *   D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
 *   delay gradients." In IFIP Networking, pages 328-341. Springer, 2011.
 *
 * Scavenger traffic (Less-than-Best-Effort) should disable coexistence
 * heuristics using parameters use_shadow=0 and use_ineff=0.
 *
 * Parameters window, backoff_beta, and backoff_factor are crucial for
 * throughput and delay. Future work is needed to determine better defaults,
 * and to provide guidelines for use in different environments/contexts.
 *
 * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/.
 * Parameter window is only configurable when loading tcp_cdg as a module.
 *
 * Notable differences from paper/FreeBSD:
 *   o Using Hybrid Slow start and Proportional Rate Reduction.
 *   o Add toggle for shadow window mechanism. Suggested by David Hayes.
 *   o Add toggle for non-congestion loss tolerance.
 *   o Scaling parameter G is changed to a backoff factor;
 *     conversion is given by: backoff_factor = 1000/(G * window).
 *   o Limit shadow window to 2 * cwnd, or to cwnd when application limited.
 *   o More accurate e^-x.
 */
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/sched/clock.h>

#include <net/tcp.h>

#define HYSTART_ACK_TRAIN	1
#define HYSTART_DELAY		2

static int window __read_mostly = 8;
static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */
static unsigned int backoff_factor __read_mostly = 42;
static unsigned int hystart_detect __read_mostly = 3;
static unsigned int use_ineff __read_mostly = 5;
static bool use_shadow __read_mostly = true;
static bool use_tolerance __read_mostly;

module_param(window, int, 0444);
MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)");
module_param(backoff_beta, uint, 0644);
MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)");
module_param(backoff_factor, uint, 0644);
MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor");
module_param(hystart_detect, uint, 0644);
MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start "
		 "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)");
module_param(use_ineff, uint, 0644);
MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)");
module_param(use_shadow, bool, 0644);
MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
module_param(use_tolerance, bool, 0644);
MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
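
/* For illustration (a check of the conversion noted in the header, not a
 * value taken from the paper): with the defaults above, window = 8 and
 * backoff_factor = 42, the implied scaling parameter is
 * G = 1000 / (backoff_factor * window) = 1000 / 336, i.e. roughly 3.
 * Conversely, picking G = 3 with window = 8 gives
 * backoff_factor = 1000 / (3 * 8) ~= 42, matching the default.
 */
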
struct cdg_minmax {
	union {
		struct {
			s32 min;
			s32 max;
		};
		u64 v64;
	};
};

enum cdg_state {
	CDG_UNKNOWN = 0,
	CDG_NONFULL = 1,
	CDG_FULL = 2,
	CDG_BACKOFF = 3,
};

struct cdg {
	struct cdg_minmax rtt;
	struct cdg_minmax rtt_prev;
	struct cdg_minmax *gradients;
	struct cdg_minmax gsum;
	bool gfilled;
	u8  tail;
	u8  state;
	u8  delack;
	u32 rtt_seq;
	u32 undo_cwnd;
	u32 shadow_wnd;
	u16 backoff_cnt;
	u16 sample_cnt;
	s32 delay_min;
	u32 last_ack;
	u32 round_start;
};

/**
 * nexp_u32 - negative base-e exponential
 * @ux: x in units of micro
 *
 * Returns exp(ux * -1e-6) * U32_MAX.
 */
static u32 __pure nexp_u32(u32 ux)
{
	static const u16 v[] = {
		/* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */
		65535,
		65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422,
		61378, 57484, 50423, 38795, 22965, 8047,  987,   14,
	};
	u32 msb = ux >> 8;
	u32 res;
	int i;

	/* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */
	if (msb > U16_MAX)
		return 0;

	/* Scale first eight bits linearly: */
	res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000);

	/* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */
	for (i = 1; msb; i++, msb >>= 1) {
		u32 y = v[i & -(msb & 1)] + U32_C(1);

		res = ((u64)res * y) >> 16;
	}

	return res;
}
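
/* For illustration: the loop above treats each set bit of msb as one factor
 * from the table, since exp(-(x + y)) = exp(-x) * exp(-y). For example,
 * ux = 0x300 (768 usec) has msb = 3 and no low bits, so the result is
 * approximately ((65518 + 1) / 65536) * ((65501 + 1) / 65536) * U32_MAX,
 * i.e. about exp(-0.000256) * exp(-0.000512) * U32_MAX
 *          = exp(-0.000768) * U32_MAX.
 */
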
/* Based on the HyStart algorithm (by Ha et al.) that is implemented in
 * tcp_cubic. Differences/experimental changes:
 *   o Using Hayes' delayed ACK filter.
 *   o Using a usec clock for the ACK train.
 *   o Reset ACK train when application limited.
 *   o Invoked at any cwnd (i.e. also when cwnd < 16).
 *   o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh).
 */
static void tcp_cdg_hystart_update(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min);
	if (ca->delay_min == 0)
		return;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		u32 now_us = div_u64(local_clock(), NSEC_PER_USEC);

		if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
			ca->last_ack = now_us;
			ca->round_start = now_us;
		} else if (before(now_us, ca->last_ack + 3000)) {
			u32 base_owd = max(ca->delay_min / 2U, 125U);

			ca->last_ack = now_us;
			if (after(now_us, ca->round_start + base_owd)) {
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINCWND,
					      tp->snd_cwnd);
				tp->snd_ssthresh = tp->snd_cwnd;
				return;
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		if (ca->sample_cnt < 8) {
			ca->sample_cnt++;
		} else {
			s32 thresh = max(ca->delay_min + ca->delay_min / 8U,
					 125U);

			if (ca->rtt.min > thresh) {
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYCWND,
					      tp->snd_cwnd);
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}
}

static s32 tcp_cdg_grad(struct cdg *ca)
{
	s32 gmin = ca->rtt.min - ca->rtt_prev.min;
	s32 gmax = ca->rtt.max - ca->rtt_prev.max;
	s32 grad;

	if (ca->gradients) {
		ca->gsum.min += gmin - ca->gradients[ca->tail].min;
		ca->gsum.max += gmax - ca->gradients[ca->tail].max;
		ca->gradients[ca->tail].min = gmin;
		ca->gradients[ca->tail].max = gmax;
		ca->tail = (ca->tail + 1) & (window - 1);
		gmin = ca->gsum.min;
		gmax = ca->gsum.max;
	}

	/* We keep sums to ignore gradients during cwnd reductions;
	 * the paper's smoothed gradients otherwise simplify to:
	 * (rtt_latest - rtt_oldest) / window.
	 *
	 * We also drop division by window here.
	 */
	grad = gmin > 0 ? gmin : gmax;

	/* Extrapolate missing values in gradient window: */
	if (!ca->gfilled) {
		if (!ca->gradients && window > 1)
			grad *= window; /* Memory allocation failed. */
		else if (ca->tail == 0)
			ca->gfilled = true;
		else
			grad = (grad * window) / (int)ca->tail;
	}

	/* Backoff was effectual: */
	if (gmin <= -32 || gmax <= -32)
		ca->backoff_cnt = 0;

	if (use_tolerance) {
		/* Reduce small variations to zero: */
		gmin = DIV_ROUND_CLOSEST(gmin, 64);
		gmax = DIV_ROUND_CLOSEST(gmax, 64);

		if (gmin > 0 && gmax <= 0)
			ca->state = CDG_FULL;
		else if ((gmin > 0 && gmax > 0) || gmax < 0)
			ca->state = CDG_NONFULL;
	}
	return grad;
}

static bool tcp_cdg_backoff(struct sock *sk, u32 grad)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (prandom_u32() <= nexp_u32(grad * backoff_factor))
		return false;

	if (use_ineff) {
		ca->backoff_cnt++;
		if (ca->backoff_cnt > use_ineff)
			return false;
	}

	ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd);
	ca->state = CDG_BACKOFF;
	tcp_enter_cwr(sk);
	return true;
}
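
/* For illustration: prandom_u32() is uniform over [0, U32_MAX] and
 * nexp_u32() returns exp(-grad * backoff_factor * 1e-6) * U32_MAX, so the
 * function above backs off with probability of about
 * 1 - exp(-grad * backoff_factor * 1e-6). With grad being the undivided
 * window sum of per-interval RTT gradients (in usec), and
 * backoff_factor = 1000/(G * window) as noted in the header, this is
 * roughly 1 - exp(-g/G) for an average per-interval gradient of
 * g milliseconds.
 */
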
/* Not called in CWR or Recovery state. */
static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	u32 prior_snd_cwnd;
	u32 incr;

	if (tcp_in_slow_start(tp) && hystart_detect)
		tcp_cdg_hystart_update(sk);

	if (after(ack, ca->rtt_seq) && ca->rtt.v64) {
		s32 grad = 0;

		if (ca->rtt_prev.v64)
			grad = tcp_cdg_grad(ca);
		ca->rtt_seq = tp->snd_nxt;
		ca->rtt_prev = ca->rtt;
		ca->rtt.v64 = 0;
		ca->last_ack = 0;
		ca->sample_cnt = 0;

		if (grad > 0 && tcp_cdg_backoff(sk, grad))
			return;
	}

	if (!tcp_is_cwnd_limited(sk)) {
		ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd);
		return;
	}

	prior_snd_cwnd = tp->snd_cwnd;
	tcp_reno_cong_avoid(sk, ack, acked);

	incr = tp->snd_cwnd - prior_snd_cwnd;
	ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
}

static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (sample->rtt_us <= 0)
		return;

	/* A heuristic for filtering delayed ACKs, adapted from:
	 * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support
	 * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
	 */
	if (tp->sacked_out == 0) {
		if (sample->pkts_acked == 1 && ca->delack) {
			/* A delayed ACK is only used for the minimum if it is
			 * provably lower than an existing non-zero minimum.
			 */
			ca->rtt.min = min(ca->rtt.min, sample->rtt_us);
			ca->delack--;
			return;
		} else if (sample->pkts_acked > 1 && ca->delack < 5) {
			ca->delack++;
		}
	}

	ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us);
	ca->rtt.max = max(ca->rtt.max, sample->rtt_us);
}

static u32 tcp_cdg_ssthresh(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	ca->undo_cwnd = tp->snd_cwnd;

	if (ca->state == CDG_BACKOFF)
		return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10);

	if (ca->state == CDG_NONFULL && use_tolerance)
		return tp->snd_cwnd;

	ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd);
	if (use_shadow)
		return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1);
	return max(2U, tp->snd_cwnd >> 1);
}

static u32 tcp_cdg_undo_cwnd(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);

	return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd);
}

static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct cdg_minmax *gradients;

	switch (ev) {
	case CA_EVENT_CWND_RESTART:
		gradients = ca->gradients;
		if (gradients)
			memset(gradients, 0, window * sizeof(gradients[0]));
		memset(ca, 0, sizeof(*ca));

		ca->gradients = gradients;
		ca->rtt_seq = tp->snd_nxt;
		ca->shadow_wnd = tp->snd_cwnd;
		break;
	case CA_EVENT_COMPLETE_CWR:
		ca->state = CDG_UNKNOWN;
		ca->rtt_seq = tp->snd_nxt;
		ca->rtt_prev = ca->rtt;
		ca->rtt.v64 = 0;
		break;
	default:
		break;
	}
}

static void tcp_cdg_init(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* We silently fall back to window = 1 if allocation fails. */
	if (window > 1)
		ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
					GFP_NOWAIT | __GFP_NOWARN);
	ca->rtt_seq = tp->snd_nxt;
	ca->shadow_wnd = tp->snd_cwnd;
}

static void tcp_cdg_release(struct sock *sk)
{
	struct cdg *ca = inet_csk_ca(sk);

	kfree(ca->gradients);
}

static struct tcp_congestion_ops tcp_cdg __read_mostly = {
	.cong_avoid = tcp_cdg_cong_avoid,
	.cwnd_event = tcp_cdg_cwnd_event,
	.pkts_acked = tcp_cdg_acked,
	.undo_cwnd = tcp_cdg_undo_cwnd,
	.ssthresh = tcp_cdg_ssthresh,
	.release = tcp_cdg_release,
	.init = tcp_cdg_init,
	.owner = THIS_MODULE,
	.name = "cdg",
};

static int __init tcp_cdg_register(void)
{
	if (backoff_beta > 1024 || window < 1 || window > 256)
		return -ERANGE;
	if (!is_power_of_2(window))
		return -EINVAL;

	BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&tcp_cdg);
}

static void __exit tcp_cdg_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_cdg);
}

module_init(tcp_cdg_register);
module_exit(tcp_cdg_unregister);
MODULE_AUTHOR("Kenneth Klette Jonassen");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("TCP CDG");
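
/* Example configuration (illustrative only; exact knobs are described in the
 * header comment above):
 *
 *   modprobe tcp_cdg window=16
 *   sysctl -w net.ipv4.tcp_congestion_control=cdg
 *   echo 1 > /sys/module/tcp_cdg/parameters/use_tolerance
 *
 * Scavenger (Less-than-Best-Effort) use, per the header, would additionally
 * set use_shadow=0 and use_ineff=0.
 */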