1 /* 2 * CAIA Delay-Gradient (CDG) congestion control 3 * 4 * This implementation is based on the paper: 5 * D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using 6 * delay gradients." In IFIP Networking, pages 328-341. Springer, 2011. 7 * 8 * Scavenger traffic (Less-than-Best-Effort) should disable coexistence 9 * heuristics using parameters use_shadow=0 and use_ineff=0. 10 * 11 * Parameters window, backoff_beta, and backoff_factor are crucial for 12 * throughput and delay. Future work is needed to determine better defaults, 13 * and to provide guidelines for use in different environments/contexts. 14 * 15 * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/. 16 * Parameter window is only configurable when loading tcp_cdg as a module. 17 * 18 * Notable differences from paper/FreeBSD: 19 * o Using Hybrid Slow start and Proportional Rate Reduction. 20 * o Add toggle for shadow window mechanism. Suggested by David Hayes. 21 * o Add toggle for non-congestion loss tolerance. 22 * o Scaling parameter G is changed to a backoff factor; 23 * conversion is given by: backoff_factor = 1000/(G * window). 24 * o Limit shadow window to 2 * cwnd, or to cwnd when application limited. 25 * o More accurate e^-x. 26 */ 27 #include <linux/kernel.h> 28 #include <linux/random.h> 29 #include <linux/module.h> 30 #include <linux/sched/clock.h> 31 32 #include <net/tcp.h> 33 34 #define HYSTART_ACK_TRAIN 1 35 #define HYSTART_DELAY 2 36 37 static int window __read_mostly = 8; 38 static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */ 39 static unsigned int backoff_factor __read_mostly = 42; 40 static unsigned int hystart_detect __read_mostly = 3; 41 static unsigned int use_ineff __read_mostly = 5; 42 static bool use_shadow __read_mostly = true; 43 static bool use_tolerance __read_mostly; 44 45 module_param(window, int, 0444); 46 MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)"); 47 module_param(backoff_beta, uint, 0644); 48 MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)"); 49 module_param(backoff_factor, uint, 0644); 50 MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor"); 51 module_param(hystart_detect, uint, 0644); 52 MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start " 53 "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)"); 54 module_param(use_ineff, uint, 0644); 55 MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)"); 56 module_param(use_shadow, bool, 0644); 57 MODULE_PARM_DESC(use_shadow, "use shadow window heuristic"); 58 module_param(use_tolerance, bool, 0644); 59 MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); 60 61 struct cdg_minmax { 62 union { 63 struct { 64 s32 min; 65 s32 max; 66 }; 67 u64 v64; 68 }; 69 }; 70 71 enum cdg_state { 72 CDG_UNKNOWN = 0, 73 CDG_NONFULL = 1, 74 CDG_FULL = 2, 75 CDG_BACKOFF = 3, 76 }; 77 78 struct cdg { 79 struct cdg_minmax rtt; 80 struct cdg_minmax rtt_prev; 81 struct cdg_minmax *gradients; 82 struct cdg_minmax gsum; 83 bool gfilled; 84 u8 tail; 85 u8 state; 86 u8 delack; 87 u32 rtt_seq; 88 u32 shadow_wnd; 89 u16 backoff_cnt; 90 u16 sample_cnt; 91 s32 delay_min; 92 u32 last_ack; 93 u32 round_start; 94 }; 95 96 /** 97 * nexp_u32 - negative base-e exponential 98 * @ux: x in units of micro 99 * 100 * Returns exp(ux * -1e-6) * U32_MAX. 101 */ 102 static u32 __pure nexp_u32(u32 ux) 103 { 104 static const u16 v[] = { 105 /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */ 106 65535, 107 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422, 108 61378, 57484, 50423, 38795, 22965, 8047, 987, 14, 109 }; 110 u32 msb = ux >> 8; 111 u32 res; 112 int i; 113 114 /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */ 115 if (msb > U16_MAX) 116 return 0; 117 118 /* Scale first eight bits linearly: */ 119 res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000); 120 121 /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */ 122 for (i = 1; msb; i++, msb >>= 1) { 123 u32 y = v[i & -(msb & 1)] + U32_C(1); 124 125 res = ((u64)res * y) >> 16; 126 } 127 128 return res; 129 } 130 131 /* Based on the HyStart algorithm (by Ha et al.) that is implemented in 132 * tcp_cubic. Differences/experimental changes: 133 * o Using Hayes' delayed ACK filter. 134 * o Using a usec clock for the ACK train. 135 * o Reset ACK train when application limited. 136 * o Invoked at any cwnd (i.e. also when cwnd < 16). 137 * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh). 138 */ 139 static void tcp_cdg_hystart_update(struct sock *sk) 140 { 141 struct cdg *ca = inet_csk_ca(sk); 142 struct tcp_sock *tp = tcp_sk(sk); 143 144 ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min); 145 if (ca->delay_min == 0) 146 return; 147 148 if (hystart_detect & HYSTART_ACK_TRAIN) { 149 u32 now_us = tp->tcp_mstamp; 150 151 if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { 152 ca->last_ack = now_us; 153 ca->round_start = now_us; 154 } else if (before(now_us, ca->last_ack + 3000)) { 155 u32 base_owd = max(ca->delay_min / 2U, 125U); 156 157 ca->last_ack = now_us; 158 if (after(now_us, ca->round_start + base_owd)) { 159 NET_INC_STATS(sock_net(sk), 160 LINUX_MIB_TCPHYSTARTTRAINDETECT); 161 NET_ADD_STATS(sock_net(sk), 162 LINUX_MIB_TCPHYSTARTTRAINCWND, 163 tp->snd_cwnd); 164 tp->snd_ssthresh = tp->snd_cwnd; 165 return; 166 } 167 } 168 } 169 170 if (hystart_detect & HYSTART_DELAY) { 171 if (ca->sample_cnt < 8) { 172 ca->sample_cnt++; 173 } else { 174 s32 thresh = max(ca->delay_min + ca->delay_min / 8U, 175 125U); 176 177 if (ca->rtt.min > thresh) { 178 NET_INC_STATS(sock_net(sk), 179 LINUX_MIB_TCPHYSTARTDELAYDETECT); 180 NET_ADD_STATS(sock_net(sk), 181 LINUX_MIB_TCPHYSTARTDELAYCWND, 182 tp->snd_cwnd); 183 tp->snd_ssthresh = tp->snd_cwnd; 184 } 185 } 186 } 187 } 188 189 static s32 tcp_cdg_grad(struct cdg *ca) 190 { 191 s32 gmin = ca->rtt.min - ca->rtt_prev.min; 192 s32 gmax = ca->rtt.max - ca->rtt_prev.max; 193 s32 grad; 194 195 if (ca->gradients) { 196 ca->gsum.min += gmin - ca->gradients[ca->tail].min; 197 ca->gsum.max += gmax - ca->gradients[ca->tail].max; 198 ca->gradients[ca->tail].min = gmin; 199 ca->gradients[ca->tail].max = gmax; 200 ca->tail = (ca->tail + 1) & (window - 1); 201 gmin = ca->gsum.min; 202 gmax = ca->gsum.max; 203 } 204 205 /* We keep sums to ignore gradients during cwnd reductions; 206 * the paper's smoothed gradients otherwise simplify to: 207 * (rtt_latest - rtt_oldest) / window. 208 * 209 * We also drop division by window here. 210 */ 211 grad = gmin > 0 ? gmin : gmax; 212 213 /* Extrapolate missing values in gradient window: */ 214 if (!ca->gfilled) { 215 if (!ca->gradients && window > 1) 216 grad *= window; /* Memory allocation failed. */ 217 else if (ca->tail == 0) 218 ca->gfilled = true; 219 else 220 grad = (grad * window) / (int)ca->tail; 221 } 222 223 /* Backoff was effectual: */ 224 if (gmin <= -32 || gmax <= -32) 225 ca->backoff_cnt = 0; 226 227 if (use_tolerance) { 228 /* Reduce small variations to zero: */ 229 gmin = DIV_ROUND_CLOSEST(gmin, 64); 230 gmax = DIV_ROUND_CLOSEST(gmax, 64); 231 232 if (gmin > 0 && gmax <= 0) 233 ca->state = CDG_FULL; 234 else if ((gmin > 0 && gmax > 0) || gmax < 0) 235 ca->state = CDG_NONFULL; 236 } 237 return grad; 238 } 239 240 static bool tcp_cdg_backoff(struct sock *sk, u32 grad) 241 { 242 struct cdg *ca = inet_csk_ca(sk); 243 struct tcp_sock *tp = tcp_sk(sk); 244 245 if (prandom_u32() <= nexp_u32(grad * backoff_factor)) 246 return false; 247 248 if (use_ineff) { 249 ca->backoff_cnt++; 250 if (ca->backoff_cnt > use_ineff) 251 return false; 252 } 253 254 ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd); 255 ca->state = CDG_BACKOFF; 256 tcp_enter_cwr(sk); 257 return true; 258 } 259 260 /* Not called in CWR or Recovery state. */ 261 static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) 262 { 263 struct cdg *ca = inet_csk_ca(sk); 264 struct tcp_sock *tp = tcp_sk(sk); 265 u32 prior_snd_cwnd; 266 u32 incr; 267 268 if (tcp_in_slow_start(tp) && hystart_detect) 269 tcp_cdg_hystart_update(sk); 270 271 if (after(ack, ca->rtt_seq) && ca->rtt.v64) { 272 s32 grad = 0; 273 274 if (ca->rtt_prev.v64) 275 grad = tcp_cdg_grad(ca); 276 ca->rtt_seq = tp->snd_nxt; 277 ca->rtt_prev = ca->rtt; 278 ca->rtt.v64 = 0; 279 ca->last_ack = 0; 280 ca->sample_cnt = 0; 281 282 if (grad > 0 && tcp_cdg_backoff(sk, grad)) 283 return; 284 } 285 286 if (!tcp_is_cwnd_limited(sk)) { 287 ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd); 288 return; 289 } 290 291 prior_snd_cwnd = tp->snd_cwnd; 292 tcp_reno_cong_avoid(sk, ack, acked); 293 294 incr = tp->snd_cwnd - prior_snd_cwnd; 295 ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr); 296 } 297 298 static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample) 299 { 300 struct cdg *ca = inet_csk_ca(sk); 301 struct tcp_sock *tp = tcp_sk(sk); 302 303 if (sample->rtt_us <= 0) 304 return; 305 306 /* A heuristic for filtering delayed ACKs, adapted from: 307 * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support 308 * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010. 309 */ 310 if (tp->sacked_out == 0) { 311 if (sample->pkts_acked == 1 && ca->delack) { 312 /* A delayed ACK is only used for the minimum if it is 313 * provenly lower than an existing non-zero minimum. 314 */ 315 ca->rtt.min = min(ca->rtt.min, sample->rtt_us); 316 ca->delack--; 317 return; 318 } else if (sample->pkts_acked > 1 && ca->delack < 5) { 319 ca->delack++; 320 } 321 } 322 323 ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us); 324 ca->rtt.max = max(ca->rtt.max, sample->rtt_us); 325 } 326 327 static u32 tcp_cdg_ssthresh(struct sock *sk) 328 { 329 struct cdg *ca = inet_csk_ca(sk); 330 struct tcp_sock *tp = tcp_sk(sk); 331 332 if (ca->state == CDG_BACKOFF) 333 return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10); 334 335 if (ca->state == CDG_NONFULL && use_tolerance) 336 return tp->snd_cwnd; 337 338 ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd); 339 if (use_shadow) 340 return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1); 341 return max(2U, tp->snd_cwnd >> 1); 342 } 343 344 static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) 345 { 346 struct cdg *ca = inet_csk_ca(sk); 347 struct tcp_sock *tp = tcp_sk(sk); 348 struct cdg_minmax *gradients; 349 350 switch (ev) { 351 case CA_EVENT_CWND_RESTART: 352 gradients = ca->gradients; 353 if (gradients) 354 memset(gradients, 0, window * sizeof(gradients[0])); 355 memset(ca, 0, sizeof(*ca)); 356 357 ca->gradients = gradients; 358 ca->rtt_seq = tp->snd_nxt; 359 ca->shadow_wnd = tp->snd_cwnd; 360 break; 361 case CA_EVENT_COMPLETE_CWR: 362 ca->state = CDG_UNKNOWN; 363 ca->rtt_seq = tp->snd_nxt; 364 ca->rtt_prev = ca->rtt; 365 ca->rtt.v64 = 0; 366 break; 367 default: 368 break; 369 } 370 } 371 372 static void tcp_cdg_init(struct sock *sk) 373 { 374 struct cdg *ca = inet_csk_ca(sk); 375 struct tcp_sock *tp = tcp_sk(sk); 376 377 /* We silently fall back to window = 1 if allocation fails. */ 378 if (window > 1) 379 ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), 380 GFP_NOWAIT | __GFP_NOWARN); 381 ca->rtt_seq = tp->snd_nxt; 382 ca->shadow_wnd = tp->snd_cwnd; 383 } 384 385 static void tcp_cdg_release(struct sock *sk) 386 { 387 struct cdg *ca = inet_csk_ca(sk); 388 389 kfree(ca->gradients); 390 } 391 392 static struct tcp_congestion_ops tcp_cdg __read_mostly = { 393 .cong_avoid = tcp_cdg_cong_avoid, 394 .cwnd_event = tcp_cdg_cwnd_event, 395 .pkts_acked = tcp_cdg_acked, 396 .undo_cwnd = tcp_reno_undo_cwnd, 397 .ssthresh = tcp_cdg_ssthresh, 398 .release = tcp_cdg_release, 399 .init = tcp_cdg_init, 400 .owner = THIS_MODULE, 401 .name = "cdg", 402 }; 403 404 static int __init tcp_cdg_register(void) 405 { 406 if (backoff_beta > 1024 || window < 1 || window > 256) 407 return -ERANGE; 408 if (!is_power_of_2(window)) 409 return -EINVAL; 410 411 BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE); 412 tcp_register_congestion_control(&tcp_cdg); 413 return 0; 414 } 415 416 static void __exit tcp_cdg_unregister(void) 417 { 418 tcp_unregister_congestion_control(&tcp_cdg); 419 } 420 421 module_init(tcp_cdg_register); 422 module_exit(tcp_cdg_unregister); 423 MODULE_AUTHOR("Kenneth Klette Jonassen"); 424 MODULE_LICENSE("GPL"); 425 MODULE_DESCRIPTION("TCP CDG"); 426