// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/* COMMON Applications Kept Enhanced (CAKE) discipline
 *
 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
 *
 * The CAKE Principles:
 * (or, how to have your cake and eat it too)
 *
 * This is a combination of several shaping, AQM and FQ techniques into one
 * easy-to-use package:
 *
 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
 *   equipment and bloated MACs. This operates in deficit mode (as in sch_fq),
 *   eliminating the need for any sort of burst parameter (eg. token bucket
 *   depth). Burst support is limited to that necessary to overcome scheduling
 *   latency.
 *
 * - A Diffserv-aware priority queue, giving more priority to certain classes,
 *   up to a specified fraction of bandwidth. Above that bandwidth threshold,
 *   the priority is reduced to avoid starving other tins.
 *
 * - Each priority tin has a separate Flow Queue system, to isolate traffic
 *   flows from each other. This prevents a burst on one flow from increasing
 *   the delay to another. Flows are distributed to queues using a
 *   set-associative hash function.
 *
 * - Each queue is actively managed by Cobalt, which is a combination of the
 *   Codel and Blue AQM algorithms. This serves flows fairly, and signals
 *   congestion early via ECN (if available) and/or packet drops, to keep
 *   latency low. The codel parameters are auto-tuned based on the bandwidth
 *   setting, as is necessary at low bandwidths.
 *
 * The configuration parameters are kept deliberately simple for ease of use.
 * Everything has sane defaults. Complete generality of configuration is *not*
 * a goal.
 *
 * The priority queue operates according to a weighted DRR scheme, combined with
 * a bandwidth tracker which reuses the shaper logic to detect which side of the
 * bandwidth sharing threshold the tin is operating. This determines whether a
 * priority-based weight (high) or a bandwidth-based weight (low) is used for
 * that tin in the current pass.
 *
 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
 * granted us permission to leverage.
 */
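
/* Typical usage is a single instance on a WAN-facing interface, for example
 * (illustrative only; see the tc-cake(8) man page for the full option list):
 *
 *	tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *
 * which enables the shaper at 20 Mbit/s, the four-tin Diffserv mapping and
 * the NAT-aware flow hashing described below.
 */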

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/reciprocal_div.h>
#include <net/netlink.h>
#include <linux/if_vlan.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/tcp.h>
#include <net/flow_dissector.h>

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#endif

#define CAKE_SET_WAYS (8)
#define CAKE_MAX_TINS (8)
#define CAKE_QUEUES (1024)
#define CAKE_FLOW_MASK 63
#define CAKE_FLOW_NAT_FLAG 64

/* struct cobalt_params - contains codel and blue parameters
 * @interval:	codel initial drop rate
 * @target:	maximum persistent sojourn time & blue update rate
 * @mtu_time:	serialisation delay of maximum-size packet
 * @p_inc:	increment of blue drop probability (0.32 fxp)
 * @p_dec:	decrement of blue drop probability (0.32 fxp)
 */
struct cobalt_params {
	u64 interval;
	u64 target;
	u64 mtu_time;
	u32 p_inc;
	u32 p_dec;
};

/* struct cobalt_vars - contains codel and blue variables
 * @count:		codel dropping frequency
 * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
 * @drop_next:		time to drop next packet, or when we dropped last
 * @blue_timer:		Blue time to next drop
 * @p_drop:		BLUE drop probability (0.32 fxp)
 * @dropping:		set if in dropping state
 * @ecn_marked:		set if marked
 */
struct cobalt_vars {
	u32 count;
	u32 rec_inv_sqrt;
	ktime_t drop_next;
	ktime_t blue_timer;
	u32 p_drop;
	bool dropping;
	bool ecn_marked;
};

enum {
	CAKE_SET_NONE = 0,
	CAKE_SET_SPARSE,
	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
	CAKE_SET_BULK,
	CAKE_SET_DECAYING
};

struct cake_flow {
	/* this stuff is all needed per-flow at dequeue time */
	struct sk_buff *head;
	struct sk_buff *tail;
	struct list_head flowchain;
	s32 deficit;
	u32 dropped;
	struct cobalt_vars cvars;
	u16 srchost; /* index into cake_host table */
	u16 dsthost;
	u8 set;
}; /* please try to keep this structure <= 64 bytes */

struct cake_host {
	u32 srchost_tag;
	u32 dsthost_tag;
	u16 srchost_bulk_flow_count;
	u16 dsthost_bulk_flow_count;
};

struct cake_heap_entry {
	u16 t:3, b:10;
};

struct cake_tin_data {
	struct cake_flow flows[CAKE_QUEUES];
	u32 backlogs[CAKE_QUEUES];
	u32 tags[CAKE_QUEUES]; /* for set association */
	u16 overflow_idx[CAKE_QUEUES];
	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
	u16 flow_quantum;

	struct cobalt_params cparams;
	u32 drop_overlimit;
	u16 bulk_flow_count;
	u16 sparse_flow_count;
	u16 decaying_flow_count;
	u16 unresponsive_flow_count;

	u32 max_skblen;

	struct list_head new_flows;
	struct list_head old_flows;
	struct list_head decaying_flows;

	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
	ktime_t time_next_packet;
	u64 tin_rate_ns;
	u64 tin_rate_bps;
	u16 tin_rate_shft;

	u16 tin_quantum;
	s32 tin_deficit;
	u32 tin_backlog;
	u32 tin_dropped;
	u32 tin_ecn_mark;

	u32 packets;
	u64 bytes;

	u32 ack_drops;

	/* moving averages */
	u64 avge_delay;
	u64 peak_delay;
	u64 base_delay;

	/* hash function stats */
	u32 way_directs;
	u32 way_hits;
	u32 way_misses;
	u32 way_collisions;
}; /* number of tins is small, so size of this struct doesn't matter much */

struct cake_sched_data {
	struct tcf_proto __rcu *filter_list; /* optional external classifier */
	struct tcf_block *block;
	struct cake_tin_data *tins;

	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
	u16 overflow_timeout;

	u16 tin_cnt;
	u8 tin_mode;
	u8 flow_mode;
	u8 ack_filter;
	u8 atm_mode;

	u32 fwmark_mask;
	u16 fwmark_shft;

	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
	u16 rate_shft;
	ktime_t time_next_packet;
	ktime_t failsafe_next_packet;
	u64 rate_ns;
	u64 rate_bps;
	u16 rate_flags;
	s16 rate_overhead;
	u16 rate_mpu;
	u64 interval;
	u64 target;

	/* resource tracking */
	u32 buffer_used;
	u32 buffer_max_used;
	u32 buffer_limit;
	u32 buffer_config_limit;

	/* indices for dequeue */
	u16 cur_tin;
	u16 cur_flow;

	struct qdisc_watchdog watchdog;
	const u8 *tin_index;
	const u8 *tin_order;

	/* bandwidth capacity estimate */
	ktime_t last_packet_time;
	ktime_t avg_window_begin;
	u64 avg_packet_interval;
	u64 avg_window_bytes;
	u64 avg_peak_bandwidth;
	ktime_t last_reconfig_time;

	/* packet length stats */
	u32 avg_netoff;
	u16 max_netlen;
	u16 max_adjlen;
	u16 min_netlen;
	u16 min_adjlen;
};

enum {
	CAKE_FLAG_OVERHEAD = BIT(0),
	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
	CAKE_FLAG_INGRESS = BIT(2),
	CAKE_FLAG_WASH = BIT(3),
	CAKE_FLAG_SPLIT_GSO = BIT(4)
};

/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
 * obtain the best features of each. Codel is excellent on flows which
 * respond to congestion signals in a TCP-like way. BLUE is more effective on
 * unresponsive flows.
 */

struct cobalt_skb_cb {
	ktime_t enqueue_time;
	u32 adjusted_len;
};

static u64 us_to_ns(u64 us)
{
	return us * NSEC_PER_USEC;
}

static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
}

static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
{
	return get_cobalt_cb(skb)->enqueue_time;
}

static void cobalt_set_enqueue_time(struct sk_buff *skb,
				    ktime_t now)
{
	get_cobalt_cb(skb)->enqueue_time = now;
}

static u16 quantum_div[CAKE_QUEUES + 1] = {0};

/* Diffserv lookup tables */

static const u8 precedence[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 4, 4, 4,
	5, 5, 5, 5, 5, 5, 5, 5,
	6, 6, 6, 6, 6, 6, 6, 6,
	7, 7, 7, 7, 7, 7, 7, 7,
};

static const u8 diffserv8[] = {
	2, 5, 1, 2, 4, 2, 2, 2,
	0, 2, 1, 2, 1, 2, 1, 2,
	5, 2, 4, 2, 4, 2, 4, 2,
	3, 2, 3, 2, 3, 2, 3, 2,
	6, 2, 3, 2, 3, 2, 3, 2,
	6, 2, 2, 2, 6, 2, 6, 2,
	7, 2, 2, 2, 2, 2, 2, 2,
	7, 2, 2, 2, 2, 2, 2, 2,
};

static const u8 diffserv4[] = {
	0, 2, 0, 0, 2, 0, 0, 0,
	1, 0, 0, 0, 0, 0, 0, 0,
	2, 0, 2, 0, 2, 0, 2, 0,
	2, 0, 2, 0, 2, 0, 2, 0,
	3, 0, 2, 0, 2, 0, 2, 0,
	3, 0, 0, 0, 3, 0, 3, 0,
	3, 0, 0, 0, 0, 0, 0, 0,
	3, 0, 0, 0, 0, 0, 0, 0,
};

static const u8 diffserv3[] = {
	0, 0, 0, 0, 2, 0, 0, 0,
	1, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 2, 0, 2, 0,
	2, 0, 0, 0, 0, 0, 0, 0,
	2, 0, 0, 0, 0, 0, 0, 0,
};

static const u8 besteffort[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
};

/* tin priority order for stats dumping */

static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
static const u8 bulk_order[] = {1, 0, 2, 3};

#define REC_INV_SQRT_CACHE (16)
static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};

/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 *
 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 */

static void cobalt_newton_step(struct cobalt_vars *vars)
{
	u32 invsqrt, invsqrt2;
	u64 val;

	invsqrt = vars->rec_inv_sqrt;
	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
	val = (3LL << 32) - ((u64)vars->count * invsqrt2);

	val >>= 2; /* avoid overflow in following multiply */
	val = (val * invsqrt) >> (32 - 2 + 1);

	vars->rec_inv_sqrt = val;
}

static void cobalt_invsqrt(struct cobalt_vars *vars)
{
	if (vars->count < REC_INV_SQRT_CACHE)
		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
	else
		cobalt_newton_step(vars);
}
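
/* A brief worked example of the Q0.32 convention used above, for reference:
 * count == 4 should yield rec_inv_sqrt == 1/sqrt(4) == 0.5, stored as
 * 0.5 * 2^32 == 0x80000000; count == 2 yields 1/sqrt(2) ~= 0.7071, stored as
 * roughly 0xB504F334. cobalt_control() below then turns these into
 * interval/2 and interval/sqrt(2) respectively via reciprocal_scale().
 */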

/* There is a big difference in timing between the accurate values placed in
 * the cache and the approximations given by a single Newton step for small
 * count values, particularly when stepping from count 1 to 2 or vice versa.
 * Above 16, a single Newton step gives sufficient accuracy in either
 * direction, given the precision stored.
 *
 * The magnitude of the error when stepping up to count 2 is such as to give
 * the value that *should* have been produced at count 4.
 */

static void cobalt_cache_init(void)
{
	struct cobalt_vars v;

	memset(&v, 0, sizeof(v));
	v.rec_inv_sqrt = ~0U;
	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;

	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
		cobalt_newton_step(&v);
		cobalt_newton_step(&v);
		cobalt_newton_step(&v);
		cobalt_newton_step(&v);

		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
	}
}

static void cobalt_vars_init(struct cobalt_vars *vars)
{
	memset(vars, 0, sizeof(*vars));

	if (!cobalt_rec_inv_sqrt_cache[0]) {
		cobalt_cache_init();
		cobalt_rec_inv_sqrt_cache[0] = ~0;
	}
}

/* CoDel control_law is t + interval/sqrt(count)
 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
 * both sqrt() and divide operation.
 */
static ktime_t cobalt_control(ktime_t t,
			      u64 interval,
			      u32 rec_inv_sqrt)
{
	return ktime_add_ns(t, reciprocal_scale(interval,
						rec_inv_sqrt));
}

/* Call this when a packet had to be dropped due to queue overflow. Returns
 * true if the BLUE state was quiescent before but active after this call.
 */
static bool cobalt_queue_full(struct cobalt_vars *vars,
			      struct cobalt_params *p,
			      ktime_t now)
{
	bool up = false;

	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
		up = !vars->p_drop;
		vars->p_drop += p->p_inc;
		if (vars->p_drop < p->p_inc)
			vars->p_drop = ~0;
		vars->blue_timer = now;
	}
	vars->dropping = true;
	vars->drop_next = now;
	if (!vars->count)
		vars->count = 1;

	return up;
}

/* Call this when the queue was serviced but turned out to be empty. Returns
 * true if the BLUE state was active before but quiescent after this call.
 */
static bool cobalt_queue_empty(struct cobalt_vars *vars,
			       struct cobalt_params *p,
			       ktime_t now)
{
	bool down = false;

	if (vars->p_drop &&
	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
		if (vars->p_drop < p->p_dec)
			vars->p_drop = 0;
		else
			vars->p_drop -= p->p_dec;
		vars->blue_timer = now;
		down = !vars->p_drop;
	}
	vars->dropping = false;

	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
		vars->count--;
		cobalt_invsqrt(vars);
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
	}

	return down;
}
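
/* To put concrete numbers on the control law used above: with the default
 * 100 ms interval the signalling gap shrinks as interval/sqrt(count), so
 * successive drop_next updates are spaced roughly 100 ms, 70.7 ms, 57.7 ms,
 * 50 ms, ... apart as count climbs. The BLUE half works on a much coarser
 * timescale: p_drop moves by p_inc/p_dec at most once per target period,
 * and saturates at ~0 (i.e. certain drop) when the increment wraps.
 */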

/* Call this with a freshly dequeued packet for possible congestion marking.
 * Returns true as an instruction to drop the packet, false for delivery.
 */
static bool cobalt_should_drop(struct cobalt_vars *vars,
			       struct cobalt_params *p,
			       ktime_t now,
			       struct sk_buff *skb,
			       u32 bulk_flows)
{
	bool next_due, over_target, drop = false;
	ktime_t schedule;
	u64 sojourn;

/* The 'schedule' variable records, in its sign, whether 'now' is before or
 * after 'drop_next'. This allows 'drop_next' to be updated before the next
 * scheduling decision is actually branched, without destroying that
 * information. Similarly, the first 'schedule' value calculated is preserved
 * in the boolean 'next_due'.
 *
 * As for 'drop_next', we take advantage of the fact that 'interval' is both
 * the delay between first exceeding 'target' and the first signalling event,
 * *and* the scaling factor for the signalling frequency. It's therefore very
 * natural to use a single mechanism for both purposes, and eliminates a
 * significant amount of reference Codel's spaghetti code. To help with this,
 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
 * as possible to 1.0 in fixed-point.
 */

	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
	schedule = ktime_sub(now, vars->drop_next);
	over_target = sojourn > p->target &&
		      sojourn > p->mtu_time * bulk_flows * 2 &&
		      sojourn > p->mtu_time * 4;
	next_due = vars->count && ktime_to_ns(schedule) >= 0;

	vars->ecn_marked = false;

	if (over_target) {
		if (!vars->dropping) {
			vars->dropping = true;
			vars->drop_next = cobalt_control(now,
							 p->interval,
							 vars->rec_inv_sqrt);
		}
		if (!vars->count)
			vars->count = 1;
	} else if (vars->dropping) {
		vars->dropping = false;
	}

	if (next_due && vars->dropping) {
		/* Use ECN mark if possible, otherwise drop */
		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));

		vars->count++;
		if (!vars->count)
			vars->count--;
		cobalt_invsqrt(vars);
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
		schedule = ktime_sub(now, vars->drop_next);
	} else {
		while (next_due) {
			vars->count--;
			cobalt_invsqrt(vars);
			vars->drop_next = cobalt_control(vars->drop_next,
							 p->interval,
							 vars->rec_inv_sqrt);
			schedule = ktime_sub(now, vars->drop_next);
			next_due = vars->count && ktime_to_ns(schedule) >= 0;
		}
	}

	/* Simple BLUE implementation. Lack of ECN is deliberate. */
	if (vars->p_drop)
		drop |= (prandom_u32() < vars->p_drop);

	/* Overload the drop_next field as an activity timeout */
	if (!vars->count)
		vars->drop_next = ktime_add_ns(now, p->interval);
	else if (ktime_to_ns(schedule) > 0 && !drop)
		vars->drop_next = now;

	return drop;
}
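
/* As a rough illustration of the over_target test above: at a 10 Mbit/s
 * shaped rate a 1514-byte MTU serialises in about 1.2 ms, so with the
 * default 5 ms target the Codel machinery only engages once the sojourn
 * time exceeds 5 ms, at least four MTU times (~4.8 ms), and (in ingress
 * mode) two MTU times per bulk flow sharing the tin. This keeps the AQM
 * from reacting to unavoidable serialisation delay at low rates.
 */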

static void cake_update_flowkeys(struct flow_keys *keys,
				 const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	struct nf_conntrack_tuple tuple = {};
	bool rev = !skb->_nfct;

	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
		return;

	if (!nf_ct_get_tuple_skb(&tuple, skb))
		return;

	keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
	keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;

	if (keys->ports.ports) {
		keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
		keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
	}
#endif
}

/* Cake has several subtle multiple bit settings. In these cases you
 * would be matching triple isolate mode as well.
 */

static bool cake_dsrc(int flow_mode)
{
	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
}

static bool cake_ddst(int flow_mode)
{
	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
}

static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
		     int flow_mode, u16 flow_override, u16 host_override)
{
	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
	u16 reduced_hash, srchost_idx, dsthost_idx;
	struct flow_keys keys, host_keys;

	if (unlikely(flow_mode == CAKE_FLOW_NONE))
		return 0;

	/* If both overrides are set we can skip packet dissection entirely */
	if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
	    (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
		goto skip_hash;

	skb_flow_dissect_flow_keys(skb, &keys,
				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

	if (flow_mode & CAKE_FLOW_NAT_FLAG)
		cake_update_flowkeys(&keys, skb);

	/* flow_hash_from_keys() sorts the addresses by value, so we have
	 * to preserve their order in a separate data structure to treat
	 * src and dst host addresses as independently selectable.
	 */
	host_keys = keys;
	host_keys.ports.ports = 0;
	host_keys.basic.ip_proto = 0;
	host_keys.keyid.keyid = 0;
	host_keys.tags.flow_label = 0;

	switch (host_keys.control.addr_type) {
	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
		host_keys.addrs.v4addrs.src = 0;
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
		host_keys.addrs.v4addrs.dst = 0;
		srchost_hash = flow_hash_from_keys(&host_keys);
		break;

	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
		memset(&host_keys.addrs.v6addrs.src, 0,
		       sizeof(host_keys.addrs.v6addrs.src));
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
		memset(&host_keys.addrs.v6addrs.dst, 0,
		       sizeof(host_keys.addrs.v6addrs.dst));
		srchost_hash = flow_hash_from_keys(&host_keys);
		break;

	default:
		dsthost_hash = 0;
		srchost_hash = 0;
	}

	/* This *must* be after the above switch, since as a
	 * side-effect it sorts the src and dst addresses.
	 */
	if (flow_mode & CAKE_FLOW_FLOWS)
		flow_hash = flow_hash_from_keys(&keys);

skip_hash:
	if (flow_override)
		flow_hash = flow_override - 1;
	if (host_override) {
		dsthost_hash = host_override - 1;
		srchost_hash = host_override - 1;
	}

	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
		if (flow_mode & CAKE_FLOW_SRC_IP)
			flow_hash ^= srchost_hash;

		if (flow_mode & CAKE_FLOW_DST_IP)
			flow_hash ^= dsthost_hash;
	}

	reduced_hash = flow_hash % CAKE_QUEUES;

	/* set-associative hashing */
	/* fast path if no hash collision (direct lookup succeeds) */
	if (likely(q->tags[reduced_hash] == flow_hash &&
		   q->flows[reduced_hash].set)) {
		q->way_directs++;
	} else {
		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
		u32 outer_hash = reduced_hash - inner_hash;
		bool allocate_src = false;
		bool allocate_dst = false;
		u32 i, k;
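
		/* The 1024 queues are effectively 128 sets of 8 ways here:
		 * outer_hash selects the set (the 8 consecutive slots
		 * starting at that index) and the ways are probed in turn,
		 * so a flow can land in any of 8 slots before a genuine
		 * collision has to be accepted.
		 */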
		/* check if any active queue in the set is reserved for
		 * this flow.
		 */
		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
		     i++, k = (k + 1) % CAKE_SET_WAYS) {
			if (q->tags[outer_hash + k] == flow_hash) {
				if (i)
					q->way_hits++;

				if (!q->flows[outer_hash + k].set) {
					/* need to increment host refcnts */
					allocate_src = cake_dsrc(flow_mode);
					allocate_dst = cake_ddst(flow_mode);
				}

				goto found;
			}
		}

		/* no queue is reserved for this flow, look for an
		 * empty one.
		 */
		for (i = 0; i < CAKE_SET_WAYS;
		     i++, k = (k + 1) % CAKE_SET_WAYS) {
			if (!q->flows[outer_hash + k].set) {
				q->way_misses++;
				allocate_src = cake_dsrc(flow_mode);
				allocate_dst = cake_ddst(flow_mode);
				goto found;
			}
		}

		/* With no empty queues, default to the original
		 * queue, accept the collision, update the host tags.
		 */
		q->way_collisions++;
		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
			q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
			q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
		}
		allocate_src = cake_dsrc(flow_mode);
		allocate_dst = cake_ddst(flow_mode);
found:
		/* reserve queue for future packets in same flow */
		reduced_hash = outer_hash + k;
		q->tags[reduced_hash] = flow_hash;

		if (allocate_src) {
			srchost_idx = srchost_hash % CAKE_QUEUES;
			inner_hash = srchost_idx % CAKE_SET_WAYS;
			outer_hash = srchost_idx - inner_hash;
			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (q->hosts[outer_hash + k].srchost_tag ==
				    srchost_hash)
					goto found_src;
			}
			for (i = 0; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
					break;
			}
			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
found_src:
			srchost_idx = outer_hash + k;
			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
				q->hosts[srchost_idx].srchost_bulk_flow_count++;
			q->flows[reduced_hash].srchost = srchost_idx;
		}

		if (allocate_dst) {
			dsthost_idx = dsthost_hash % CAKE_QUEUES;
			inner_hash = dsthost_idx % CAKE_SET_WAYS;
			outer_hash = dsthost_idx - inner_hash;
			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (q->hosts[outer_hash + k].dsthost_tag ==
				    dsthost_hash)
					goto found_dst;
			}
			for (i = 0; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
					break;
			}
			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
found_dst:
			dsthost_idx = outer_hash + k;
			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
			q->flows[reduced_hash].dsthost = dsthost_idx;
		}
	}

	return reduced_hash;
}

/* helper functions : might be changed when/if skb use a standard list_head */
/* remove one skb from head of slot queue */

static struct sk_buff *dequeue_head(struct cake_flow *flow)
{
	struct sk_buff *skb = flow->head;

	if (skb) {
		flow->head = skb->next;
		skb_mark_not_on_list(skb);
	}

	return skb;
}

/* add skb to flow queue (tail add) */

static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
{
	if (!flow->head)
		flow->head = skb;
	else
		flow->tail->next = skb;
	flow->tail = skb;
	skb->next = NULL;
}

static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
				    struct ipv6hdr *buf)
{
	unsigned int offset = skb_network_offset(skb);
	struct iphdr *iph;

	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);

	if (!iph)
		return NULL;

	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
		return skb_header_pointer(skb, offset + iph->ihl * 4,
					  sizeof(struct ipv6hdr), buf);

	else if (iph->version == 4)
		return iph;

	else if (iph->version == 6)
		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
					  buf);

	return NULL;
}

static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
				      void *buf, unsigned int bufsize)
{
	unsigned int offset = skb_network_offset(skb);
	const struct ipv6hdr *ipv6h;
	const struct tcphdr *tcph;
	const struct iphdr *iph;
	struct ipv6hdr _ipv6h;
	struct tcphdr _tcph;

	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);

	if (!ipv6h)
		return NULL;

	if (ipv6h->version == 4) {
		iph = (struct iphdr *)ipv6h;
		offset += iph->ihl * 4;

		/* special-case 6in4 tunnelling, as that is a common way to get
		 * v6 connectivity in the home
		 */
		if (iph->protocol == IPPROTO_IPV6) {
			ipv6h = skb_header_pointer(skb, offset,
						   sizeof(_ipv6h), &_ipv6h);

			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
				return NULL;

			offset += sizeof(struct ipv6hdr);

		} else if (iph->protocol != IPPROTO_TCP) {
			return NULL;
		}

	} else if (ipv6h->version == 6) {
		if (ipv6h->nexthdr != IPPROTO_TCP)
			return NULL;

		offset += sizeof(struct ipv6hdr);
	} else {
		return NULL;
	}

	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
	if (!tcph)
		return NULL;

	return skb_header_pointer(skb, offset,
				  min(__tcp_hdrlen(tcph), bufsize), buf);
}

static const void *cake_get_tcpopt(const struct tcphdr *tcph,
				   int code, int *oplen)
{
	/* inspired by tcp_parse_options in tcp_input.c */
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
	const u8 *ptr = (const u8 *)(tcph + 1);

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {
			length--;
			continue;
		}
		opsize = *ptr++;
		if (opsize < 2 || opsize > length)
			break;

		if (opcode == code) {
			*oplen = opsize;
			return ptr;
		}

		ptr += opsize - 2;
		length -= opsize;
	}

	return NULL;
}
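
/* For reference, the TCP options walked above are encoded as (kind, length,
 * data) triples, with NOP (kind 1) and EOL (kind 0) taking a single byte.
 * A SACK option (kind 5) is therefore 2 + 8*n bytes for n blocks, each block
 * being a pair of 32-bit start and end sequence numbers; these are the
 * blocks that cake_tcph_sack_compare() below walks with get_unaligned_be32().
 */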

/* Compare two SACK sequences. A sequence is considered greater if it SACKs
 * more bytes than the other. In the case where both sequences ACK bytes that
 * the other doesn't, A is considered greater. DSACKs in A also make A be
 * considered greater.
 *
 * @return -1, 0 or 1 as normal compare functions
 */
static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
				  const struct tcphdr *tcph_b)
{
	const struct tcp_sack_block_wire *sack_a, *sack_b;
	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
	u32 bytes_a = 0, bytes_b = 0;
	int oplen_a, oplen_b;
	bool first = true;

	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);

	/* pointers point to option contents */
	oplen_a -= TCPOLEN_SACK_BASE;
	oplen_b -= TCPOLEN_SACK_BASE;

	if (sack_a && oplen_a >= sizeof(*sack_a) &&
	    (!sack_b || oplen_b < sizeof(*sack_b)))
		return -1;
	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
		 (!sack_a || oplen_a < sizeof(*sack_a)))
		return 1;
	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
		 (!sack_b || oplen_b < sizeof(*sack_b)))
		return 0;

	while (oplen_a >= sizeof(*sack_a)) {
		const struct tcp_sack_block_wire *sack_tmp = sack_b;
		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
		int oplen_tmp = oplen_b;
		bool found = false;

		/* DSACK; always considered greater to prevent dropping */
		if (before(start_a, ack_seq_a))
			return -1;

		bytes_a += end_a - start_a;

		while (oplen_tmp >= sizeof(*sack_tmp)) {
			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);

			/* first time through we count the total size */
			if (first)
				bytes_b += end_b - start_b;

			if (!after(start_b, start_a) && !before(end_b, end_a)) {
				found = true;
				if (!first)
					break;
			}
			oplen_tmp -= sizeof(*sack_tmp);
			sack_tmp++;
		}

		if (!found)
			return -1;

		oplen_a -= sizeof(*sack_a);
		sack_a++;
		first = false;
	}

	/* If we made it this far, all ranges SACKed by A are covered by B, so
	 * either the SACKs are equal, or B SACKs more bytes.
	 */
	return bytes_b > bytes_a ?
		1 : 0;
}

static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
				 u32 *tsval, u32 *tsecr)
{
	const u8 *ptr;
	int opsize;

	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);

	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
		*tsval = get_unaligned_be32(ptr);
		*tsecr = get_unaligned_be32(ptr + 4);
	}
}

static bool cake_tcph_may_drop(const struct tcphdr *tcph,
			       u32 tstamp_new, u32 tsecr_new)
{
	/* inspired by tcp_parse_options in tcp_input.c */
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
	const u8 *ptr = (const u8 *)(tcph + 1);
	u32 tstamp, tsecr;

	/* 3 reserved flags must be unset to avoid future breakage
	 * ACK must be set
	 * ECE/CWR are handled separately
	 * All other flags URG/PSH/RST/SYN/FIN must be unset
	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
	 * 0x00C00000 = CWR/ECE (handled separately)
	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
	 */
	if (((tcp_flag_word(tcph) &
	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
		return false;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {
			length--;
			continue;
		}
		opsize = *ptr++;
		if (opsize < 2 || opsize > length)
			break;

		switch (opcode) {
		case TCPOPT_MD5SIG: /* doesn't influence state */
			break;

		case TCPOPT_SACK: /* stricter checking performed later */
			if (opsize % 8 != 2)
				return false;
			break;

		case TCPOPT_TIMESTAMP:
			/* only drop timestamps lower than new */
			if (opsize != TCPOLEN_TIMESTAMP)
				return false;
			tstamp = get_unaligned_be32(ptr);
			tsecr = get_unaligned_be32(ptr + 4);
			if (after(tstamp, tstamp_new) ||
			    after(tsecr, tsecr_new))
				return false;
			break;

		case TCPOPT_MSS: /* these should only be set on SYN */
		case TCPOPT_WINDOW:
		case TCPOPT_SACK_PERM:
		case TCPOPT_FASTOPEN:
		case TCPOPT_EXP:
		default: /* don't drop if any unknown options are present */
			return false;
		}

		ptr += opsize - 2;
		length -= opsize;
	}

	return true;
}
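
/* Taken together, the checks above mean that only "pure" ACKs are ever
 * eligible for filtering: the ACK flag set and all of SYN/FIN/RST/URG/PSH
 * clear, no payload, and no options other than MD5SIG, well-formed SACK
 * blocks, or a timestamp no newer than that of the ACK which triggered the
 * scan. Anything else is left in the queue untouched.
 */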

static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
				       struct cake_flow *flow)
{
	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
	struct sk_buff *skb_check, *skb_prev = NULL;
	const struct ipv6hdr *ipv6h, *ipv6h_check;
	unsigned char _tcph[64], _tcph_check[64];
	const struct tcphdr *tcph, *tcph_check;
	const struct iphdr *iph, *iph_check;
	struct ipv6hdr _iph, _iph_check;
	const struct sk_buff *skb;
	int seglen, num_found = 0;
	u32 tstamp = 0, tsecr = 0;
	__be32 elig_flags = 0;
	int sack_comp;

	/* no other possible ACKs to filter */
	if (flow->head == flow->tail)
		return NULL;

	skb = flow->tail;
	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
	iph = cake_get_iphdr(skb, &_iph);
	if (!tcph)
		return NULL;

	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);

	/* the 'triggering' packet need only have the ACK flag set.
	 * also check that SYN is not set, as there won't be any previous ACKs.
	 */
	if ((tcp_flag_word(tcph) &
	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
		return NULL;

	/* the 'triggering' ACK is at the tail of the queue, we have already
	 * returned if it is the only packet in the flow. loop through the rest
	 * of the queue looking for pure ACKs with the same 5-tuple as the
	 * triggering one.
	 */
	for (skb_check = flow->head;
	     skb_check && skb_check != skb;
	     skb_prev = skb_check, skb_check = skb_check->next) {
		iph_check = cake_get_iphdr(skb_check, &_iph_check);
		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
					     sizeof(_tcph_check));

		/* only TCP packets with matching 5-tuple are eligible, and only
		 * drop safe headers
		 */
		if (!tcph_check || iph->version != iph_check->version ||
		    tcph_check->source != tcph->source ||
		    tcph_check->dest != tcph->dest)
			continue;

		if (iph_check->version == 4) {
			if (iph_check->saddr != iph->saddr ||
			    iph_check->daddr != iph->daddr)
				continue;

			seglen = ntohs(iph_check->tot_len) -
				       (4 * iph_check->ihl);
		} else if (iph_check->version == 6) {
			ipv6h = (struct ipv6hdr *)iph;
			ipv6h_check = (struct ipv6hdr *)iph_check;

			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
				continue;

			seglen = ntohs(ipv6h_check->payload_len);
		} else {
			WARN_ON(1); /* shouldn't happen */
			continue;
		}

		/* If the ECE/CWR flags changed from the previous eligible
		 * packet in the same flow, we should no longer be dropping that
		 * previous packet as this would lose information.
		 */
		if (elig_ack && (tcp_flag_word(tcph_check) &
				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
			elig_ack = NULL;
			elig_ack_prev = NULL;
			num_found--;
		}

		/* Check TCP options and flags, don't drop ACKs with segment
		 * data, and don't drop ACKs with a higher cumulative ACK
		 * counter than the triggering packet. Check ACK seqno here to
		 * avoid parsing SACK options of packets we are going to exclude
		 * anyway.
		 */
		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
			continue;

		/* Check SACK options. The triggering packet must SACK more data
		 * than the ACK under consideration, or SACK the same range but
		 * have a larger cumulative ACK counter. The latter is a
		 * pathological case, but is contained in the following check
		 * anyway, just to be safe.
		 */
		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);

		if (sack_comp < 0 ||
		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
		     sack_comp == 0))
			continue;

		/* At this point we have found an eligible pure ACK to drop; if
		 * we are in aggressive mode, we are done. Otherwise, keep
		 * searching unless this is the second eligible ACK we
		 * found.
		 *
		 * Since we want to drop ACK closest to the head of the queue,
		 * save the first eligible ACK we find, even if we need to loop
		 * again.
		 */
		if (!elig_ack) {
			elig_ack = skb_check;
			elig_ack_prev = skb_prev;
			elig_flags = (tcp_flag_word(tcph_check)
				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
		}

		if (num_found++ > 0)
			goto found;
	}

	/* We made it through the queue without finding two eligible ACKs.
	 * If we found a single eligible ACK we can drop it in aggressive mode
	 * if we can guarantee that this does not interfere with ECN flag
	 * information. We ensure this by dropping it only if the enqueued
	 * packet is consecutive with the eligible ACK, and their flags match.
	 */
	if (elig_ack && aggressive && elig_ack->next == skb &&
	    (elig_flags == (tcp_flag_word(tcph) &
			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
		goto found;

	return NULL;

found:
	if (elig_ack_prev)
		elig_ack_prev->next = elig_ack->next;
	else
		flow->head = elig_ack->next;

	skb_mark_not_on_list(elig_ack);

	return elig_ack;
}

static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
{
	avg -= avg >> shift;
	avg += sample >> shift;
	return avg;
}

static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
{
	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
		len -= off;

	if (q->max_netlen < len)
		q->max_netlen = len;
	if (q->min_netlen > len)
		q->min_netlen = len;

	len += q->rate_overhead;

	if (len < q->rate_mpu)
		len = q->rate_mpu;

	if (q->atm_mode == CAKE_ATM_ATM) {
		len += 47;
		len /= 48;
		len *= 53;
	} else if (q->atm_mode == CAKE_ATM_PTM) {
		/* Add one byte per 64 bytes or part thereof.
		 * This is conservative and easier to calculate than the
		 * precise value.
		 */
		len += (len + 63) / 64;
	}

	if (q->max_adjlen < len)
		q->max_adjlen = len;
	if (q->min_adjlen > len)
		q->min_adjlen = len;

	return len;
}

static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	unsigned int hdr_len, last_len = 0;
	u32 off = skb_network_offset(skb);
	u32 len = qdisc_pkt_len(skb);
	u16 segs = 1;

	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);

	if (!shinfo->gso_size)
		return cake_calc_overhead(q, len, off);

	/* borrowed from qdisc_pkt_len_init() */
	hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

	/* + transport layer */
	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
				       SKB_GSO_TCPV6))) {
		const struct tcphdr *th;
		struct tcphdr _tcphdr;

		th = skb_header_pointer(skb, skb_transport_offset(skb),
					sizeof(_tcphdr), &_tcphdr);
		if (likely(th))
			hdr_len += __tcp_hdrlen(th);
	} else {
		struct udphdr _udphdr;

		if (skb_header_pointer(skb, skb_transport_offset(skb),
				       sizeof(_udphdr), &_udphdr))
			hdr_len += sizeof(struct udphdr);
	}

	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
		segs = DIV_ROUND_UP(skb->len - hdr_len,
				    shinfo->gso_size);
	else
		segs = shinfo->gso_segs;

	len = shinfo->gso_size + hdr_len;
	last_len = skb->len - shinfo->gso_size * (segs - 1);

	return (cake_calc_overhead(q, len, off) * (segs - 1) +
		cake_calc_overhead(q, last_len, off));
}
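
/* To make the overhead compensation above concrete: a packet that comes to
 * 1518 bytes once the configured rate_overhead has been added is rounded up
 * to whole 53-byte ATM cells in ATM mode, (1518 + 47) / 48 = 32 cells, so
 * 1696 bytes are charged to the shaper; in PTM mode the same packet is
 * charged 1518 + (1518 + 63) / 64 = 1542 bytes. GSO super-packets are costed
 * as (segs - 1) full-size segments plus the (usually shorter) last segment.
 */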

static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
{
	struct cake_heap_entry ii = q->overflow_heap[i];
	struct cake_heap_entry jj = q->overflow_heap[j];

	q->overflow_heap[i] = jj;
	q->overflow_heap[j] = ii;

	q->tins[ii.t].overflow_idx[ii.b] = j;
	q->tins[jj.t].overflow_idx[jj.b] = i;
}

static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
{
	struct cake_heap_entry ii = q->overflow_heap[i];

	return q->tins[ii.t].backlogs[ii.b];
}

static void cake_heapify(struct cake_sched_data *q, u16 i)
{
	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
	u32 mb = cake_heap_get_backlog(q, i);
	u32 m = i;

	while (m < a) {
		u32 l = m + m + 1;
		u32 r = l + 1;

		if (l < a) {
			u32 lb = cake_heap_get_backlog(q, l);

			if (lb > mb) {
				m = l;
				mb = lb;
			}
		}

		if (r < a) {
			u32 rb = cake_heap_get_backlog(q, r);

			if (rb > mb) {
				m = r;
				mb = rb;
			}
		}

		if (m != i) {
			cake_heap_swap(q, i, m);
			i = m;
		} else {
			break;
		}
	}
}

static void cake_heapify_up(struct cake_sched_data *q, u16 i)
{
	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
		u16 p = (i - 1) >> 1;
		u32 ib = cake_heap_get_backlog(q, i);
		u32 pb = cake_heap_get_backlog(q, p);

		if (ib > pb) {
			cake_heap_swap(q, i, p);
			i = p;
		} else {
			break;
		}
	}
}

static int cake_advance_shaper(struct cake_sched_data *q,
			       struct cake_tin_data *b,
			       struct sk_buff *skb,
			       ktime_t now, bool drop)
{
	u32 len = get_cobalt_cb(skb)->adjusted_len;

	/* charge packet bandwidth to this tin
	 * and to the global shaper.
	 */
	if (q->rate_ns) {
		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
		u64 failsafe_dur = global_dur + (global_dur >> 1);

		if (ktime_before(b->time_next_packet, now))
			b->time_next_packet = ktime_add_ns(b->time_next_packet,
							   tin_dur);

		else if (ktime_before(b->time_next_packet,
				      ktime_add_ns(now, tin_dur)))
			b->time_next_packet = ktime_add_ns(now, tin_dur);

		q->time_next_packet = ktime_add_ns(q->time_next_packet,
						   global_dur);
		if (!drop)
			q->failsafe_next_packet = \
				ktime_add_ns(q->failsafe_next_packet,
					     failsafe_dur);
	}
	return len;
}

static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	ktime_t now = ktime_get();
	u32 idx = 0, tin = 0, len;
	struct cake_heap_entry qq;
	struct cake_tin_data *b;
	struct cake_flow *flow;
	struct sk_buff *skb;

	if (!q->overflow_timeout) {
		int i;
		/* Build fresh max-heap */
		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
			cake_heapify(q, i);
	}
	q->overflow_timeout = 65535;

	/* select longest queue for pruning */
	qq = q->overflow_heap[0];
	tin = qq.t;
	idx = qq.b;

	b = &q->tins[tin];
	flow = &b->flows[idx];
	skb = dequeue_head(flow);
	if (unlikely(!skb)) {
		/* heap has gone wrong, rebuild it next time */
		q->overflow_timeout = 0;
		return idx + (tin << 16);
	}

	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
		b->unresponsive_flow_count++;

	len = qdisc_pkt_len(skb);
	q->buffer_used -= skb->truesize;
	b->backlogs[idx] -= len;
	b->tin_backlog -= len;
	sch->qstats.backlog -= len;
	qdisc_tree_reduce_backlog(sch, 1, len);

	flow->dropped++;
	b->tin_dropped++;
	sch->qstats.drops++;

	if (q->rate_flags & CAKE_FLAG_INGRESS)
		cake_advance_shaper(q, b, skb, now, true);

	__qdisc_drop(skb, to_free);
	sch->q.qlen--;

	cake_heapify(q, 0);

	return idx + (tin << 16);
}

static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
{
	int wlen = skb_network_offset(skb);
	u8 dscp;

	switch (tc_skb_protocol(skb)) {
	case htons(ETH_P_IP):
		wlen += sizeof(struct iphdr);
		if (!pskb_may_pull(skb, wlen) ||
		    skb_try_make_writable(skb, wlen))
			return 0;

		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
		if (wash && dscp)
			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
		return dscp;

	case htons(ETH_P_IPV6):
		wlen += sizeof(struct ipv6hdr);
		if (!pskb_may_pull(skb, wlen) ||
		    skb_try_make_writable(skb, wlen))
			return 0;

		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
		if (wash && dscp)
			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
		return dscp;

	case htons(ETH_P_ARP):
		return 0x38; /* CS7 - Net Control */

	default:
		/* If there is no Diffserv field, treat as best-effort */
		return 0;
	}
}

static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
					     struct sk_buff *skb)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	u32 tin, mark;
	u8 dscp;

	/* Tin selection: Default to diffserv-based selection, allow overriding
	 * using firewall marks or skb->priority.
	 */
	dscp = cake_handle_diffserv(skb,
				    q->rate_flags & CAKE_FLAG_WASH);
	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;

	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
		tin = 0;

	else if (mark && mark <= q->tin_cnt)
		tin = q->tin_order[mark - 1];

	else if (TC_H_MAJ(skb->priority) == sch->handle &&
		 TC_H_MIN(skb->priority) > 0 &&
		 TC_H_MIN(skb->priority) <= q->tin_cnt)
		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];

	else {
		tin = q->tin_index[dscp];

		if (unlikely(tin >= q->tin_cnt))
			tin = 0;
	}

	return &q->tins[tin];
}

static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
			 struct sk_buff *skb, int flow_mode, int *qerr)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct tcf_proto *filter;
	struct tcf_result res;
	u16 flow = 0, host = 0;
	int result;

	filter = rcu_dereference_bh(q->filter_list);
	if (!filter)
		goto hash;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, filter, &res, false);

	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
			flow = TC_H_MIN(res.classid);
		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
			host = TC_H_MAJ(res.classid) >> 16;
	}
hash:
	*t = cake_select_tin(sch, skb);
	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
}

static void cake_reconfigure(struct Qdisc *sch);
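
/* In other words, the precedence for picking a tin in cake_select_tin() is:
 * besteffort mode forces tin 0; otherwise a firewall mark (masked and shifted
 * by the configured fwmark parameters) wins if it names a valid tin, then a
 * matching skb->priority minor number, and finally the DSCP lookup table.
 * With the three-tin diffserv3 mapping, for example, a packet carrying
 * fwmark 2 would be steered into the second entry of tin_order regardless of
 * its DSCP.
 */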

static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			struct sk_buff **to_free)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	int len = qdisc_pkt_len(skb);
	int uninitialized_var(ret);
	struct sk_buff *ack = NULL;
	ktime_t now = ktime_get();
	struct cake_tin_data *b;
	struct cake_flow *flow;
	u32 idx;

	/* choose flow to insert into */
	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
	if (idx == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	idx--;
	flow = &b->flows[idx];

	/* ensure shaper state isn't stale */
	if (!b->tin_backlog) {
		if (ktime_before(b->time_next_packet, now))
			b->time_next_packet = now;

		if (!sch->q.qlen) {
			if (ktime_before(q->time_next_packet, now)) {
				q->failsafe_next_packet = now;
				q->time_next_packet = now;
			} else if (ktime_after(q->time_next_packet, now) &&
				   ktime_after(q->failsafe_next_packet, now)) {
				u64 next = \
					min(ktime_to_ns(q->time_next_packet),
					    ktime_to_ns(
						   q->failsafe_next_packet));
				sch->qstats.overlimits++;
				qdisc_watchdog_schedule_ns(&q->watchdog, next);
			}
		}
	}

	if (unlikely(len > b->max_skblen))
		b->max_skblen = len;

	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
		struct sk_buff *segs, *nskb;
		netdev_features_t features = netif_skb_features(skb);
		unsigned int slen = 0, numsegs = 0;

		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
		if (IS_ERR_OR_NULL(segs))
			return qdisc_drop(skb, sch, to_free);

		while (segs) {
			nskb = segs->next;
			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			cobalt_set_enqueue_time(segs, now);
			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
									  segs);
			flow_queue_add(flow, segs);

			sch->q.qlen++;
			numsegs++;
			slen += segs->len;
			q->buffer_used += segs->truesize;
			b->packets++;
			segs = nskb;
		}

		/* stats */
		b->bytes += slen;
		b->backlogs[idx] += slen;
		b->tin_backlog += slen;
		sch->qstats.backlog += slen;
		q->avg_window_bytes += slen;

		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
		consume_skb(skb);
	} else {
		/* not splitting */
		cobalt_set_enqueue_time(skb, now);
		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
		flow_queue_add(flow, skb);

		if (q->ack_filter)
			ack = cake_ack_filter(q, flow);

		if (ack) {
			b->ack_drops++;
			sch->qstats.drops++;
			b->bytes += qdisc_pkt_len(ack);
			len -= qdisc_pkt_len(ack);
			q->buffer_used += skb->truesize - ack->truesize;
			if (q->rate_flags & CAKE_FLAG_INGRESS)
				cake_advance_shaper(q, b, ack, now, true);

			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
			consume_skb(ack);
		} else {
			sch->q.qlen++;
			q->buffer_used += skb->truesize;
		}

		/* stats */
		b->packets++;
		b->bytes += len;
		b->backlogs[idx] += len;
		b->tin_backlog += len;
		sch->qstats.backlog += len;
		q->avg_window_bytes += len;
	}

	if (q->overflow_timeout)
		cake_heapify_up(q, b->overflow_idx[idx]);

	/* incoming bandwidth capacity estimate */
	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
		u64 packet_interval = \
			ktime_to_ns(ktime_sub(now, q->last_packet_time));

		if (packet_interval > NSEC_PER_SEC)
			packet_interval = NSEC_PER_SEC;

		/* filter out short-term bursts, eg. wifi aggregation */
		q->avg_packet_interval = \
			cake_ewma(q->avg_packet_interval,
				  packet_interval,
				  (packet_interval > q->avg_packet_interval ?
					  2 : 8));

		q->last_packet_time = now;

		if (packet_interval > q->avg_packet_interval) {
			u64 window_interval = \
				ktime_to_ns(ktime_sub(now,
						      q->avg_window_begin));
			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;

			do_div(b, window_interval);
			q->avg_peak_bandwidth =
				cake_ewma(q->avg_peak_bandwidth, b,
					  b > q->avg_peak_bandwidth ? 2 : 8);
			q->avg_window_bytes = 0;
			q->avg_window_begin = now;

			if (ktime_after(now,
					ktime_add_ms(q->last_reconfig_time,
						     250))) {
				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
				cake_reconfigure(sch);
			}
		}
	} else {
		q->avg_window_bytes = 0;
		q->last_packet_time = now;
	}

	/* flowchain */
	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
		struct cake_host *srchost = &b->hosts[flow->srchost];
		struct cake_host *dsthost = &b->hosts[flow->dsthost];
		u16 host_load = 1;

		if (!flow->set) {
			list_add_tail(&flow->flowchain, &b->new_flows);
		} else {
			b->decaying_flow_count--;
			list_move_tail(&flow->flowchain, &b->new_flows);
		}
		flow->set = CAKE_SET_SPARSE;
		b->sparse_flow_count++;

		if (cake_dsrc(q->flow_mode))
			host_load = max(host_load, srchost->srchost_bulk_flow_count);

		if (cake_ddst(q->flow_mode))
			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);

		flow->deficit = (b->flow_quantum *
				 quantum_div[host_load]) >> 16;
	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
		struct cake_host *srchost = &b->hosts[flow->srchost];
		struct cake_host *dsthost = &b->hosts[flow->dsthost];

		/* this flow was empty, accounted as a sparse flow, but actually
		 * in the bulk rotation.
		 */
		flow->set = CAKE_SET_BULK;
		b->sparse_flow_count--;
		b->bulk_flow_count++;

		if (cake_dsrc(q->flow_mode))
			srchost->srchost_bulk_flow_count++;

		if (cake_ddst(q->flow_mode))
			dsthost->dsthost_bulk_flow_count++;

	}

	if (q->buffer_used > q->buffer_max_used)
		q->buffer_max_used = q->buffer_used;

	if (q->buffer_used > q->buffer_limit) {
		u32 dropped = 0;

		while (q->buffer_used > q->buffer_limit) {
			dropped++;
			cake_drop(sch, to_free);
		}
		b->drop_overlimit += dropped;
	}
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct cake_tin_data *b = &q->tins[q->cur_tin];
	struct cake_flow *flow = &b->flows[q->cur_flow];
	struct sk_buff *skb = NULL;
	u32 len;

	if (flow->head) {
		skb = dequeue_head(flow);
		len = qdisc_pkt_len(skb);
		b->backlogs[q->cur_flow] -= len;
		b->tin_backlog -= len;
		sch->qstats.backlog -= len;
		q->buffer_used -= skb->truesize;
		sch->q.qlen--;

		if (q->overflow_timeout)
			cake_heapify(q, b->overflow_idx[q->cur_flow]);
	}
	return skb;
}

/* Discard leftover packets from a tin no longer in use.
 */
static void cake_clear_tin(struct Qdisc *sch, u16 tin)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	q->cur_tin = tin;
	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
		while (!!(skb = cake_dequeue_one(sch)))
			kfree_skb(skb);
}

static struct sk_buff *cake_dequeue(struct Qdisc *sch)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct cake_tin_data *b = &q->tins[q->cur_tin];
	struct cake_host *srchost, *dsthost;
	ktime_t now = ktime_get();
	struct cake_flow *flow;
	struct list_head *head;
	bool first_flow = true;
	struct sk_buff *skb;
	u16 host_load;
	u64 delay;
	u32 len;

begin:
	if (!sch->q.qlen)
		return NULL;

	/* global hard shaper */
	if (ktime_after(q->time_next_packet, now) &&
	    ktime_after(q->failsafe_next_packet, now)) {
		u64 next = min(ktime_to_ns(q->time_next_packet),
			       ktime_to_ns(q->failsafe_next_packet));

		sch->qstats.overlimits++;
		qdisc_watchdog_schedule_ns(&q->watchdog, next);
		return NULL;
	}

	/* Choose a class to work on. */
	if (!q->rate_ns) {
		/* In unlimited mode, can't rely on shaper timings, just balance
		 * with DRR
		 */
		bool wrapped = false, empty = true;

		while (b->tin_deficit < 0 ||
		       !(b->sparse_flow_count + b->bulk_flow_count)) {
			if (b->tin_deficit <= 0)
				b->tin_deficit += b->tin_quantum;
			if (b->sparse_flow_count + b->bulk_flow_count)
				empty = false;

			q->cur_tin++;
			b++;
			if (q->cur_tin >= q->tin_cnt) {
				q->cur_tin = 0;
				b = q->tins;

				if (wrapped) {
					/* It's possible for q->qlen to be
					 * nonzero when we actually have no
					 * packets anywhere.
					 */
					if (empty)
						return NULL;
				} else {
					wrapped = true;
				}
			}
		}
	} else {
		/* In shaped mode, choose:
		 * - Highest-priority tin with queue and meeting schedule, or
		 * - The earliest-scheduled tin with queue.
		 */
		ktime_t best_time = KTIME_MAX;
		int tin, best_tin = 0;

		for (tin = 0; tin < q->tin_cnt; tin++) {
			b = q->tins + tin;
			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
				ktime_t time_to_pkt = \
					ktime_sub(b->time_next_packet, now);

				if (ktime_to_ns(time_to_pkt) <= 0 ||
				    ktime_compare(time_to_pkt,
						  best_time) <= 0) {
					best_time = time_to_pkt;
					best_tin = tin;
				}
			}
		}

		q->cur_tin = best_tin;
		b = q->tins + best_tin;

		/* No point in going further if no packets to deliver. */
		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
			return NULL;
	}

retry:
	/* service this class */
	head = &b->decaying_flows;
	if (!first_flow || list_empty(head)) {
		head = &b->new_flows;
		if (list_empty(head)) {
			head = &b->old_flows;
			if (unlikely(list_empty(head))) {
				head = &b->decaying_flows;
				if (unlikely(list_empty(head)))
					goto begin;
			}
		}
	}
	flow = list_first_entry(head, struct cake_flow, flowchain);
	q->cur_flow = flow - b->flows;
	first_flow = false;

	/* triple isolation (modified DRR++) */
	srchost = &b->hosts[flow->srchost];
	dsthost = &b->hosts[flow->dsthost];
	host_load = 1;
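
	/* host_load will end up as the bulk-flow count of the busier of this
	 * flow's source and destination hosts (in the dual/triple isolation
	 * modes), and the deficit refill below scales the flow quantum down
	 * by that count via quantum_div[] (assumed to be initialised to
	 * 65535 / n elsewhere in this file), which is what shares a host's
	 * bandwidth evenly across its bulk flows.
	 */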
No non-empty flow can go into the decaying 2001 * rotation, so they can't get deficits 2002 */ 2003 if (flow->set == CAKE_SET_SPARSE) { 2004 if (flow->head) { 2005 b->sparse_flow_count--; 2006 b->bulk_flow_count++; 2007 2008 if (cake_dsrc(q->flow_mode)) 2009 srchost->srchost_bulk_flow_count++; 2010 2011 if (cake_ddst(q->flow_mode)) 2012 dsthost->dsthost_bulk_flow_count++; 2013 2014 flow->set = CAKE_SET_BULK; 2015 } else { 2016 /* we've moved it to the bulk rotation for 2017 * correct deficit accounting but we still want 2018 * to count it as a sparse flow, not a bulk one. 2019 */ 2020 flow->set = CAKE_SET_SPARSE_WAIT; 2021 } 2022 } 2023 2024 if (cake_dsrc(q->flow_mode)) 2025 host_load = max(host_load, srchost->srchost_bulk_flow_count); 2026 2027 if (cake_ddst(q->flow_mode)) 2028 host_load = max(host_load, dsthost->dsthost_bulk_flow_count); 2029 2030 WARN_ON(host_load > CAKE_QUEUES); 2031 2032 /* The shifted prandom_u32() is a way to apply dithering to 2033 * avoid accumulating roundoff errors 2034 */ 2035 flow->deficit += (b->flow_quantum * quantum_div[host_load] + 2036 (prandom_u32() >> 16)) >> 16; 2037 list_move_tail(&flow->flowchain, &b->old_flows); 2038 2039 goto retry; 2040 } 2041 2042 /* Retrieve a packet via the AQM */ 2043 while (1) { 2044 skb = cake_dequeue_one(sch); 2045 if (!skb) { 2046 /* this queue was actually empty */ 2047 if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) 2048 b->unresponsive_flow_count--; 2049 2050 if (flow->cvars.p_drop || flow->cvars.count || 2051 ktime_before(now, flow->cvars.drop_next)) { 2052 /* keep in the flowchain until the state has 2053 * decayed to rest 2054 */ 2055 list_move_tail(&flow->flowchain, 2056 &b->decaying_flows); 2057 if (flow->set == CAKE_SET_BULK) { 2058 b->bulk_flow_count--; 2059 2060 if (cake_dsrc(q->flow_mode)) 2061 srchost->srchost_bulk_flow_count--; 2062 2063 if (cake_ddst(q->flow_mode)) 2064 dsthost->dsthost_bulk_flow_count--; 2065 2066 b->decaying_flow_count++; 2067 } else if (flow->set == CAKE_SET_SPARSE || 2068 flow->set == CAKE_SET_SPARSE_WAIT) { 2069 b->sparse_flow_count--; 2070 b->decaying_flow_count++; 2071 } 2072 flow->set = CAKE_SET_DECAYING; 2073 } else { 2074 /* remove empty queue from the flowchain */ 2075 list_del_init(&flow->flowchain); 2076 if (flow->set == CAKE_SET_SPARSE || 2077 flow->set == CAKE_SET_SPARSE_WAIT) 2078 b->sparse_flow_count--; 2079 else if (flow->set == CAKE_SET_BULK) { 2080 b->bulk_flow_count--; 2081 2082 if (cake_dsrc(q->flow_mode)) 2083 srchost->srchost_bulk_flow_count--; 2084 2085 if (cake_ddst(q->flow_mode)) 2086 dsthost->dsthost_bulk_flow_count--; 2087 2088 } else 2089 b->decaying_flow_count--; 2090 2091 flow->set = CAKE_SET_NONE; 2092 } 2093 goto begin; 2094 } 2095 2096 /* Last packet in queue may be marked, shouldn't be dropped */ 2097 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, 2098 (b->bulk_flow_count * 2099 !!(q->rate_flags & 2100 CAKE_FLAG_INGRESS))) || 2101 !flow->head) 2102 break; 2103 2104 /* drop this packet, get another one */ 2105 if (q->rate_flags & CAKE_FLAG_INGRESS) { 2106 len = cake_advance_shaper(q, b, skb, 2107 now, true); 2108 flow->deficit -= len; 2109 b->tin_deficit -= len; 2110 } 2111 flow->dropped++; 2112 b->tin_dropped++; 2113 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); 2114 qdisc_qstats_drop(sch); 2115 kfree_skb(skb); 2116 if (q->rate_flags & CAKE_FLAG_INGRESS) 2117 goto retry; 2118 } 2119 2120 b->tin_ecn_mark += !!flow->cvars.ecn_marked; 2121 qdisc_bstats_update(sch, skb); 2122 2123 /* collect delay stats */ 2124 delay = 
ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 2125 b->avge_delay = cake_ewma(b->avge_delay, delay, 8); 2126 b->peak_delay = cake_ewma(b->peak_delay, delay, 2127 delay > b->peak_delay ? 2 : 8); 2128 b->base_delay = cake_ewma(b->base_delay, delay, 2129 delay < b->base_delay ? 2 : 8); 2130 2131 len = cake_advance_shaper(q, b, skb, now, false); 2132 flow->deficit -= len; 2133 b->tin_deficit -= len; 2134 2135 if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { 2136 u64 next = min(ktime_to_ns(q->time_next_packet), 2137 ktime_to_ns(q->failsafe_next_packet)); 2138 2139 qdisc_watchdog_schedule_ns(&q->watchdog, next); 2140 } else if (!sch->q.qlen) { 2141 int i; 2142 2143 for (i = 0; i < q->tin_cnt; i++) { 2144 if (q->tins[i].decaying_flow_count) { 2145 ktime_t next = \ 2146 ktime_add_ns(now, 2147 q->tins[i].cparams.target); 2148 2149 qdisc_watchdog_schedule_ns(&q->watchdog, 2150 ktime_to_ns(next)); 2151 break; 2152 } 2153 } 2154 } 2155 2156 if (q->overflow_timeout) 2157 q->overflow_timeout--; 2158 2159 return skb; 2160 } 2161 2162 static void cake_reset(struct Qdisc *sch) 2163 { 2164 u32 c; 2165 2166 for (c = 0; c < CAKE_MAX_TINS; c++) 2167 cake_clear_tin(sch, c); 2168 } 2169 2170 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { 2171 [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, 2172 [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, 2173 [TCA_CAKE_ATM] = { .type = NLA_U32 }, 2174 [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, 2175 [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, 2176 [TCA_CAKE_RTT] = { .type = NLA_U32 }, 2177 [TCA_CAKE_TARGET] = { .type = NLA_U32 }, 2178 [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, 2179 [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, 2180 [TCA_CAKE_NAT] = { .type = NLA_U32 }, 2181 [TCA_CAKE_RAW] = { .type = NLA_U32 }, 2182 [TCA_CAKE_WASH] = { .type = NLA_U32 }, 2183 [TCA_CAKE_MPU] = { .type = NLA_U32 }, 2184 [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, 2185 [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, 2186 [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, 2187 [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, 2188 }; 2189 2190 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, 2191 u64 target_ns, u64 rtt_est_ns) 2192 { 2193 /* convert byte-rate into time-per-byte 2194 * so it will always unwedge in reasonable time. 2195 */ 2196 static const u64 MIN_RATE = 64; 2197 u32 byte_target = mtu; 2198 u64 byte_target_ns; 2199 u8 rate_shft = 0; 2200 u64 rate_ns = 0; 2201 2202 b->flow_quantum = 1514; 2203 if (rate) { 2204 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); 2205 rate_shft = 34; 2206 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; 2207 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); 2208 while (!!(rate_ns >> 34)) { 2209 rate_ns >>= 1; 2210 rate_shft--; 2211 } 2212 } /* else unlimited, ie. 
zero delay */ 2213 2214 b->tin_rate_bps = rate; 2215 b->tin_rate_ns = rate_ns; 2216 b->tin_rate_shft = rate_shft; 2217 2218 byte_target_ns = (byte_target * rate_ns) >> rate_shft; 2219 2220 b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); 2221 b->cparams.interval = max(rtt_est_ns + 2222 b->cparams.target - target_ns, 2223 b->cparams.target * 2); 2224 b->cparams.mtu_time = byte_target_ns; 2225 b->cparams.p_inc = 1 << 24; /* 1/256 */ 2226 b->cparams.p_dec = 1 << 20; /* 1/4096 */ 2227 } 2228 2229 static int cake_config_besteffort(struct Qdisc *sch) 2230 { 2231 struct cake_sched_data *q = qdisc_priv(sch); 2232 struct cake_tin_data *b = &q->tins[0]; 2233 u32 mtu = psched_mtu(qdisc_dev(sch)); 2234 u64 rate = q->rate_bps; 2235 2236 q->tin_cnt = 1; 2237 2238 q->tin_index = besteffort; 2239 q->tin_order = normal_order; 2240 2241 cake_set_rate(b, rate, mtu, 2242 us_to_ns(q->target), us_to_ns(q->interval)); 2243 b->tin_quantum = 65535; 2244 2245 return 0; 2246 } 2247 2248 static int cake_config_precedence(struct Qdisc *sch) 2249 { 2250 /* convert high-level (user visible) parameters into internal format */ 2251 struct cake_sched_data *q = qdisc_priv(sch); 2252 u32 mtu = psched_mtu(qdisc_dev(sch)); 2253 u64 rate = q->rate_bps; 2254 u32 quantum = 256; 2255 u32 i; 2256 2257 q->tin_cnt = 8; 2258 q->tin_index = precedence; 2259 q->tin_order = normal_order; 2260 2261 for (i = 0; i < q->tin_cnt; i++) { 2262 struct cake_tin_data *b = &q->tins[i]; 2263 2264 cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2265 us_to_ns(q->interval)); 2266 2267 b->tin_quantum = max_t(u16, 1U, quantum); 2268 2269 /* calculate next class's parameters */ 2270 rate *= 7; 2271 rate >>= 3; 2272 2273 quantum *= 7; 2274 quantum >>= 3; 2275 } 2276 2277 return 0; 2278 } 2279 2280 /* List of known Diffserv codepoints: 2281 * 2282 * Least Effort (CS1) 2283 * Best Effort (CS0) 2284 * Max Reliability & LLT "Lo" (TOS1) 2285 * Max Throughput (TOS2) 2286 * Min Delay (TOS4) 2287 * LLT "La" (TOS5) 2288 * Assured Forwarding 1 (AF1x) - x3 2289 * Assured Forwarding 2 (AF2x) - x3 2290 * Assured Forwarding 3 (AF3x) - x3 2291 * Assured Forwarding 4 (AF4x) - x3 2292 * Precedence Class 2 (CS2) 2293 * Precedence Class 3 (CS3) 2294 * Precedence Class 4 (CS4) 2295 * Precedence Class 5 (CS5) 2296 * Precedence Class 6 (CS6) 2297 * Precedence Class 7 (CS7) 2298 * Voice Admit (VA) 2299 * Expedited Forwarding (EF) 2300 2301 * Total 25 codepoints. 2302 */ 2303 2304 /* List of traffic classes in RFC 4594: 2305 * (roughly descending order of contended priority) 2306 * (roughly ascending order of uncontended throughput) 2307 * 2308 * Network Control (CS6,CS7) - routing traffic 2309 * Telephony (EF,VA) - aka. VoIP streams 2310 * Signalling (CS5) - VoIP setup 2311 * Multimedia Conferencing (AF4x) - aka. video calls 2312 * Realtime Interactive (CS4) - eg. games 2313 * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch 2314 * Broadcast Video (CS3) 2315 * Low Latency Data (AF2x,TOS4) - eg. database 2316 * Ops, Admin, Management (CS2,TOS1) - eg. ssh 2317 * Standard Service (CS0 & unrecognised codepoints) 2318 * High Throughput Data (AF1x,TOS2) - eg. web traffic 2319 * Low Priority Data (CS1) - eg. BitTorrent 2320 2321 * Total 12 traffic classes. 
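 *
 * The cake_config_diffserv8/4/3() functions below fold these twelve classes
 * into eight, four or three tins respectively; codepoints not listed in a
 * given mapping fall through to the best-effort tin.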
2322 */ 2323 2324 static int cake_config_diffserv8(struct Qdisc *sch) 2325 { 2326 /* Pruned list of traffic classes for typical applications: 2327 * 2328 * Network Control (CS6, CS7) 2329 * Minimum Latency (EF, VA, CS5, CS4) 2330 * Interactive Shell (CS2, TOS1) 2331 * Low Latency Transactions (AF2x, TOS4) 2332 * Video Streaming (AF4x, AF3x, CS3) 2333 * Bog Standard (CS0 etc.) 2334 * High Throughput (AF1x, TOS2) 2335 * Background Traffic (CS1) 2336 * 2337 * Total 8 traffic classes. 2338 */ 2339 2340 struct cake_sched_data *q = qdisc_priv(sch); 2341 u32 mtu = psched_mtu(qdisc_dev(sch)); 2342 u64 rate = q->rate_bps; 2343 u32 quantum = 256; 2344 u32 i; 2345 2346 q->tin_cnt = 8; 2347 2348 /* codepoint to class mapping */ 2349 q->tin_index = diffserv8; 2350 q->tin_order = normal_order; 2351 2352 /* class characteristics */ 2353 for (i = 0; i < q->tin_cnt; i++) { 2354 struct cake_tin_data *b = &q->tins[i]; 2355 2356 cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2357 us_to_ns(q->interval)); 2358 2359 b->tin_quantum = max_t(u16, 1U, quantum); 2360 2361 /* calculate next class's parameters */ 2362 rate *= 7; 2363 rate >>= 3; 2364 2365 quantum *= 7; 2366 quantum >>= 3; 2367 } 2368 2369 return 0; 2370 } 2371 2372 static int cake_config_diffserv4(struct Qdisc *sch) 2373 { 2374 /* Further pruned list of traffic classes for four-class system: 2375 * 2376 * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) 2377 * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) 2378 * Best Effort (CS0, AF1x, TOS2, and those not specified) 2379 * Background Traffic (CS1) 2380 * 2381 * Total 4 traffic classes. 2382 */ 2383 2384 struct cake_sched_data *q = qdisc_priv(sch); 2385 u32 mtu = psched_mtu(qdisc_dev(sch)); 2386 u64 rate = q->rate_bps; 2387 u32 quantum = 1024; 2388 2389 q->tin_cnt = 4; 2390 2391 /* codepoint to class mapping */ 2392 q->tin_index = diffserv4; 2393 q->tin_order = bulk_order; 2394 2395 /* class characteristics */ 2396 cake_set_rate(&q->tins[0], rate, mtu, 2397 us_to_ns(q->target), us_to_ns(q->interval)); 2398 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2399 us_to_ns(q->target), us_to_ns(q->interval)); 2400 cake_set_rate(&q->tins[2], rate >> 1, mtu, 2401 us_to_ns(q->target), us_to_ns(q->interval)); 2402 cake_set_rate(&q->tins[3], rate >> 2, mtu, 2403 us_to_ns(q->target), us_to_ns(q->interval)); 2404 2405 /* bandwidth-sharing weights */ 2406 q->tins[0].tin_quantum = quantum; 2407 q->tins[1].tin_quantum = quantum >> 4; 2408 q->tins[2].tin_quantum = quantum >> 1; 2409 q->tins[3].tin_quantum = quantum >> 2; 2410 2411 return 0; 2412 } 2413 2414 static int cake_config_diffserv3(struct Qdisc *sch) 2415 { 2416 /* Simplified Diffserv structure with 3 tins. 
2417 * Low Priority (CS1) 2418 * Best Effort 2419 * Latency Sensitive (TOS4, VA, EF, CS6, CS7) 2420 */ 2421 struct cake_sched_data *q = qdisc_priv(sch); 2422 u32 mtu = psched_mtu(qdisc_dev(sch)); 2423 u64 rate = q->rate_bps; 2424 u32 quantum = 1024; 2425 2426 q->tin_cnt = 3; 2427 2428 /* codepoint to class mapping */ 2429 q->tin_index = diffserv3; 2430 q->tin_order = bulk_order; 2431 2432 /* class characteristics */ 2433 cake_set_rate(&q->tins[0], rate, mtu, 2434 us_to_ns(q->target), us_to_ns(q->interval)); 2435 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2436 us_to_ns(q->target), us_to_ns(q->interval)); 2437 cake_set_rate(&q->tins[2], rate >> 2, mtu, 2438 us_to_ns(q->target), us_to_ns(q->interval)); 2439 2440 /* bandwidth-sharing weights */ 2441 q->tins[0].tin_quantum = quantum; 2442 q->tins[1].tin_quantum = quantum >> 4; 2443 q->tins[2].tin_quantum = quantum >> 2; 2444 2445 return 0; 2446 } 2447 2448 static void cake_reconfigure(struct Qdisc *sch) 2449 { 2450 struct cake_sched_data *q = qdisc_priv(sch); 2451 int c, ft; 2452 2453 switch (q->tin_mode) { 2454 case CAKE_DIFFSERV_BESTEFFORT: 2455 ft = cake_config_besteffort(sch); 2456 break; 2457 2458 case CAKE_DIFFSERV_PRECEDENCE: 2459 ft = cake_config_precedence(sch); 2460 break; 2461 2462 case CAKE_DIFFSERV_DIFFSERV8: 2463 ft = cake_config_diffserv8(sch); 2464 break; 2465 2466 case CAKE_DIFFSERV_DIFFSERV4: 2467 ft = cake_config_diffserv4(sch); 2468 break; 2469 2470 case CAKE_DIFFSERV_DIFFSERV3: 2471 default: 2472 ft = cake_config_diffserv3(sch); 2473 break; 2474 } 2475 2476 for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { 2477 cake_clear_tin(sch, c); 2478 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; 2479 } 2480 2481 q->rate_ns = q->tins[ft].tin_rate_ns; 2482 q->rate_shft = q->tins[ft].tin_rate_shft; 2483 2484 if (q->buffer_config_limit) { 2485 q->buffer_limit = q->buffer_config_limit; 2486 } else if (q->rate_bps) { 2487 u64 t = q->rate_bps * q->interval; 2488 2489 do_div(t, USEC_PER_SEC / 4); 2490 q->buffer_limit = max_t(u32, t, 4U << 20); 2491 } else { 2492 q->buffer_limit = ~0; 2493 } 2494 2495 sch->flags &= ~TCQ_F_CAN_BYPASS; 2496 2497 q->buffer_limit = min(q->buffer_limit, 2498 max(sch->limit * psched_mtu(qdisc_dev(sch)), 2499 q->buffer_config_limit)); 2500 } 2501 2502 static int cake_change(struct Qdisc *sch, struct nlattr *opt, 2503 struct netlink_ext_ack *extack) 2504 { 2505 struct cake_sched_data *q = qdisc_priv(sch); 2506 struct nlattr *tb[TCA_CAKE_MAX + 1]; 2507 int err; 2508 2509 if (!opt) 2510 return -EINVAL; 2511 2512 err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, 2513 extack); 2514 if (err < 0) 2515 return err; 2516 2517 if (tb[TCA_CAKE_NAT]) { 2518 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 2519 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; 2520 q->flow_mode |= CAKE_FLOW_NAT_FLAG * 2521 !!nla_get_u32(tb[TCA_CAKE_NAT]); 2522 #else 2523 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], 2524 "No conntrack support in kernel"); 2525 return -EOPNOTSUPP; 2526 #endif 2527 } 2528 2529 if (tb[TCA_CAKE_BASE_RATE64]) 2530 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); 2531 2532 if (tb[TCA_CAKE_DIFFSERV_MODE]) 2533 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); 2534 2535 if (tb[TCA_CAKE_WASH]) { 2536 if (!!nla_get_u32(tb[TCA_CAKE_WASH])) 2537 q->rate_flags |= CAKE_FLAG_WASH; 2538 else 2539 q->rate_flags &= ~CAKE_FLAG_WASH; 2540 } 2541 2542 if (tb[TCA_CAKE_FLOW_MODE]) 2543 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | 2544 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & 2545 CAKE_FLOW_MASK)); 2546 2547 if 
(tb[TCA_CAKE_ATM]) 2548 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); 2549 2550 if (tb[TCA_CAKE_OVERHEAD]) { 2551 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); 2552 q->rate_flags |= CAKE_FLAG_OVERHEAD; 2553 2554 q->max_netlen = 0; 2555 q->max_adjlen = 0; 2556 q->min_netlen = ~0; 2557 q->min_adjlen = ~0; 2558 } 2559 2560 if (tb[TCA_CAKE_RAW]) { 2561 q->rate_flags &= ~CAKE_FLAG_OVERHEAD; 2562 2563 q->max_netlen = 0; 2564 q->max_adjlen = 0; 2565 q->min_netlen = ~0; 2566 q->min_adjlen = ~0; 2567 } 2568 2569 if (tb[TCA_CAKE_MPU]) 2570 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); 2571 2572 if (tb[TCA_CAKE_RTT]) { 2573 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); 2574 2575 if (!q->interval) 2576 q->interval = 1; 2577 } 2578 2579 if (tb[TCA_CAKE_TARGET]) { 2580 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); 2581 2582 if (!q->target) 2583 q->target = 1; 2584 } 2585 2586 if (tb[TCA_CAKE_AUTORATE]) { 2587 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) 2588 q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; 2589 else 2590 q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; 2591 } 2592 2593 if (tb[TCA_CAKE_INGRESS]) { 2594 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) 2595 q->rate_flags |= CAKE_FLAG_INGRESS; 2596 else 2597 q->rate_flags &= ~CAKE_FLAG_INGRESS; 2598 } 2599 2600 if (tb[TCA_CAKE_ACK_FILTER]) 2601 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); 2602 2603 if (tb[TCA_CAKE_MEMORY]) 2604 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); 2605 2606 if (tb[TCA_CAKE_SPLIT_GSO]) { 2607 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) 2608 q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2609 else 2610 q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; 2611 } 2612 2613 if (tb[TCA_CAKE_FWMARK]) { 2614 q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]); 2615 q->fwmark_shft = q->fwmark_mask ? 
__ffs(q->fwmark_mask) : 0; 2616 } 2617 2618 if (q->tins) { 2619 sch_tree_lock(sch); 2620 cake_reconfigure(sch); 2621 sch_tree_unlock(sch); 2622 } 2623 2624 return 0; 2625 } 2626 2627 static void cake_destroy(struct Qdisc *sch) 2628 { 2629 struct cake_sched_data *q = qdisc_priv(sch); 2630 2631 qdisc_watchdog_cancel(&q->watchdog); 2632 tcf_block_put(q->block); 2633 kvfree(q->tins); 2634 } 2635 2636 static int cake_init(struct Qdisc *sch, struct nlattr *opt, 2637 struct netlink_ext_ack *extack) 2638 { 2639 struct cake_sched_data *q = qdisc_priv(sch); 2640 int i, j, err; 2641 2642 sch->limit = 10240; 2643 q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; 2644 q->flow_mode = CAKE_FLOW_TRIPLE; 2645 2646 q->rate_bps = 0; /* unlimited by default */ 2647 2648 q->interval = 100000; /* 100ms default */ 2649 q->target = 5000; /* 5ms: codel RFC argues 2650 * for 5 to 10% of interval 2651 */ 2652 q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2653 q->cur_tin = 0; 2654 q->cur_flow = 0; 2655 2656 qdisc_watchdog_init(&q->watchdog, sch); 2657 2658 if (opt) { 2659 int err = cake_change(sch, opt, extack); 2660 2661 if (err) 2662 return err; 2663 } 2664 2665 err = tcf_block_get(&q->block, &q->filter_list, sch, extack); 2666 if (err) 2667 return err; 2668 2669 quantum_div[0] = ~0; 2670 for (i = 1; i <= CAKE_QUEUES; i++) 2671 quantum_div[i] = 65535 / i; 2672 2673 q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), 2674 GFP_KERNEL); 2675 if (!q->tins) 2676 goto nomem; 2677 2678 for (i = 0; i < CAKE_MAX_TINS; i++) { 2679 struct cake_tin_data *b = q->tins + i; 2680 2681 INIT_LIST_HEAD(&b->new_flows); 2682 INIT_LIST_HEAD(&b->old_flows); 2683 INIT_LIST_HEAD(&b->decaying_flows); 2684 b->sparse_flow_count = 0; 2685 b->bulk_flow_count = 0; 2686 b->decaying_flow_count = 0; 2687 2688 for (j = 0; j < CAKE_QUEUES; j++) { 2689 struct cake_flow *flow = b->flows + j; 2690 u32 k = j * CAKE_MAX_TINS + i; 2691 2692 INIT_LIST_HEAD(&flow->flowchain); 2693 cobalt_vars_init(&flow->cvars); 2694 2695 q->overflow_heap[k].t = i; 2696 q->overflow_heap[k].b = j; 2697 b->overflow_idx[j] = k; 2698 } 2699 } 2700 2701 cake_reconfigure(sch); 2702 q->avg_peak_bandwidth = q->rate_bps; 2703 q->min_netlen = ~0; 2704 q->min_adjlen = ~0; 2705 return 0; 2706 2707 nomem: 2708 cake_destroy(sch); 2709 return -ENOMEM; 2710 } 2711 2712 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) 2713 { 2714 struct cake_sched_data *q = qdisc_priv(sch); 2715 struct nlattr *opts; 2716 2717 opts = nla_nest_start_noflag(skb, TCA_OPTIONS); 2718 if (!opts) 2719 goto nla_put_failure; 2720 2721 if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, 2722 TCA_CAKE_PAD)) 2723 goto nla_put_failure; 2724 2725 if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, 2726 q->flow_mode & CAKE_FLOW_MASK)) 2727 goto nla_put_failure; 2728 2729 if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) 2730 goto nla_put_failure; 2731 2732 if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) 2733 goto nla_put_failure; 2734 2735 if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) 2736 goto nla_put_failure; 2737 2738 if (nla_put_u32(skb, TCA_CAKE_AUTORATE, 2739 !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) 2740 goto nla_put_failure; 2741 2742 if (nla_put_u32(skb, TCA_CAKE_INGRESS, 2743 !!(q->rate_flags & CAKE_FLAG_INGRESS))) 2744 goto nla_put_failure; 2745 2746 if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) 2747 goto nla_put_failure; 2748 2749 if (nla_put_u32(skb, TCA_CAKE_NAT, 2750 !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) 2751 goto nla_put_failure; 2752 2753 if (nla_put_u32(skb, 
TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) 2754 goto nla_put_failure; 2755 2756 if (nla_put_u32(skb, TCA_CAKE_WASH, 2757 !!(q->rate_flags & CAKE_FLAG_WASH))) 2758 goto nla_put_failure; 2759 2760 if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) 2761 goto nla_put_failure; 2762 2763 if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) 2764 if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) 2765 goto nla_put_failure; 2766 2767 if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) 2768 goto nla_put_failure; 2769 2770 if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) 2771 goto nla_put_failure; 2772 2773 if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, 2774 !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) 2775 goto nla_put_failure; 2776 2777 if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask)) 2778 goto nla_put_failure; 2779 2780 return nla_nest_end(skb, opts); 2781 2782 nla_put_failure: 2783 return -1; 2784 } 2785 2786 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 2787 { 2788 struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 2789 struct cake_sched_data *q = qdisc_priv(sch); 2790 struct nlattr *tstats, *ts; 2791 int i; 2792 2793 if (!stats) 2794 return -1; 2795 2796 #define PUT_STAT_U32(attr, data) do { \ 2797 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 2798 goto nla_put_failure; \ 2799 } while (0) 2800 #define PUT_STAT_U64(attr, data) do { \ 2801 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ 2802 data, TCA_CAKE_STATS_PAD)) \ 2803 goto nla_put_failure; \ 2804 } while (0) 2805 2806 PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); 2807 PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); 2808 PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); 2809 PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); 2810 PUT_STAT_U32(MAX_NETLEN, q->max_netlen); 2811 PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); 2812 PUT_STAT_U32(MIN_NETLEN, q->min_netlen); 2813 PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); 2814 2815 #undef PUT_STAT_U32 2816 #undef PUT_STAT_U64 2817 2818 tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS); 2819 if (!tstats) 2820 goto nla_put_failure; 2821 2822 #define PUT_TSTAT_U32(attr, data) do { \ 2823 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ 2824 goto nla_put_failure; \ 2825 } while (0) 2826 #define PUT_TSTAT_U64(attr, data) do { \ 2827 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ 2828 data, TCA_CAKE_TIN_STATS_PAD)) \ 2829 goto nla_put_failure; \ 2830 } while (0) 2831 2832 for (i = 0; i < q->tin_cnt; i++) { 2833 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 2834 2835 ts = nla_nest_start_noflag(d->skb, i + 1); 2836 if (!ts) 2837 goto nla_put_failure; 2838 2839 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); 2840 PUT_TSTAT_U64(SENT_BYTES64, b->bytes); 2841 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); 2842 2843 PUT_TSTAT_U32(TARGET_US, 2844 ktime_to_us(ns_to_ktime(b->cparams.target))); 2845 PUT_TSTAT_U32(INTERVAL_US, 2846 ktime_to_us(ns_to_ktime(b->cparams.interval))); 2847 2848 PUT_TSTAT_U32(SENT_PACKETS, b->packets); 2849 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); 2850 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); 2851 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); 2852 2853 PUT_TSTAT_U32(PEAK_DELAY_US, 2854 ktime_to_us(ns_to_ktime(b->peak_delay))); 2855 PUT_TSTAT_U32(AVG_DELAY_US, 2856 ktime_to_us(ns_to_ktime(b->avge_delay))); 2857 PUT_TSTAT_U32(BASE_DELAY_US, 2858 ktime_to_us(ns_to_ktime(b->base_delay))); 2859 2860 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); 2861 PUT_TSTAT_U32(WAY_MISSES, 
b->way_misses); 2862 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); 2863 2864 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + 2865 b->decaying_flow_count); 2866 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); 2867 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); 2868 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); 2869 2870 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); 2871 nla_nest_end(d->skb, ts); 2872 } 2873 2874 #undef PUT_TSTAT_U32 2875 #undef PUT_TSTAT_U64 2876 2877 nla_nest_end(d->skb, tstats); 2878 return nla_nest_end(d->skb, stats); 2879 2880 nla_put_failure: 2881 nla_nest_cancel(d->skb, stats); 2882 return -1; 2883 } 2884 2885 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) 2886 { 2887 return NULL; 2888 } 2889 2890 static unsigned long cake_find(struct Qdisc *sch, u32 classid) 2891 { 2892 return 0; 2893 } 2894 2895 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, 2896 u32 classid) 2897 { 2898 return 0; 2899 } 2900 2901 static void cake_unbind(struct Qdisc *q, unsigned long cl) 2902 { 2903 } 2904 2905 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, 2906 struct netlink_ext_ack *extack) 2907 { 2908 struct cake_sched_data *q = qdisc_priv(sch); 2909 2910 if (cl) 2911 return NULL; 2912 return q->block; 2913 } 2914 2915 static int cake_dump_class(struct Qdisc *sch, unsigned long cl, 2916 struct sk_buff *skb, struct tcmsg *tcm) 2917 { 2918 tcm->tcm_handle |= TC_H_MIN(cl); 2919 return 0; 2920 } 2921 2922 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, 2923 struct gnet_dump *d) 2924 { 2925 struct cake_sched_data *q = qdisc_priv(sch); 2926 const struct cake_flow *flow = NULL; 2927 struct gnet_stats_queue qs = { 0 }; 2928 struct nlattr *stats; 2929 u32 idx = cl - 1; 2930 2931 if (idx < CAKE_QUEUES * q->tin_cnt) { 2932 const struct cake_tin_data *b = \ 2933 &q->tins[q->tin_order[idx / CAKE_QUEUES]]; 2934 const struct sk_buff *skb; 2935 2936 flow = &b->flows[idx % CAKE_QUEUES]; 2937 2938 if (flow->head) { 2939 sch_tree_lock(sch); 2940 skb = flow->head; 2941 while (skb) { 2942 qs.qlen++; 2943 skb = skb->next; 2944 } 2945 sch_tree_unlock(sch); 2946 } 2947 qs.backlog = b->backlogs[idx % CAKE_QUEUES]; 2948 qs.drops = flow->dropped; 2949 } 2950 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) 2951 return -1; 2952 if (flow) { 2953 ktime_t now = ktime_get(); 2954 2955 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 2956 if (!stats) 2957 return -1; 2958 2959 #define PUT_STAT_U32(attr, data) do { \ 2960 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 2961 goto nla_put_failure; \ 2962 } while (0) 2963 #define PUT_STAT_S32(attr, data) do { \ 2964 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 2965 goto nla_put_failure; \ 2966 } while (0) 2967 2968 PUT_STAT_S32(DEFICIT, flow->deficit); 2969 PUT_STAT_U32(DROPPING, flow->cvars.dropping); 2970 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); 2971 PUT_STAT_U32(P_DROP, flow->cvars.p_drop); 2972 if (flow->cvars.p_drop) { 2973 PUT_STAT_S32(BLUE_TIMER_US, 2974 ktime_to_us( 2975 ktime_sub(now, 2976 flow->cvars.blue_timer))); 2977 } 2978 if (flow->cvars.dropping) { 2979 PUT_STAT_S32(DROP_NEXT_US, 2980 ktime_to_us( 2981 ktime_sub(now, 2982 flow->cvars.drop_next))); 2983 } 2984 2985 if (nla_nest_end(d->skb, stats) < 0) 2986 return -1; 2987 } 2988 2989 return 0; 2990 2991 nla_put_failure: 2992 nla_nest_cancel(d->skb, stats); 2993 return -1; 2994 } 2995 2996 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) 2997 { 
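	/* Walk every flow queue in every active tin; the class IDs handed to
	 * arg->fn() are i * CAKE_QUEUES + j + 1, matching the cl - 1 decoding
	 * in cake_dump_class_stats() above.
	 */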
2998 struct cake_sched_data *q = qdisc_priv(sch); 2999 unsigned int i, j; 3000 3001 if (arg->stop) 3002 return; 3003 3004 for (i = 0; i < q->tin_cnt; i++) { 3005 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3006 3007 for (j = 0; j < CAKE_QUEUES; j++) { 3008 if (list_empty(&b->flows[j].flowchain) || 3009 arg->count < arg->skip) { 3010 arg->count++; 3011 continue; 3012 } 3013 if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) { 3014 arg->stop = 1; 3015 break; 3016 } 3017 arg->count++; 3018 } 3019 } 3020 } 3021 3022 static const struct Qdisc_class_ops cake_class_ops = { 3023 .leaf = cake_leaf, 3024 .find = cake_find, 3025 .tcf_block = cake_tcf_block, 3026 .bind_tcf = cake_bind, 3027 .unbind_tcf = cake_unbind, 3028 .dump = cake_dump_class, 3029 .dump_stats = cake_dump_class_stats, 3030 .walk = cake_walk, 3031 }; 3032 3033 static struct Qdisc_ops cake_qdisc_ops __read_mostly = { 3034 .cl_ops = &cake_class_ops, 3035 .id = "cake", 3036 .priv_size = sizeof(struct cake_sched_data), 3037 .enqueue = cake_enqueue, 3038 .dequeue = cake_dequeue, 3039 .peek = qdisc_peek_dequeued, 3040 .init = cake_init, 3041 .reset = cake_reset, 3042 .destroy = cake_destroy, 3043 .change = cake_change, 3044 .dump = cake_dump, 3045 .dump_stats = cake_dump_stats, 3046 .owner = THIS_MODULE, 3047 }; 3048 3049 static int __init cake_module_init(void) 3050 { 3051 return register_qdisc(&cake_qdisc_ops); 3052 } 3053 3054 static void __exit cake_module_exit(void) 3055 { 3056 unregister_qdisc(&cake_qdisc_ops); 3057 } 3058 3059 module_init(cake_module_init) 3060 module_exit(cake_module_exit) 3061 MODULE_AUTHOR("Jonathan Morton"); 3062 MODULE_LICENSE("Dual BSD/GPL"); 3063 MODULE_DESCRIPTION("The CAKE shaper."); 3064
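
/* Shaper arithmetic, for intuition (an illustrative sketch, not code used by
 * the qdisc): cake_set_rate() stores time-per-byte as the fixed-point pair
 * (rate_ns, rate_shft), so transmitting len bytes advances the shaper clock
 * by (len * rate_ns) >> rate_shft nanoseconds.  Assuming the configured rate
 * is given in bytes per second (as the autorate estimator computes it), a
 * 100 Mbit/s link is 12,500,000 B/s and the conversion works out to:
 *
 *	rate_ns   = (NSEC_PER_SEC << 34) / 12500000  =  80 << 34
 *	normalise:  halve until rate_ns < (1 << 34)
 *	          =>  rate_ns = 10737418240, rate_shft = 27
 *	1514-byte packet:  (1514 * rate_ns) >> 27 = 121120 ns  (~121 us)
 *
 * i.e. 80 ns of serialisation time per byte, as expected for 100 Mbit/s.
 * A typical configuration from userspace is simply, e.g.:
 *
 *	tc qdisc replace dev eth0 root cake bandwidth 100Mbit
 */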