// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/* COMMON Applications Kept Enhanced (CAKE) discipline
 *
 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
 * Copyright (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
 *
 * The CAKE Principles:
 *   (or, how to have your cake and eat it too)
 *
 * This is a combination of several shaping, AQM and FQ techniques into one
 * easy-to-use package:
 *
 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
 *   equipment and bloated MACs. This operates in deficit mode (as in sch_fq),
 *   eliminating the need for any sort of burst parameter (eg. token bucket
 *   depth). Burst support is limited to that necessary to overcome scheduling
 *   latency.
 *
 * - A Diffserv-aware priority queue, giving more priority to certain classes,
 *   up to a specified fraction of bandwidth. Above that bandwidth threshold,
 *   the priority is reduced to avoid starving other tins.
 *
 * - Each priority tin has a separate Flow Queue system, to isolate traffic
 *   flows from each other. This prevents a burst on one flow from increasing
 *   the delay to another. Flows are distributed to queues using a
 *   set-associative hash function.
 *
 * - Each queue is actively managed by Cobalt, which is a combination of the
 *   Codel and Blue AQM algorithms. This serves flows fairly, and signals
 *   congestion early via ECN (if available) and/or packet drops, to keep
 *   latency low. The codel parameters are auto-tuned based on the bandwidth
 *   setting, as is necessary at low bandwidths.
 *
 * The configuration parameters are kept deliberately simple for ease of use.
 * Everything has sane defaults. Complete generality of configuration is *not*
 * a goal.
 *
 * The priority queue operates according to a weighted DRR scheme, combined
 * with a bandwidth tracker which reuses the shaper logic to detect which side
 * of the bandwidth sharing threshold the tin is operating on. This determines
 * whether a priority-based weight (high) or a bandwidth-based weight (low) is
 * used for that tin in the current pass.
 *
 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
 * granted us permission to leverage.
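 *
 * As a rough worked illustration of the shaper's deficit mode: at a
 * configured rate of 100 Mbit/s, a 1514-byte packet occupies about 121 us of
 * link time, so each transmission simply pushes the next permitted
 * transmission time that far into the future; no token-bucket depth has to
 * be configured or tuned.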
51 */ 52 53 #include <linux/module.h> 54 #include <linux/types.h> 55 #include <linux/kernel.h> 56 #include <linux/jiffies.h> 57 #include <linux/string.h> 58 #include <linux/in.h> 59 #include <linux/errno.h> 60 #include <linux/init.h> 61 #include <linux/skbuff.h> 62 #include <linux/jhash.h> 63 #include <linux/slab.h> 64 #include <linux/vmalloc.h> 65 #include <linux/reciprocal_div.h> 66 #include <net/netlink.h> 67 #include <linux/if_vlan.h> 68 #include <net/gso.h> 69 #include <net/pkt_sched.h> 70 #include <net/sch_priv.h> 71 #include <net/pkt_cls.h> 72 #include <net/tcp.h> 73 #include <net/flow_dissector.h> 74 75 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 76 #include <net/netfilter/nf_conntrack_core.h> 77 #endif 78 79 #define CAKE_SET_WAYS (8) 80 #define CAKE_MAX_TINS (8) 81 #define CAKE_QUEUES (1024) 82 #define CAKE_FLOW_MASK 63 83 #define CAKE_FLOW_NAT_FLAG 64 84 85 /* struct cobalt_params - contains codel and blue parameters 86 * @interval: codel initial drop rate 87 * @target: maximum persistent sojourn time & blue update rate 88 * @mtu_time: serialisation delay of maximum-size packet 89 * @p_inc: increment of blue drop probability (0.32 fxp) 90 * @p_dec: decrement of blue drop probability (0.32 fxp) 91 */ 92 struct cobalt_params { 93 u64 interval; 94 u64 target; 95 u64 mtu_time; 96 u32 p_inc; 97 u32 p_dec; 98 }; 99 100 /* struct cobalt_vars - contains codel and blue variables 101 * @count: codel dropping frequency 102 * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 103 * @drop_next: time to drop next packet, or when we dropped last 104 * @blue_timer: Blue time to next drop 105 * @p_drop: BLUE drop probability (0.32 fxp) 106 * @dropping: set if in dropping state 107 * @ecn_marked: set if marked 108 */ 109 struct cobalt_vars { 110 u32 count; 111 u32 rec_inv_sqrt; 112 ktime_t drop_next; 113 ktime_t blue_timer; 114 u32 p_drop; 115 bool dropping; 116 bool ecn_marked; 117 }; 118 119 enum { 120 CAKE_SET_NONE = 0, 121 CAKE_SET_SPARSE, 122 CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ 123 CAKE_SET_BULK, 124 CAKE_SET_DECAYING 125 }; 126 127 struct cake_flow { 128 /* this stuff is all needed per-flow at dequeue time */ 129 struct sk_buff *head; 130 struct sk_buff *tail; 131 struct list_head flowchain; 132 s32 deficit; 133 u32 dropped; 134 struct cobalt_vars cvars; 135 u16 srchost; /* index into cake_host table */ 136 u16 dsthost; 137 u8 set; 138 }; /* please try to keep this structure <= 64 bytes */ 139 140 struct cake_host { 141 u32 srchost_tag; 142 u32 dsthost_tag; 143 u16 srchost_bulk_flow_count; 144 u16 dsthost_bulk_flow_count; 145 }; 146 147 struct cake_heap_entry { 148 u16 t:3, b:10; 149 }; 150 151 struct cake_tin_data { 152 struct cake_flow flows[CAKE_QUEUES]; 153 u32 backlogs[CAKE_QUEUES]; 154 u32 tags[CAKE_QUEUES]; /* for set association */ 155 u16 overflow_idx[CAKE_QUEUES]; 156 struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ 157 u16 flow_quantum; 158 159 struct cobalt_params cparams; 160 u32 drop_overlimit; 161 u16 bulk_flow_count; 162 u16 sparse_flow_count; 163 u16 decaying_flow_count; 164 u16 unresponsive_flow_count; 165 166 u32 max_skblen; 167 168 struct list_head new_flows; 169 struct list_head old_flows; 170 struct list_head decaying_flows; 171 172 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 173 ktime_t time_next_packet; 174 u64 tin_rate_ns; 175 u64 tin_rate_bps; 176 u16 tin_rate_shft; 177 178 u16 tin_quantum; 179 s32 tin_deficit; 180 u32 tin_backlog; 181 u32 tin_dropped; 182 u32 tin_ecn_mark; 183 184 u32 packets; 185 u64 bytes; 186 
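	/* pure ACKs removed by the ACK filter (see cake_ack_filter()) */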
187 u32 ack_drops; 188 189 /* moving averages */ 190 u64 avge_delay; 191 u64 peak_delay; 192 u64 base_delay; 193 194 /* hash function stats */ 195 u32 way_directs; 196 u32 way_hits; 197 u32 way_misses; 198 u32 way_collisions; 199 }; /* number of tins is small, so size of this struct doesn't matter much */ 200 201 struct cake_sched_config { 202 u64 rate_bps; 203 u64 interval; 204 u64 target; 205 u64 sync_time; 206 u32 buffer_config_limit; 207 u32 fwmark_mask; 208 u16 fwmark_shft; 209 s16 rate_overhead; 210 u16 rate_mpu; 211 u16 rate_flags; 212 u8 tin_mode; 213 u8 flow_mode; 214 u8 atm_mode; 215 u8 ack_filter; 216 u8 is_shared; 217 }; 218 219 struct cake_sched_data { 220 struct tcf_proto __rcu *filter_list; /* optional external classifier */ 221 struct tcf_block *block; 222 struct cake_tin_data *tins; 223 struct cake_sched_config *config; 224 struct cake_sched_config initial_config; 225 226 struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; 227 228 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 229 ktime_t time_next_packet; 230 ktime_t failsafe_next_packet; 231 u64 rate_ns; 232 u16 rate_shft; 233 u16 overflow_timeout; 234 u16 tin_cnt; 235 236 /* resource tracking */ 237 u32 buffer_used; 238 u32 buffer_max_used; 239 u32 buffer_limit; 240 241 /* indices for dequeue */ 242 u16 cur_tin; 243 u16 cur_flow; 244 245 struct qdisc_watchdog watchdog; 246 const u8 *tin_index; 247 const u8 *tin_order; 248 249 /* bandwidth capacity estimate */ 250 ktime_t last_packet_time; 251 ktime_t avg_window_begin; 252 u64 avg_packet_interval; 253 u64 avg_window_bytes; 254 u64 avg_peak_bandwidth; 255 ktime_t last_reconfig_time; 256 257 /* packet length stats */ 258 u32 avg_netoff; 259 u16 max_netlen; 260 u16 max_adjlen; 261 u16 min_netlen; 262 u16 min_adjlen; 263 264 /* mq sync state */ 265 u64 last_checked_active; 266 u64 last_active; 267 u32 active_queues; 268 }; 269 270 enum { 271 CAKE_FLAG_OVERHEAD = BIT(0), 272 CAKE_FLAG_AUTORATE_INGRESS = BIT(1), 273 CAKE_FLAG_INGRESS = BIT(2), 274 CAKE_FLAG_WASH = BIT(3), 275 CAKE_FLAG_SPLIT_GSO = BIT(4) 276 }; 277 278 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to 279 * obtain the best features of each. Codel is excellent on flows which 280 * respond to congestion signals in a TCP-like way. BLUE is more effective on 281 * unresponsive flows. 
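 *
 * As a purely numeric illustration: p_drop is a 0.32 fixed-point
 * probability, so a value of 0x40000000 makes BLUE drop roughly one packet
 * in four, since get_random_u32() < p_drop holds for a quarter of the 2^32
 * range.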
282 */ 283 284 struct cobalt_skb_cb { 285 ktime_t enqueue_time; 286 u32 adjusted_len; 287 }; 288 289 static u64 us_to_ns(u64 us) 290 { 291 return us * NSEC_PER_USEC; 292 } 293 294 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) 295 { 296 qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); 297 return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; 298 } 299 300 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) 301 { 302 return get_cobalt_cb(skb)->enqueue_time; 303 } 304 305 static void cobalt_set_enqueue_time(struct sk_buff *skb, 306 ktime_t now) 307 { 308 get_cobalt_cb(skb)->enqueue_time = now; 309 } 310 311 static u16 quantum_div[CAKE_QUEUES + 1] = {0}; 312 313 /* Diffserv lookup tables */ 314 315 static const u8 precedence[] = { 316 0, 0, 0, 0, 0, 0, 0, 0, 317 1, 1, 1, 1, 1, 1, 1, 1, 318 2, 2, 2, 2, 2, 2, 2, 2, 319 3, 3, 3, 3, 3, 3, 3, 3, 320 4, 4, 4, 4, 4, 4, 4, 4, 321 5, 5, 5, 5, 5, 5, 5, 5, 322 6, 6, 6, 6, 6, 6, 6, 6, 323 7, 7, 7, 7, 7, 7, 7, 7, 324 }; 325 326 static const u8 diffserv8[] = { 327 2, 0, 1, 2, 4, 2, 2, 2, 328 1, 2, 1, 2, 1, 2, 1, 2, 329 5, 2, 4, 2, 4, 2, 4, 2, 330 3, 2, 3, 2, 3, 2, 3, 2, 331 6, 2, 3, 2, 3, 2, 3, 2, 332 6, 2, 2, 2, 6, 2, 6, 2, 333 7, 2, 2, 2, 2, 2, 2, 2, 334 7, 2, 2, 2, 2, 2, 2, 2, 335 }; 336 337 static const u8 diffserv4[] = { 338 0, 1, 0, 0, 2, 0, 0, 0, 339 1, 0, 0, 0, 0, 0, 0, 0, 340 2, 0, 2, 0, 2, 0, 2, 0, 341 2, 0, 2, 0, 2, 0, 2, 0, 342 3, 0, 2, 0, 2, 0, 2, 0, 343 3, 0, 0, 0, 3, 0, 3, 0, 344 3, 0, 0, 0, 0, 0, 0, 0, 345 3, 0, 0, 0, 0, 0, 0, 0, 346 }; 347 348 static const u8 diffserv3[] = { 349 0, 1, 0, 0, 2, 0, 0, 0, 350 1, 0, 0, 0, 0, 0, 0, 0, 351 0, 0, 0, 0, 0, 0, 0, 0, 352 0, 0, 0, 0, 0, 0, 0, 0, 353 0, 0, 0, 0, 0, 0, 0, 0, 354 0, 0, 0, 0, 2, 0, 2, 0, 355 2, 0, 0, 0, 0, 0, 0, 0, 356 2, 0, 0, 0, 0, 0, 0, 0, 357 }; 358 359 static const u8 besteffort[] = { 360 0, 0, 0, 0, 0, 0, 0, 0, 361 0, 0, 0, 0, 0, 0, 0, 0, 362 0, 0, 0, 0, 0, 0, 0, 0, 363 0, 0, 0, 0, 0, 0, 0, 0, 364 0, 0, 0, 0, 0, 0, 0, 0, 365 0, 0, 0, 0, 0, 0, 0, 0, 366 0, 0, 0, 0, 0, 0, 0, 0, 367 0, 0, 0, 0, 0, 0, 0, 0, 368 }; 369 370 /* tin priority order for stats dumping */ 371 372 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; 373 static const u8 bulk_order[] = {1, 0, 2, 3}; 374 375 /* There is a big difference in timing between the accurate values placed in the 376 * cache and the approximations given by a single Newton step for small count 377 * values, particularly when stepping from count 1 to 2 or vice versa. Hence, 378 * these values are calculated using eight Newton steps, using the 379 * implementation below. Above 16, a single Newton step gives sufficient 380 * accuracy in either direction, given the precision stored. 381 * 382 * The magnitude of the error when stepping up to count 2 is such as to give the 383 * value that *should* have been produced at count 4. 
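 *
 * A worked example of the Q0.32 arithmetic: at count = 4 the cached value is
 * 2147483647, i.e. almost exactly 0.5 = 1/sqrt(4). One Newton step from that
 * value, (invsqrt / 2) * (3 - count * invsqrt^2) = 0.25 * (3 - 4 * 0.25) =
 * 0.5, leaves it unchanged, as expected of the fixed point of the iteration.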
384 */ 385 386 #define REC_INV_SQRT_CACHE (16) 387 static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = { 388 ~0, ~0, 3037000500, 2479700525, 389 2147483647, 1920767767, 1753413056, 1623345051, 390 1518500250, 1431655765, 1358187914, 1294981364, 391 1239850263, 1191209601, 1147878294, 1108955788 392 }; 393 394 static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust); 395 396 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots 397 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) 398 * 399 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 400 */ 401 402 static void cobalt_newton_step(struct cobalt_vars *vars) 403 { 404 u32 invsqrt, invsqrt2; 405 u64 val; 406 407 invsqrt = vars->rec_inv_sqrt; 408 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; 409 val = (3LL << 32) - ((u64)vars->count * invsqrt2); 410 411 val >>= 2; /* avoid overflow in following multiply */ 412 val = (val * invsqrt) >> (32 - 2 + 1); 413 414 vars->rec_inv_sqrt = val; 415 } 416 417 static void cobalt_invsqrt(struct cobalt_vars *vars) 418 { 419 if (vars->count < REC_INV_SQRT_CACHE) 420 vars->rec_inv_sqrt = inv_sqrt_cache[vars->count]; 421 else 422 cobalt_newton_step(vars); 423 } 424 425 static void cobalt_vars_init(struct cobalt_vars *vars) 426 { 427 memset(vars, 0, sizeof(*vars)); 428 } 429 430 /* CoDel control_law is t + interval/sqrt(count) 431 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid 432 * both sqrt() and divide operation. 433 */ 434 static ktime_t cobalt_control(ktime_t t, 435 u64 interval, 436 u32 rec_inv_sqrt) 437 { 438 return ktime_add_ns(t, reciprocal_scale(interval, 439 rec_inv_sqrt)); 440 } 441 442 /* Call this when a packet had to be dropped due to queue overflow. Returns 443 * true if the BLUE state was quiescent before but active after this call. 444 */ 445 static bool cobalt_queue_full(struct cobalt_vars *vars, 446 struct cobalt_params *p, 447 ktime_t now) 448 { 449 bool up = false; 450 451 if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 452 up = !vars->p_drop; 453 vars->p_drop += p->p_inc; 454 if (vars->p_drop < p->p_inc) 455 vars->p_drop = ~0; 456 vars->blue_timer = now; 457 } 458 vars->dropping = true; 459 vars->drop_next = now; 460 if (!vars->count) 461 vars->count = 1; 462 463 return up; 464 } 465 466 /* Call this when the queue was serviced but turned out to be empty. Returns 467 * true if the BLUE state was active before but quiescent after this call. 468 */ 469 static bool cobalt_queue_empty(struct cobalt_vars *vars, 470 struct cobalt_params *p, 471 ktime_t now) 472 { 473 bool down = false; 474 475 if (vars->p_drop && 476 ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 477 if (vars->p_drop < p->p_dec) 478 vars->p_drop = 0; 479 else 480 vars->p_drop -= p->p_dec; 481 vars->blue_timer = now; 482 down = !vars->p_drop; 483 } 484 vars->dropping = false; 485 486 if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { 487 vars->count--; 488 cobalt_invsqrt(vars); 489 vars->drop_next = cobalt_control(vars->drop_next, 490 p->interval, 491 vars->rec_inv_sqrt); 492 } 493 494 return down; 495 } 496 497 /* Call this with a freshly dequeued packet for possible congestion marking. 498 * Returns true as an instruction to drop the packet, false for delivery. 
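 *
 * As a rough worked example of the control law, assuming interval = 100 ms:
 * the next signalling time advances by reciprocal_scale(interval,
 * rec_inv_sqrt), which approximates interval / sqrt(count), i.e. about
 * 100 ms at count 1, 50 ms at count 4 and 25 ms at count 16, so signalling
 * becomes more frequent the longer the flow stays above target.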
499 */ 500 static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, 501 struct cobalt_params *p, 502 ktime_t now, 503 struct sk_buff *skb, 504 u32 bulk_flows) 505 { 506 enum qdisc_drop_reason reason = QDISC_DROP_UNSPEC; 507 bool next_due, over_target; 508 ktime_t schedule; 509 u64 sojourn; 510 511 /* The 'schedule' variable records, in its sign, whether 'now' is before or 512 * after 'drop_next'. This allows 'drop_next' to be updated before the next 513 * scheduling decision is actually branched, without destroying that 514 * information. Similarly, the first 'schedule' value calculated is preserved 515 * in the boolean 'next_due'. 516 * 517 * As for 'drop_next', we take advantage of the fact that 'interval' is both 518 * the delay between first exceeding 'target' and the first signalling event, 519 * *and* the scaling factor for the signalling frequency. It's therefore very 520 * natural to use a single mechanism for both purposes, and eliminates a 521 * significant amount of reference Codel's spaghetti code. To help with this, 522 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close 523 * as possible to 1.0 in fixed-point. 524 */ 525 526 sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 527 schedule = ktime_sub(now, vars->drop_next); 528 over_target = sojourn > p->target && 529 sojourn > p->mtu_time * bulk_flows * 2 && 530 sojourn > p->mtu_time * 4; 531 next_due = vars->count && ktime_to_ns(schedule) >= 0; 532 533 vars->ecn_marked = false; 534 535 if (over_target) { 536 if (!vars->dropping) { 537 vars->dropping = true; 538 vars->drop_next = cobalt_control(now, 539 p->interval, 540 vars->rec_inv_sqrt); 541 } 542 if (!vars->count) 543 vars->count = 1; 544 } else if (vars->dropping) { 545 vars->dropping = false; 546 } 547 548 if (next_due && vars->dropping) { 549 /* Use ECN mark if possible, otherwise drop */ 550 if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) 551 reason = QDISC_DROP_CONGESTED; 552 553 vars->count++; 554 if (!vars->count) 555 vars->count--; 556 cobalt_invsqrt(vars); 557 vars->drop_next = cobalt_control(vars->drop_next, 558 p->interval, 559 vars->rec_inv_sqrt); 560 schedule = ktime_sub(now, vars->drop_next); 561 } else { 562 while (next_due) { 563 vars->count--; 564 cobalt_invsqrt(vars); 565 vars->drop_next = cobalt_control(vars->drop_next, 566 p->interval, 567 vars->rec_inv_sqrt); 568 schedule = ktime_sub(now, vars->drop_next); 569 next_due = vars->count && ktime_to_ns(schedule) >= 0; 570 } 571 } 572 573 /* Simple BLUE implementation. Lack of ECN is deliberate. */ 574 if (vars->p_drop && reason == QDISC_DROP_UNSPEC && 575 get_random_u32() < vars->p_drop) 576 reason = QDISC_DROP_FLOOD_PROTECTION; 577 578 /* Overload the drop_next field as an activity timeout */ 579 if (!vars->count) 580 vars->drop_next = ktime_add_ns(now, p->interval); 581 else if (ktime_to_ns(schedule) > 0 && reason == QDISC_DROP_UNSPEC) 582 vars->drop_next = now; 583 584 return reason; 585 } 586 587 static bool cake_update_flowkeys(struct flow_keys *keys, 588 const struct sk_buff *skb) 589 { 590 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 591 struct nf_conntrack_tuple tuple = {}; 592 bool rev = !skb->_nfct, upd = false; 593 __be32 ip; 594 595 if (skb_protocol(skb, true) != htons(ETH_P_IP)) 596 return false; 597 598 if (!nf_ct_get_tuple_skb(&tuple, skb)) 599 return false; 600 601 ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; 602 if (ip != keys->addrs.v4addrs.src) { 603 keys->addrs.v4addrs.src = ip; 604 upd = true; 605 } 606 ip = rev ? 
tuple.src.u3.ip : tuple.dst.u3.ip; 607 if (ip != keys->addrs.v4addrs.dst) { 608 keys->addrs.v4addrs.dst = ip; 609 upd = true; 610 } 611 612 if (keys->ports.ports) { 613 __be16 port; 614 615 port = rev ? tuple.dst.u.all : tuple.src.u.all; 616 if (port != keys->ports.src) { 617 keys->ports.src = port; 618 upd = true; 619 } 620 port = rev ? tuple.src.u.all : tuple.dst.u.all; 621 if (port != keys->ports.dst) { 622 keys->ports.dst = port; 623 upd = true; 624 } 625 } 626 return upd; 627 #else 628 return false; 629 #endif 630 } 631 632 /* Cake has several subtle multiple bit settings. In these cases you 633 * would be matching triple isolate mode as well. 634 */ 635 636 static bool cake_dsrc(int flow_mode) 637 { 638 return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; 639 } 640 641 static bool cake_ddst(int flow_mode) 642 { 643 return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; 644 } 645 646 static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q, 647 struct cake_flow *flow, 648 int flow_mode) 649 { 650 if (likely(cake_dsrc(flow_mode) && 651 q->hosts[flow->srchost].srchost_bulk_flow_count)) 652 q->hosts[flow->srchost].srchost_bulk_flow_count--; 653 } 654 655 static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q, 656 struct cake_flow *flow, 657 int flow_mode) 658 { 659 if (likely(cake_dsrc(flow_mode) && 660 q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES)) 661 q->hosts[flow->srchost].srchost_bulk_flow_count++; 662 } 663 664 static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q, 665 struct cake_flow *flow, 666 int flow_mode) 667 { 668 if (likely(cake_ddst(flow_mode) && 669 q->hosts[flow->dsthost].dsthost_bulk_flow_count)) 670 q->hosts[flow->dsthost].dsthost_bulk_flow_count--; 671 } 672 673 static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q, 674 struct cake_flow *flow, 675 int flow_mode) 676 { 677 if (likely(cake_ddst(flow_mode) && 678 q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES)) 679 q->hosts[flow->dsthost].dsthost_bulk_flow_count++; 680 } 681 682 static u16 cake_get_flow_quantum(struct cake_tin_data *q, 683 struct cake_flow *flow, 684 int flow_mode) 685 { 686 u16 host_load = 1; 687 688 if (cake_dsrc(flow_mode)) 689 host_load = max(host_load, 690 q->hosts[flow->srchost].srchost_bulk_flow_count); 691 692 if (cake_ddst(flow_mode)) 693 host_load = max(host_load, 694 q->hosts[flow->dsthost].dsthost_bulk_flow_count); 695 696 /* The get_random_u16() is a way to apply dithering to avoid 697 * accumulating roundoff errors 698 */ 699 return (q->flow_quantum * quantum_div[host_load] + 700 get_random_u16()) >> 16; 701 } 702 703 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, 704 int flow_mode, u16 flow_override, u16 host_override) 705 { 706 bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); 707 bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); 708 bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); 709 u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; 710 u16 reduced_hash, srchost_idx, dsthost_idx; 711 struct flow_keys keys, host_keys; 712 bool use_skbhash = skb->l4_hash; 713 714 if (unlikely(flow_mode == CAKE_FLOW_NONE)) 715 return 0; 716 717 /* If both overrides are set, or we can use the SKB hash and nat mode is 718 * disabled, we can skip packet dissection entirely. If nat mode is 719 * enabled there's another check below after doing the conntrack lookup. 
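 *
 * For example, in plain "flows" mode no host hash is needed, so a packet
 * with a valid skb->l4_hash and NAT mode disabled skips the flow dissector
 * entirely; in the default "triple-isolate" mode both flow and host hashes
 * are wanted, so dissection is only skipped when a host override is supplied
 * and the flow hash is either overridden as well or usable from skb->hash
 * with NAT mode off.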
720 */ 721 if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) 722 goto skip_hash; 723 724 skb_flow_dissect_flow_keys(skb, &keys, 725 FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); 726 727 /* Don't use the SKB hash if we change the lookup keys from conntrack */ 728 if (nat_enabled && cake_update_flowkeys(&keys, skb)) 729 use_skbhash = false; 730 731 /* If we can still use the SKB hash and don't need the host hash, we can 732 * skip the rest of the hashing procedure 733 */ 734 if (use_skbhash && !hash_hosts) 735 goto skip_hash; 736 737 /* flow_hash_from_keys() sorts the addresses by value, so we have 738 * to preserve their order in a separate data structure to treat 739 * src and dst host addresses as independently selectable. 740 */ 741 host_keys = keys; 742 host_keys.ports.ports = 0; 743 host_keys.basic.ip_proto = 0; 744 host_keys.keyid.keyid = 0; 745 host_keys.tags.flow_label = 0; 746 747 switch (host_keys.control.addr_type) { 748 case FLOW_DISSECTOR_KEY_IPV4_ADDRS: 749 host_keys.addrs.v4addrs.src = 0; 750 dsthost_hash = flow_hash_from_keys(&host_keys); 751 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; 752 host_keys.addrs.v4addrs.dst = 0; 753 srchost_hash = flow_hash_from_keys(&host_keys); 754 break; 755 756 case FLOW_DISSECTOR_KEY_IPV6_ADDRS: 757 memset(&host_keys.addrs.v6addrs.src, 0, 758 sizeof(host_keys.addrs.v6addrs.src)); 759 dsthost_hash = flow_hash_from_keys(&host_keys); 760 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 761 memset(&host_keys.addrs.v6addrs.dst, 0, 762 sizeof(host_keys.addrs.v6addrs.dst)); 763 srchost_hash = flow_hash_from_keys(&host_keys); 764 break; 765 766 default: 767 dsthost_hash = 0; 768 srchost_hash = 0; 769 } 770 771 /* This *must* be after the above switch, since as a 772 * side-effect it sorts the src and dst addresses. 773 */ 774 if (hash_flows && !use_skbhash) 775 flow_hash = flow_hash_from_keys(&keys); 776 777 skip_hash: 778 if (flow_override) 779 flow_hash = flow_override - 1; 780 else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS)) 781 flow_hash = skb->hash; 782 if (host_override) { 783 dsthost_hash = host_override - 1; 784 srchost_hash = host_override - 1; 785 } 786 787 if (!(flow_mode & CAKE_FLOW_FLOWS)) { 788 if (flow_mode & CAKE_FLOW_SRC_IP) 789 flow_hash ^= srchost_hash; 790 791 if (flow_mode & CAKE_FLOW_DST_IP) 792 flow_hash ^= dsthost_hash; 793 } 794 795 reduced_hash = flow_hash % CAKE_QUEUES; 796 797 /* set-associative hashing */ 798 /* fast path if no hash collision (direct lookup succeeds) */ 799 if (likely(q->tags[reduced_hash] == flow_hash && 800 q->flows[reduced_hash].set)) { 801 q->way_directs++; 802 } else { 803 u32 inner_hash = reduced_hash % CAKE_SET_WAYS; 804 u32 outer_hash = reduced_hash - inner_hash; 805 bool allocate_src = false; 806 bool allocate_dst = false; 807 u32 i, k; 808 809 /* check if any active queue in the set is reserved for 810 * this flow. 811 */ 812 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 813 i++, k = (k + 1) % CAKE_SET_WAYS) { 814 if (q->tags[outer_hash + k] == flow_hash) { 815 if (i) 816 WRITE_ONCE(q->way_hits, q->way_hits + 1); 817 818 if (!q->flows[outer_hash + k].set) { 819 /* need to increment host refcnts */ 820 allocate_src = cake_dsrc(flow_mode); 821 allocate_dst = cake_ddst(flow_mode); 822 } 823 824 goto found; 825 } 826 } 827 828 /* no queue is reserved for this flow, look for an 829 * empty one. 
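 *
 * As an illustration of the set-associative layout: a flow_hash that reduces
 * to queue 517 probes the 8-way set covering queues 512..519, starting at
 * way 5 (517 % CAKE_SET_WAYS) and wrapping; the scan below simply carries on
 * from wherever the tag search above stopped.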
830 */ 831 for (i = 0; i < CAKE_SET_WAYS; 832 i++, k = (k + 1) % CAKE_SET_WAYS) { 833 if (!q->flows[outer_hash + k].set) { 834 WRITE_ONCE(q->way_misses, q->way_misses + 1); 835 allocate_src = cake_dsrc(flow_mode); 836 allocate_dst = cake_ddst(flow_mode); 837 goto found; 838 } 839 } 840 841 /* With no empty queues, default to the original 842 * queue, accept the collision, update the host tags. 843 */ 844 WRITE_ONCE(q->way_collisions, q->way_collisions + 1); 845 allocate_src = cake_dsrc(flow_mode); 846 allocate_dst = cake_ddst(flow_mode); 847 848 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { 849 cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); 850 cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); 851 } 852 found: 853 /* reserve queue for future packets in same flow */ 854 reduced_hash = outer_hash + k; 855 q->tags[reduced_hash] = flow_hash; 856 857 if (allocate_src) { 858 srchost_idx = srchost_hash % CAKE_QUEUES; 859 inner_hash = srchost_idx % CAKE_SET_WAYS; 860 outer_hash = srchost_idx - inner_hash; 861 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 862 i++, k = (k + 1) % CAKE_SET_WAYS) { 863 if (q->hosts[outer_hash + k].srchost_tag == 864 srchost_hash) 865 goto found_src; 866 } 867 for (i = 0; i < CAKE_SET_WAYS; 868 i++, k = (k + 1) % CAKE_SET_WAYS) { 869 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) 870 break; 871 } 872 q->hosts[outer_hash + k].srchost_tag = srchost_hash; 873 found_src: 874 srchost_idx = outer_hash + k; 875 q->flows[reduced_hash].srchost = srchost_idx; 876 877 if (q->flows[reduced_hash].set == CAKE_SET_BULK) 878 cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); 879 } 880 881 if (allocate_dst) { 882 dsthost_idx = dsthost_hash % CAKE_QUEUES; 883 inner_hash = dsthost_idx % CAKE_SET_WAYS; 884 outer_hash = dsthost_idx - inner_hash; 885 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 886 i++, k = (k + 1) % CAKE_SET_WAYS) { 887 if (q->hosts[outer_hash + k].dsthost_tag == 888 dsthost_hash) 889 goto found_dst; 890 } 891 for (i = 0; i < CAKE_SET_WAYS; 892 i++, k = (k + 1) % CAKE_SET_WAYS) { 893 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) 894 break; 895 } 896 q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; 897 found_dst: 898 dsthost_idx = outer_hash + k; 899 q->flows[reduced_hash].dsthost = dsthost_idx; 900 901 if (q->flows[reduced_hash].set == CAKE_SET_BULK) 902 cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); 903 } 904 } 905 906 return reduced_hash; 907 } 908 909 /* helper functions : might be changed when/if skb use a standard list_head */ 910 /* remove one skb from head of slot queue */ 911 912 static struct sk_buff *dequeue_head(struct cake_flow *flow) 913 { 914 struct sk_buff *skb = flow->head; 915 916 if (skb) { 917 flow->head = skb->next; 918 skb_mark_not_on_list(skb); 919 } 920 921 return skb; 922 } 923 924 /* add skb to flow queue (tail add) */ 925 926 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) 927 { 928 if (!flow->head) 929 flow->head = skb; 930 else 931 flow->tail->next = skb; 932 flow->tail = skb; 933 skb->next = NULL; 934 } 935 936 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, 937 struct ipv6hdr *buf) 938 { 939 unsigned int offset = skb_network_offset(skb); 940 struct iphdr *iph; 941 942 iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); 943 944 if (!iph) 945 return NULL; 946 947 if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) 948 return skb_header_pointer(skb, offset + 
iph->ihl * 4, 949 sizeof(struct ipv6hdr), buf); 950 951 else if (iph->version == 4) 952 return iph; 953 954 else if (iph->version == 6) 955 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), 956 buf); 957 958 return NULL; 959 } 960 961 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, 962 void *buf, unsigned int bufsize) 963 { 964 unsigned int offset = skb_network_offset(skb); 965 const struct ipv6hdr *ipv6h; 966 const struct tcphdr *tcph; 967 const struct iphdr *iph; 968 struct ipv6hdr _ipv6h; 969 struct tcphdr _tcph; 970 971 ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); 972 973 if (!ipv6h) 974 return NULL; 975 976 if (ipv6h->version == 4) { 977 iph = (struct iphdr *)ipv6h; 978 offset += iph->ihl * 4; 979 980 /* special-case 6in4 tunnelling, as that is a common way to get 981 * v6 connectivity in the home 982 */ 983 if (iph->protocol == IPPROTO_IPV6) { 984 ipv6h = skb_header_pointer(skb, offset, 985 sizeof(_ipv6h), &_ipv6h); 986 987 if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) 988 return NULL; 989 990 offset += sizeof(struct ipv6hdr); 991 992 } else if (iph->protocol != IPPROTO_TCP) { 993 return NULL; 994 } 995 996 } else if (ipv6h->version == 6) { 997 if (ipv6h->nexthdr != IPPROTO_TCP) 998 return NULL; 999 1000 offset += sizeof(struct ipv6hdr); 1001 } else { 1002 return NULL; 1003 } 1004 1005 tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); 1006 if (!tcph || tcph->doff < 5) 1007 return NULL; 1008 1009 return skb_header_pointer(skb, offset, 1010 min(__tcp_hdrlen(tcph), bufsize), buf); 1011 } 1012 1013 static const void *cake_get_tcpopt(const struct tcphdr *tcph, 1014 int code, int *oplen) 1015 { 1016 /* inspired by tcp_parse_options in tcp_input.c */ 1017 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1018 const u8 *ptr = (const u8 *)(tcph + 1); 1019 1020 while (length > 0) { 1021 int opcode = *ptr++; 1022 int opsize; 1023 1024 if (opcode == TCPOPT_EOL) 1025 break; 1026 if (opcode == TCPOPT_NOP) { 1027 length--; 1028 continue; 1029 } 1030 if (length < 2) 1031 break; 1032 opsize = *ptr++; 1033 if (opsize < 2 || opsize > length) 1034 break; 1035 1036 if (opcode == code) { 1037 *oplen = opsize; 1038 return ptr; 1039 } 1040 1041 ptr += opsize - 2; 1042 length -= opsize; 1043 } 1044 1045 return NULL; 1046 } 1047 1048 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more 1049 * bytes than the other. In the case where both sequences ACKs bytes that the 1050 * other doesn't, A is considered greater. DSACKs in A also makes A be 1051 * considered greater. 
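 *
 * Worked example: if A carries the single SACK block [1000,2000) and B
 * carries [1000,3000), every byte SACKed by A is covered by B and B SACKs
 * more, so the result is 1 (B greater). If A additionally carried
 * [4000,5000), that block is not covered by B and the result is -1
 * (A greater).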
1052 * 1053 * @return -1, 0 or 1 as normal compare functions 1054 */ 1055 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, 1056 const struct tcphdr *tcph_b) 1057 { 1058 const struct tcp_sack_block_wire *sack_a, *sack_b; 1059 u32 ack_seq_a = ntohl(tcph_a->ack_seq); 1060 u32 bytes_a = 0, bytes_b = 0; 1061 int oplen_a, oplen_b; 1062 bool first = true; 1063 1064 sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); 1065 sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); 1066 1067 /* pointers point to option contents */ 1068 oplen_a -= TCPOLEN_SACK_BASE; 1069 oplen_b -= TCPOLEN_SACK_BASE; 1070 1071 if (sack_a && oplen_a >= sizeof(*sack_a) && 1072 (!sack_b || oplen_b < sizeof(*sack_b))) 1073 return -1; 1074 else if (sack_b && oplen_b >= sizeof(*sack_b) && 1075 (!sack_a || oplen_a < sizeof(*sack_a))) 1076 return 1; 1077 else if ((!sack_a || oplen_a < sizeof(*sack_a)) && 1078 (!sack_b || oplen_b < sizeof(*sack_b))) 1079 return 0; 1080 1081 while (oplen_a >= sizeof(*sack_a)) { 1082 const struct tcp_sack_block_wire *sack_tmp = sack_b; 1083 u32 start_a = get_unaligned_be32(&sack_a->start_seq); 1084 u32 end_a = get_unaligned_be32(&sack_a->end_seq); 1085 int oplen_tmp = oplen_b; 1086 bool found = false; 1087 1088 /* DSACK; always considered greater to prevent dropping */ 1089 if (before(start_a, ack_seq_a)) 1090 return -1; 1091 1092 bytes_a += end_a - start_a; 1093 1094 while (oplen_tmp >= sizeof(*sack_tmp)) { 1095 u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); 1096 u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); 1097 1098 /* first time through we count the total size */ 1099 if (first) 1100 bytes_b += end_b - start_b; 1101 1102 if (!after(start_b, start_a) && !before(end_b, end_a)) { 1103 found = true; 1104 if (!first) 1105 break; 1106 } 1107 oplen_tmp -= sizeof(*sack_tmp); 1108 sack_tmp++; 1109 } 1110 1111 if (!found) 1112 return -1; 1113 1114 oplen_a -= sizeof(*sack_a); 1115 sack_a++; 1116 first = false; 1117 } 1118 1119 /* If we made it this far, all ranges SACKed by A are covered by B, so 1120 * either the SACKs are equal, or B SACKs more bytes. 1121 */ 1122 return bytes_b > bytes_a ? 
1 : 0; 1123 } 1124 1125 static void cake_tcph_get_tstamp(const struct tcphdr *tcph, 1126 u32 *tsval, u32 *tsecr) 1127 { 1128 const u8 *ptr; 1129 int opsize; 1130 1131 ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); 1132 1133 if (ptr && opsize == TCPOLEN_TIMESTAMP) { 1134 *tsval = get_unaligned_be32(ptr); 1135 *tsecr = get_unaligned_be32(ptr + 4); 1136 } 1137 } 1138 1139 static bool cake_tcph_may_drop(const struct tcphdr *tcph, 1140 u32 tstamp_new, u32 tsecr_new) 1141 { 1142 /* inspired by tcp_parse_options in tcp_input.c */ 1143 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1144 const u8 *ptr = (const u8 *)(tcph + 1); 1145 u32 tstamp, tsecr; 1146 1147 /* 3 reserved flags must be unset to avoid future breakage 1148 * ACK must be set 1149 * ECE/CWR are handled separately 1150 * All other flags URG/PSH/RST/SYN/FIN must be unset 1151 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) 1152 * 0x00C00000 = CWR/ECE (handled separately) 1153 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 1154 */ 1155 if (((tcp_flag_word(tcph) & 1156 cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) 1157 return false; 1158 1159 while (length > 0) { 1160 int opcode = *ptr++; 1161 int opsize; 1162 1163 if (opcode == TCPOPT_EOL) 1164 break; 1165 if (opcode == TCPOPT_NOP) { 1166 length--; 1167 continue; 1168 } 1169 if (length < 2) 1170 break; 1171 opsize = *ptr++; 1172 if (opsize < 2 || opsize > length) 1173 break; 1174 1175 switch (opcode) { 1176 case TCPOPT_MD5SIG: /* doesn't influence state */ 1177 break; 1178 1179 case TCPOPT_SACK: /* stricter checking performed later */ 1180 if (opsize % 8 != 2) 1181 return false; 1182 break; 1183 1184 case TCPOPT_TIMESTAMP: 1185 /* only drop timestamps lower than new */ 1186 if (opsize != TCPOLEN_TIMESTAMP) 1187 return false; 1188 tstamp = get_unaligned_be32(ptr); 1189 tsecr = get_unaligned_be32(ptr + 4); 1190 if (after(tstamp, tstamp_new) || 1191 after(tsecr, tsecr_new)) 1192 return false; 1193 break; 1194 1195 case TCPOPT_MSS: /* these should only be set on SYN */ 1196 case TCPOPT_WINDOW: 1197 case TCPOPT_SACK_PERM: 1198 case TCPOPT_FASTOPEN: 1199 case TCPOPT_EXP: 1200 default: /* don't drop if any unknown options are present */ 1201 return false; 1202 } 1203 1204 ptr += opsize - 2; 1205 length -= opsize; 1206 } 1207 1208 return true; 1209 } 1210 1211 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, 1212 struct cake_flow *flow) 1213 { 1214 bool aggressive = q->config->ack_filter == CAKE_ACK_AGGRESSIVE; 1215 struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; 1216 struct sk_buff *skb_check, *skb_prev = NULL; 1217 const struct ipv6hdr *ipv6h, *ipv6h_check; 1218 unsigned char _tcph[64], _tcph_check[64]; 1219 const struct tcphdr *tcph, *tcph_check; 1220 const struct iphdr *iph, *iph_check; 1221 struct ipv6hdr _iph, _iph_check; 1222 const struct sk_buff *skb; 1223 int seglen, num_found = 0; 1224 u32 tstamp = 0, tsecr = 0; 1225 __be32 elig_flags = 0; 1226 int sack_comp; 1227 1228 /* no other possible ACKs to filter */ 1229 if (flow->head == flow->tail) 1230 return NULL; 1231 1232 skb = flow->tail; 1233 tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); 1234 iph = cake_get_iphdr(skb, &_iph); 1235 if (!tcph) 1236 return NULL; 1237 1238 cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); 1239 1240 /* the 'triggering' packet need only have the ACK flag set. 1241 * also check that SYN is not set, as there won't be any previous ACKs. 
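 *
 * Example of the resulting behaviour: with pure ACKs for bytes 1000 and 2000
 * of the same connection already queued, a newly enqueued ACK for 3000 lets
 * the filter drop the ACK for 1000 (the one closest to the head). If only
 * the ACK for 2000 were queued, it is dropped only in aggressive mode, and
 * only when it sits immediately before the new ACK with identical ECE/CWR
 * flags.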
1242 */ 1243 if ((tcp_flag_word(tcph) & 1244 (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) 1245 return NULL; 1246 1247 /* the 'triggering' ACK is at the tail of the queue, we have already 1248 * returned if it is the only packet in the flow. loop through the rest 1249 * of the queue looking for pure ACKs with the same 5-tuple as the 1250 * triggering one. 1251 */ 1252 for (skb_check = flow->head; 1253 skb_check && skb_check != skb; 1254 skb_prev = skb_check, skb_check = skb_check->next) { 1255 iph_check = cake_get_iphdr(skb_check, &_iph_check); 1256 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, 1257 sizeof(_tcph_check)); 1258 1259 /* only TCP packets with matching 5-tuple are eligible, and only 1260 * drop safe headers 1261 */ 1262 if (!tcph_check || iph->version != iph_check->version || 1263 tcph_check->source != tcph->source || 1264 tcph_check->dest != tcph->dest) 1265 continue; 1266 1267 if (iph_check->version == 4) { 1268 if (iph_check->saddr != iph->saddr || 1269 iph_check->daddr != iph->daddr) 1270 continue; 1271 1272 seglen = iph_totlen(skb, iph_check) - 1273 (4 * iph_check->ihl); 1274 } else if (iph_check->version == 6) { 1275 ipv6h = (struct ipv6hdr *)iph; 1276 ipv6h_check = (struct ipv6hdr *)iph_check; 1277 1278 if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || 1279 ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) 1280 continue; 1281 1282 seglen = ipv6_payload_len(skb, ipv6h_check); 1283 } else { 1284 WARN_ON(1); /* shouldn't happen */ 1285 continue; 1286 } 1287 1288 /* If the ECE/CWR flags changed from the previous eligible 1289 * packet in the same flow, we should no longer be dropping that 1290 * previous packet as this would lose information. 1291 */ 1292 if (elig_ack && (tcp_flag_word(tcph_check) & 1293 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { 1294 elig_ack = NULL; 1295 elig_ack_prev = NULL; 1296 num_found--; 1297 } 1298 1299 /* Check TCP options and flags, don't drop ACKs with segment 1300 * data, and don't drop ACKs with a higher cumulative ACK 1301 * counter than the triggering packet. Check ACK seqno here to 1302 * avoid parsing SACK options of packets we are going to exclude 1303 * anyway. 1304 */ 1305 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || 1306 (seglen - __tcp_hdrlen(tcph_check)) != 0 || 1307 after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) 1308 continue; 1309 1310 /* Check SACK options. The triggering packet must SACK more data 1311 * than the ACK under consideration, or SACK the same range but 1312 * have a larger cumulative ACK counter. The latter is a 1313 * pathological case, but is contained in the following check 1314 * anyway, just to be safe. 1315 */ 1316 sack_comp = cake_tcph_sack_compare(tcph_check, tcph); 1317 1318 if (sack_comp < 0 || 1319 (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && 1320 sack_comp == 0)) 1321 continue; 1322 1323 /* At this point we have found an eligible pure ACK to drop; if 1324 * we are in aggressive mode, we are done. Otherwise, keep 1325 * searching unless this is the second eligible ACK we 1326 * found. 1327 * 1328 * Since we want to drop ACK closest to the head of the queue, 1329 * save the first eligible ACK we find, even if we need to loop 1330 * again. 1331 */ 1332 if (!elig_ack) { 1333 elig_ack = skb_check; 1334 elig_ack_prev = skb_prev; 1335 elig_flags = (tcp_flag_word(tcph_check) 1336 & (TCP_FLAG_ECE | TCP_FLAG_CWR)); 1337 } 1338 1339 if (num_found++ > 0) 1340 goto found; 1341 } 1342 1343 /* We made it through the queue without finding two eligible ACKs . 
If 1344 * we found a single eligible ACK we can drop it in aggressive mode if 1345 * we can guarantee that this does not interfere with ECN flag 1346 * information. We ensure this by dropping it only if the enqueued 1347 * packet is consecutive with the eligible ACK, and their flags match. 1348 */ 1349 if (elig_ack && aggressive && elig_ack->next == skb && 1350 (elig_flags == (tcp_flag_word(tcph) & 1351 (TCP_FLAG_ECE | TCP_FLAG_CWR)))) 1352 goto found; 1353 1354 return NULL; 1355 1356 found: 1357 if (elig_ack_prev) 1358 elig_ack_prev->next = elig_ack->next; 1359 else 1360 flow->head = elig_ack->next; 1361 1362 skb_mark_not_on_list(elig_ack); 1363 1364 return elig_ack; 1365 } 1366 1367 static u64 cake_ewma(u64 avg, u64 sample, u32 shift) 1368 { 1369 avg -= avg >> shift; 1370 avg += sample >> shift; 1371 return avg; 1372 } 1373 1374 static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off) 1375 { 1376 struct cake_sched_config *q = qd->config; 1377 1378 if (q->rate_flags & CAKE_FLAG_OVERHEAD) 1379 len -= off; 1380 1381 if (qd->max_netlen < len) 1382 WRITE_ONCE(qd->max_netlen, len); 1383 if (qd->min_netlen > len) 1384 WRITE_ONCE(qd->min_netlen, len); 1385 1386 len += q->rate_overhead; 1387 1388 if (len < q->rate_mpu) 1389 len = q->rate_mpu; 1390 1391 if (q->atm_mode == CAKE_ATM_ATM) { 1392 len += 47; 1393 len /= 48; 1394 len *= 53; 1395 } else if (q->atm_mode == CAKE_ATM_PTM) { 1396 /* Add one byte per 64 bytes or part thereof. 1397 * This is conservative and easier to calculate than the 1398 * precise value. 1399 */ 1400 len += (len + 63) / 64; 1401 } 1402 1403 if (qd->max_adjlen < len) 1404 WRITE_ONCE(qd->max_adjlen, len); 1405 if (qd->min_adjlen > len) 1406 WRITE_ONCE(qd->min_adjlen, len); 1407 1408 return len; 1409 } 1410 1411 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) 1412 { 1413 const struct skb_shared_info *shinfo = skb_shinfo(skb); 1414 unsigned int hdr_len, last_len = 0; 1415 u32 off = skb_network_offset(skb); 1416 u16 segs = qdisc_pkt_segs(skb); 1417 u32 len = qdisc_pkt_len(skb); 1418 1419 WRITE_ONCE(q->avg_netoff, cake_ewma(q->avg_netoff, off << 16, 8)); 1420 1421 if (segs == 1) 1422 return cake_calc_overhead(q, len, off); 1423 1424 /* borrowed from qdisc_pkt_len_segs_init() */ 1425 if (!skb->encapsulation) 1426 hdr_len = skb_transport_offset(skb); 1427 else 1428 hdr_len = skb_inner_transport_offset(skb); 1429 1430 /* + transport layer */ 1431 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | 1432 SKB_GSO_TCPV6))) { 1433 const struct tcphdr *th; 1434 struct tcphdr _tcphdr; 1435 1436 th = skb_header_pointer(skb, hdr_len, 1437 sizeof(_tcphdr), &_tcphdr); 1438 if (likely(th)) 1439 hdr_len += __tcp_hdrlen(th); 1440 } else { 1441 struct udphdr _udphdr; 1442 1443 if (skb_header_pointer(skb, hdr_len, 1444 sizeof(_udphdr), &_udphdr)) 1445 hdr_len += sizeof(struct udphdr); 1446 } 1447 1448 len = shinfo->gso_size + hdr_len; 1449 last_len = skb->len - shinfo->gso_size * (segs - 1); 1450 1451 return (cake_calc_overhead(q, len, off) * (segs - 1) + 1452 cake_calc_overhead(q, last_len, off)); 1453 } 1454 1455 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) 1456 { 1457 struct cake_heap_entry ii = q->overflow_heap[i]; 1458 struct cake_heap_entry jj = q->overflow_heap[j]; 1459 1460 q->overflow_heap[i] = jj; 1461 q->overflow_heap[j] = ii; 1462 1463 q->tins[ii.t].overflow_idx[ii.b] = j; 1464 q->tins[jj.t].overflow_idx[jj.b] = i; 1465 } 1466 1467 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) 1468 { 1469 
struct cake_heap_entry ii = q->overflow_heap[i]; 1470 1471 return q->tins[ii.t].backlogs[ii.b]; 1472 } 1473 1474 static void cake_heapify(struct cake_sched_data *q, u16 i) 1475 { 1476 static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; 1477 u32 mb = cake_heap_get_backlog(q, i); 1478 u32 m = i; 1479 1480 while (m < a) { 1481 u32 l = m + m + 1; 1482 u32 r = l + 1; 1483 1484 if (l < a) { 1485 u32 lb = cake_heap_get_backlog(q, l); 1486 1487 if (lb > mb) { 1488 m = l; 1489 mb = lb; 1490 } 1491 } 1492 1493 if (r < a) { 1494 u32 rb = cake_heap_get_backlog(q, r); 1495 1496 if (rb > mb) { 1497 m = r; 1498 mb = rb; 1499 } 1500 } 1501 1502 if (m != i) { 1503 cake_heap_swap(q, i, m); 1504 i = m; 1505 } else { 1506 break; 1507 } 1508 } 1509 } 1510 1511 static void cake_heapify_up(struct cake_sched_data *q, u16 i) 1512 { 1513 while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { 1514 u16 p = (i - 1) >> 1; 1515 u32 ib = cake_heap_get_backlog(q, i); 1516 u32 pb = cake_heap_get_backlog(q, p); 1517 1518 if (ib > pb) { 1519 cake_heap_swap(q, i, p); 1520 i = p; 1521 } else { 1522 break; 1523 } 1524 } 1525 } 1526 1527 static int cake_advance_shaper(struct cake_sched_data *q, 1528 struct cake_tin_data *b, 1529 struct sk_buff *skb, 1530 ktime_t now, bool drop) 1531 { 1532 u32 len = get_cobalt_cb(skb)->adjusted_len; 1533 1534 /* charge packet bandwidth to this tin 1535 * and to the global shaper. 1536 */ 1537 if (q->rate_ns) { 1538 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; 1539 u64 global_dur = (len * q->rate_ns) >> q->rate_shft; 1540 u64 failsafe_dur = global_dur + (global_dur >> 1); 1541 1542 if (ktime_before(b->time_next_packet, now)) 1543 b->time_next_packet = ktime_add_ns(b->time_next_packet, 1544 tin_dur); 1545 1546 else if (ktime_before(b->time_next_packet, 1547 ktime_add_ns(now, tin_dur))) 1548 b->time_next_packet = ktime_add_ns(now, tin_dur); 1549 1550 q->time_next_packet = ktime_add_ns(q->time_next_packet, 1551 global_dur); 1552 if (!drop) 1553 q->failsafe_next_packet = \ 1554 ktime_add_ns(q->failsafe_next_packet, 1555 failsafe_dur); 1556 } 1557 return len; 1558 } 1559 1560 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) 1561 { 1562 struct cake_sched_data *q = qdisc_priv(sch); 1563 ktime_t now = ktime_get(); 1564 u32 idx = 0, tin = 0, len; 1565 struct cake_heap_entry qq; 1566 struct cake_tin_data *b; 1567 struct cake_flow *flow; 1568 struct sk_buff *skb; 1569 1570 if (!q->overflow_timeout) { 1571 int i; 1572 /* Build fresh max-heap */ 1573 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--) 1574 cake_heapify(q, i); 1575 } 1576 q->overflow_timeout = 65535; 1577 1578 /* select longest queue for pruning */ 1579 qq = q->overflow_heap[0]; 1580 tin = qq.t; 1581 idx = qq.b; 1582 1583 b = &q->tins[tin]; 1584 flow = &b->flows[idx]; 1585 skb = dequeue_head(flow); 1586 if (unlikely(!skb)) { 1587 /* heap has gone wrong, rebuild it next time */ 1588 q->overflow_timeout = 0; 1589 return idx + (tin << 16); 1590 } 1591 1592 if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) 1593 WRITE_ONCE(b->unresponsive_flow_count, 1594 b->unresponsive_flow_count + 1); 1595 1596 len = qdisc_pkt_len(skb); 1597 q->buffer_used -= skb->truesize; 1598 b->backlogs[idx] -= len; 1599 WRITE_ONCE(b->tin_backlog, b->tin_backlog - len); 1600 sch->qstats.backlog -= len; 1601 1602 flow->dropped++; 1603 WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1); 1604 1605 if (q->config->rate_flags & CAKE_FLAG_INGRESS) 1606 cake_advance_shaper(q, b, skb, now, true); 1607 1608 qdisc_drop_reason(skb, sch, to_free, 
QDISC_DROP_OVERLIMIT); 1609 sch->q.qlen--; 1610 1611 cake_heapify(q, 0); 1612 1613 return idx + (tin << 16); 1614 } 1615 1616 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) 1617 { 1618 const int offset = skb_network_offset(skb); 1619 u16 *buf, buf_; 1620 u8 dscp; 1621 1622 switch (skb_protocol(skb, true)) { 1623 case htons(ETH_P_IP): 1624 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1625 if (unlikely(!buf)) 1626 return 0; 1627 1628 /* ToS is in the second byte of iphdr */ 1629 dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2; 1630 1631 if (wash && dscp) { 1632 const int wlen = offset + sizeof(struct iphdr); 1633 1634 if (!pskb_may_pull(skb, wlen) || 1635 skb_try_make_writable(skb, wlen)) 1636 return 0; 1637 1638 ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); 1639 } 1640 1641 return dscp; 1642 1643 case htons(ETH_P_IPV6): 1644 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1645 if (unlikely(!buf)) 1646 return 0; 1647 1648 /* Traffic class is in the first and second bytes of ipv6hdr */ 1649 dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2; 1650 1651 if (wash && dscp) { 1652 const int wlen = offset + sizeof(struct ipv6hdr); 1653 1654 if (!pskb_may_pull(skb, wlen) || 1655 skb_try_make_writable(skb, wlen)) 1656 return 0; 1657 1658 ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); 1659 } 1660 1661 return dscp; 1662 1663 case htons(ETH_P_ARP): 1664 return 0x38; /* CS7 - Net Control */ 1665 1666 default: 1667 /* If there is no Diffserv field, treat as best-effort */ 1668 return 0; 1669 } 1670 } 1671 1672 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, 1673 struct sk_buff *skb) 1674 { 1675 struct cake_sched_data *qd = qdisc_priv(sch); 1676 struct cake_sched_config *q = qd->config; 1677 u32 tin, mark; 1678 bool wash; 1679 u8 dscp; 1680 1681 /* Tin selection: Default to diffserv-based selection, allow overriding 1682 * using firewall marks or skb->priority. Call DSCP parsing early if 1683 * wash is enabled, otherwise defer to below to skip unneeded parsing. 
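 *
 * Illustrative mapping, assuming tin_index points at the diffserv3 table
 * above: EF (DSCP 46), CS6 and CS7 land in tin 2, LE (1) and CS1 (8) in
 * tin 1, and everything else in tin 0. A firewall mark or a matching
 * skb->priority minor number takes precedence over the DSCP lookup.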
1684 */ 1685 mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; 1686 wash = !!(q->rate_flags & CAKE_FLAG_WASH); 1687 if (wash) 1688 dscp = cake_handle_diffserv(skb, wash); 1689 1690 if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) 1691 tin = 0; 1692 1693 else if (mark && mark <= qd->tin_cnt) 1694 tin = qd->tin_order[mark - 1]; 1695 1696 else if (TC_H_MAJ(skb->priority) == sch->handle && 1697 TC_H_MIN(skb->priority) > 0 && 1698 TC_H_MIN(skb->priority) <= qd->tin_cnt) 1699 tin = qd->tin_order[TC_H_MIN(skb->priority) - 1]; 1700 1701 else { 1702 if (!wash) 1703 dscp = cake_handle_diffserv(skb, wash); 1704 tin = qd->tin_index[dscp]; 1705 1706 if (unlikely(tin >= qd->tin_cnt)) 1707 tin = 0; 1708 } 1709 1710 return &qd->tins[tin]; 1711 } 1712 1713 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, 1714 struct sk_buff *skb, int flow_mode, int *qerr) 1715 { 1716 struct cake_sched_data *q = qdisc_priv(sch); 1717 struct tcf_proto *filter; 1718 struct tcf_result res; 1719 u16 flow = 0, host = 0; 1720 int result; 1721 1722 filter = rcu_dereference_bh(q->filter_list); 1723 if (!filter) 1724 goto hash; 1725 1726 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 1727 result = tcf_classify(skb, NULL, filter, &res, false); 1728 1729 if (result >= 0) { 1730 #ifdef CONFIG_NET_CLS_ACT 1731 switch (result) { 1732 case TC_ACT_STOLEN: 1733 case TC_ACT_QUEUED: 1734 case TC_ACT_TRAP: 1735 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; 1736 fallthrough; 1737 case TC_ACT_SHOT: 1738 return 0; 1739 } 1740 #endif 1741 if (TC_H_MIN(res.classid) <= CAKE_QUEUES) 1742 flow = TC_H_MIN(res.classid); 1743 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16)) 1744 host = TC_H_MAJ(res.classid) >> 16; 1745 } 1746 hash: 1747 *t = cake_select_tin(sch, skb); 1748 return cake_hash(*t, skb, flow_mode, flow, host) + 1; 1749 } 1750 1751 static void cake_reconfigure(struct Qdisc *sch); 1752 1753 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1754 struct sk_buff **to_free) 1755 { 1756 u32 idx, tin, prev_qlen, prev_backlog, drop_id; 1757 struct cake_sched_data *q = qdisc_priv(sch); 1758 int len = qdisc_pkt_len(skb), ret; 1759 struct sk_buff *ack = NULL; 1760 ktime_t now = ktime_get(); 1761 struct cake_tin_data *b; 1762 struct cake_flow *flow; 1763 bool same_flow = false; 1764 1765 /* choose flow to insert into */ 1766 idx = cake_classify(sch, &b, skb, q->config->flow_mode, &ret); 1767 if (idx == 0) { 1768 if (ret & __NET_XMIT_BYPASS) 1769 qdisc_qstats_drop(sch); 1770 __qdisc_drop(skb, to_free); 1771 return ret; 1772 } 1773 tin = (u32)(b - q->tins); 1774 idx--; 1775 flow = &b->flows[idx]; 1776 1777 /* ensure shaper state isn't stale */ 1778 if (!b->tin_backlog) { 1779 if (ktime_before(b->time_next_packet, now)) 1780 b->time_next_packet = now; 1781 1782 if (!sch->q.qlen) { 1783 if (ktime_before(q->time_next_packet, now)) { 1784 q->failsafe_next_packet = now; 1785 q->time_next_packet = now; 1786 } else if (ktime_after(q->time_next_packet, now) && 1787 ktime_after(q->failsafe_next_packet, now)) { 1788 u64 next = \ 1789 min(ktime_to_ns(q->time_next_packet), 1790 ktime_to_ns( 1791 q->failsafe_next_packet)); 1792 sch->qstats.overlimits++; 1793 qdisc_watchdog_schedule_ns(&q->watchdog, next); 1794 } 1795 } 1796 } 1797 1798 if (unlikely(len > b->max_skblen)) 1799 WRITE_ONCE(b->max_skblen, len); 1800 1801 if (qdisc_pkt_segs(skb) > 1 && q->config->rate_flags & CAKE_FLAG_SPLIT_GSO) { 1802 struct sk_buff *segs, *nskb; 1803 netdev_features_t features = netif_skb_features(skb); 1804 unsigned int slen = 0, numsegs = 0; 1805 1806 
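		/* Split the superpacket into individual segments so that each
		 * one gets its own enqueue timestamp and overhead-adjusted
		 * length; the qlen/backlog accounting for the original GSO
		 * skb is corrected afterwards via
		 * qdisc_tree_reduce_backlog().
		 */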
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 1807 if (IS_ERR_OR_NULL(segs)) 1808 return qdisc_drop(skb, sch, to_free); 1809 1810 skb_list_walk_safe(segs, segs, nskb) { 1811 skb_mark_not_on_list(segs); 1812 qdisc_skb_cb(segs)->pkt_len = segs->len; 1813 qdisc_skb_cb(segs)->pkt_segs = 1; 1814 cobalt_set_enqueue_time(segs, now); 1815 get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, 1816 segs); 1817 flow_queue_add(flow, segs); 1818 1819 sch->q.qlen++; 1820 numsegs++; 1821 slen += segs->len; 1822 q->buffer_used += segs->truesize; 1823 WRITE_ONCE(b->packets, b->packets + 1); 1824 } 1825 1826 /* stats */ 1827 b->backlogs[idx] += slen; 1828 sch->qstats.backlog += slen; 1829 q->avg_window_bytes += slen; 1830 WRITE_ONCE(b->bytes, b->bytes + slen); 1831 WRITE_ONCE(b->tin_backlog, b->tin_backlog + slen); 1832 1833 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); 1834 consume_skb(skb); 1835 } else { 1836 /* not splitting */ 1837 int ack_pkt_len = 0; 1838 1839 cobalt_set_enqueue_time(skb, now); 1840 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); 1841 flow_queue_add(flow, skb); 1842 1843 if (q->config->ack_filter) 1844 ack = cake_ack_filter(q, flow); 1845 1846 if (ack) { 1847 WRITE_ONCE(b->ack_drops, b->ack_drops + 1); 1848 sch->qstats.drops++; 1849 ack_pkt_len = qdisc_pkt_len(ack); 1850 WRITE_ONCE(b->bytes, b->bytes + ack_pkt_len); 1851 q->buffer_used += skb->truesize - ack->truesize; 1852 if (q->config->rate_flags & CAKE_FLAG_INGRESS) 1853 cake_advance_shaper(q, b, ack, now, true); 1854 1855 qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len); 1856 consume_skb(ack); 1857 } else { 1858 sch->q.qlen++; 1859 q->buffer_used += skb->truesize; 1860 } 1861 1862 /* stats */ 1863 WRITE_ONCE(b->packets, b->packets + 1); 1864 b->backlogs[idx] += len - ack_pkt_len; 1865 sch->qstats.backlog += len - ack_pkt_len; 1866 q->avg_window_bytes += len - ack_pkt_len; 1867 WRITE_ONCE(b->bytes, b->bytes + len - ack_pkt_len); 1868 WRITE_ONCE(b->tin_backlog, b->tin_backlog + len - ack_pkt_len); 1869 } 1870 1871 if (q->overflow_timeout) 1872 cake_heapify_up(q, b->overflow_idx[idx]); 1873 1874 /* incoming bandwidth capacity estimate */ 1875 if (q->config->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { 1876 u64 packet_interval = \ 1877 ktime_to_ns(ktime_sub(now, q->last_packet_time)); 1878 1879 if (packet_interval > NSEC_PER_SEC) 1880 packet_interval = NSEC_PER_SEC; 1881 1882 /* filter out short-term bursts, eg. wifi aggregation */ 1883 q->avg_packet_interval = \ 1884 cake_ewma(q->avg_packet_interval, 1885 packet_interval, 1886 (packet_interval > q->avg_packet_interval ? 1887 2 : 8)); 1888 1889 q->last_packet_time = now; 1890 1891 if (packet_interval > q->avg_packet_interval) { 1892 u64 window_interval = \ 1893 ktime_to_ns(ktime_sub(now, 1894 q->avg_window_begin)); 1895 u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; 1896 1897 b = div64_u64(b, window_interval); 1898 WRITE_ONCE(q->avg_peak_bandwidth, 1899 cake_ewma(q->avg_peak_bandwidth, b, 1900 b > q->avg_peak_bandwidth ? 
2 : 8)); 1901 q->avg_window_bytes = 0; 1902 q->avg_window_begin = now; 1903 1904 if (ktime_after(now, 1905 ktime_add_ms(q->last_reconfig_time, 1906 250))) { 1907 q->config->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; 1908 cake_reconfigure(sch); 1909 } 1910 } 1911 } else { 1912 q->avg_window_bytes = 0; 1913 q->last_packet_time = now; 1914 } 1915 1916 /* flowchain */ 1917 if (!flow->set || flow->set == CAKE_SET_DECAYING) { 1918 if (!flow->set) { 1919 list_add_tail(&flow->flowchain, &b->new_flows); 1920 } else { 1921 WRITE_ONCE(b->decaying_flow_count, b->decaying_flow_count - 1); 1922 list_move_tail(&flow->flowchain, &b->new_flows); 1923 } 1924 flow->set = CAKE_SET_SPARSE; 1925 WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count + 1); 1926 1927 flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode); 1928 } else if (flow->set == CAKE_SET_SPARSE_WAIT) { 1929 /* this flow was empty, accounted as a sparse flow, but actually 1930 * in the bulk rotation. 1931 */ 1932 flow->set = CAKE_SET_BULK; 1933 WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1); 1934 WRITE_ONCE(b->bulk_flow_count, b->bulk_flow_count + 1); 1935 1936 cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode); 1937 cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); 1938 } 1939 1940 if (q->buffer_used > q->buffer_max_used) 1941 WRITE_ONCE(q->buffer_max_used, q->buffer_used); 1942 1943 if (q->buffer_used <= q->buffer_limit) 1944 return NET_XMIT_SUCCESS; 1945 1946 prev_qlen = sch->q.qlen; 1947 prev_backlog = sch->qstats.backlog; 1948 1949 while (q->buffer_used > q->buffer_limit) { 1950 drop_id = cake_drop(sch, to_free); 1951 if ((drop_id >> 16) == tin && 1952 (drop_id & 0xFFFF) == idx) 1953 same_flow = true; 1954 } 1955 1956 prev_qlen -= sch->q.qlen; 1957 prev_backlog -= sch->qstats.backlog; 1958 b->drop_overlimit += prev_qlen; 1959 1960 if (same_flow) { 1961 qdisc_tree_reduce_backlog(sch, prev_qlen - 1, 1962 prev_backlog - len); 1963 return NET_XMIT_CN; 1964 } 1965 qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog); 1966 return NET_XMIT_SUCCESS; 1967 } 1968 1969 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) 1970 { 1971 struct cake_sched_data *q = qdisc_priv(sch); 1972 struct cake_tin_data *b = &q->tins[q->cur_tin]; 1973 struct cake_flow *flow = &b->flows[q->cur_flow]; 1974 struct sk_buff *skb = NULL; 1975 u32 len; 1976 1977 if (flow->head) { 1978 skb = dequeue_head(flow); 1979 len = qdisc_pkt_len(skb); 1980 b->backlogs[q->cur_flow] -= len; 1981 WRITE_ONCE(b->tin_backlog, b->tin_backlog - len); 1982 sch->qstats.backlog -= len; 1983 q->buffer_used -= skb->truesize; 1984 sch->q.qlen--; 1985 1986 if (q->overflow_timeout) 1987 cake_heapify(q, b->overflow_idx[q->cur_flow]); 1988 } 1989 return skb; 1990 } 1991 1992 /* Discard leftover packets from a tin no longer in use. 
*/ 1993 static void cake_clear_tin(struct Qdisc *sch, u16 tin) 1994 { 1995 struct cake_sched_data *q = qdisc_priv(sch); 1996 struct sk_buff *skb; 1997 1998 q->cur_tin = tin; 1999 for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) 2000 while (!!(skb = cake_dequeue_one(sch))) 2001 kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); 2002 } 2003 2004 static struct sk_buff *cake_dequeue(struct Qdisc *sch) 2005 { 2006 struct cake_sched_data *q = qdisc_priv(sch); 2007 struct cake_tin_data *b = &q->tins[q->cur_tin]; 2008 enum qdisc_drop_reason reason; 2009 ktime_t now = ktime_get(); 2010 struct cake_flow *flow; 2011 struct list_head *head; 2012 bool first_flow = true; 2013 struct sk_buff *skb; 2014 u64 delay; 2015 u32 len; 2016 2017 if (q->config->is_shared && q->rate_ns && 2018 now - q->last_checked_active >= q->config->sync_time) { 2019 struct net_device *dev = qdisc_dev(sch); 2020 struct cake_sched_data *other_priv; 2021 u64 new_rate = q->config->rate_bps; 2022 u64 other_qlen, other_last_active; 2023 struct Qdisc *other_sch; 2024 u32 num_active_qs = 1; 2025 unsigned int ntx; 2026 2027 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { 2028 other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); 2029 other_priv = qdisc_priv(other_sch); 2030 2031 if (other_priv == q) 2032 continue; 2033 2034 other_qlen = READ_ONCE(other_sch->q.qlen); 2035 other_last_active = READ_ONCE(other_priv->last_active); 2036 2037 if (other_qlen || other_last_active > q->last_checked_active) 2038 num_active_qs++; 2039 } 2040 2041 if (num_active_qs > 1) 2042 new_rate = div64_u64(q->config->rate_bps, num_active_qs); 2043 2044 cake_configure_rates(sch, new_rate, true); 2045 q->last_checked_active = now; 2046 WRITE_ONCE(q->active_queues, num_active_qs); 2047 } 2048 2049 begin: 2050 if (!sch->q.qlen) 2051 return NULL; 2052 2053 /* global hard shaper */ 2054 if (ktime_after(q->time_next_packet, now) && 2055 ktime_after(q->failsafe_next_packet, now)) { 2056 u64 next = min(ktime_to_ns(q->time_next_packet), 2057 ktime_to_ns(q->failsafe_next_packet)); 2058 2059 sch->qstats.overlimits++; 2060 qdisc_watchdog_schedule_ns(&q->watchdog, next); 2061 return NULL; 2062 } 2063 2064 /* Choose a class to work on. */ 2065 if (!q->rate_ns) { 2066 /* In unlimited mode, can't rely on shaper timings, just balance 2067 * with DRR 2068 */ 2069 bool wrapped = false, empty = true; 2070 2071 while (b->tin_deficit < 0 || 2072 !(b->sparse_flow_count + b->bulk_flow_count)) { 2073 if (b->tin_deficit <= 0) 2074 b->tin_deficit += b->tin_quantum; 2075 if (b->sparse_flow_count + b->bulk_flow_count) 2076 empty = false; 2077 2078 q->cur_tin++; 2079 b++; 2080 if (q->cur_tin >= q->tin_cnt) { 2081 q->cur_tin = 0; 2082 b = q->tins; 2083 2084 if (wrapped) { 2085 /* It's possible for q->qlen to be 2086 * nonzero when we actually have no 2087 * packets anywhere. 2088 */ 2089 if (empty) 2090 return NULL; 2091 } else { 2092 wrapped = true; 2093 } 2094 } 2095 } 2096 } else { 2097 /* In shaped mode, choose: 2098 * - Highest-priority tin with queue and meeting schedule, or 2099 * - The earliest-scheduled tin with queue. 
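		 *
		 * A tin whose time_to_pkt is non-positive has already met its
		 * schedule; such tins always overwrite best_tin, so the last
		 * ready tin scanned is selected. If no tin is ready, the tin
		 * with the earliest schedule wins.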
2100 */ 2101 ktime_t best_time = KTIME_MAX; 2102 int tin, best_tin = 0; 2103 2104 for (tin = 0; tin < q->tin_cnt; tin++) { 2105 b = q->tins + tin; 2106 if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { 2107 ktime_t time_to_pkt = \ 2108 ktime_sub(b->time_next_packet, now); 2109 2110 if (ktime_to_ns(time_to_pkt) <= 0 || 2111 ktime_compare(time_to_pkt, 2112 best_time) <= 0) { 2113 best_time = time_to_pkt; 2114 best_tin = tin; 2115 } 2116 } 2117 } 2118 2119 q->cur_tin = best_tin; 2120 b = q->tins + best_tin; 2121 2122 /* No point in going further if no packets to deliver. */ 2123 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) 2124 return NULL; 2125 } 2126 2127 retry: 2128 /* service this class */ 2129 head = &b->decaying_flows; 2130 if (!first_flow || list_empty(head)) { 2131 head = &b->new_flows; 2132 if (list_empty(head)) { 2133 head = &b->old_flows; 2134 if (unlikely(list_empty(head))) { 2135 head = &b->decaying_flows; 2136 if (unlikely(list_empty(head))) 2137 goto begin; 2138 } 2139 } 2140 } 2141 flow = list_first_entry(head, struct cake_flow, flowchain); 2142 q->cur_flow = flow - b->flows; 2143 first_flow = false; 2144 2145 /* flow isolation (DRR++) */ 2146 if (flow->deficit <= 0) { 2147 /* Keep all flows with deficits out of the sparse and decaying 2148 * rotations. No non-empty flow can go into the decaying 2149 * rotation, so they can't get deficits 2150 */ 2151 if (flow->set == CAKE_SET_SPARSE) { 2152 if (flow->head) { 2153 WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1); 2154 WRITE_ONCE(b->bulk_flow_count, b->bulk_flow_count + 1); 2155 2156 cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode); 2157 cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); 2158 2159 flow->set = CAKE_SET_BULK; 2160 } else { 2161 /* we've moved it to the bulk rotation for 2162 * correct deficit accounting but we still want 2163 * to count it as a sparse flow, not a bulk one. 
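				 * cake_enqueue() promotes a SPARSE_WAIT flow
				 * to CAKE_SET_BULK proper as soon as the next
				 * packet arrives for it.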
2164 */ 2165 flow->set = CAKE_SET_SPARSE_WAIT; 2166 } 2167 } 2168 2169 flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode); 2170 list_move_tail(&flow->flowchain, &b->old_flows); 2171 2172 goto retry; 2173 } 2174 2175 /* Retrieve a packet via the AQM */ 2176 while (1) { 2177 skb = cake_dequeue_one(sch); 2178 if (!skb) { 2179 /* this queue was actually empty */ 2180 if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) 2181 WRITE_ONCE(b->unresponsive_flow_count, 2182 b->unresponsive_flow_count - 1); 2183 2184 if (flow->cvars.p_drop || flow->cvars.count || 2185 ktime_before(now, flow->cvars.drop_next)) { 2186 /* keep in the flowchain until the state has 2187 * decayed to rest 2188 */ 2189 list_move_tail(&flow->flowchain, 2190 &b->decaying_flows); 2191 if (flow->set == CAKE_SET_BULK) { 2192 WRITE_ONCE(b->bulk_flow_count, b->bulk_flow_count - 1); 2193 2194 cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode); 2195 cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); 2196 2197 WRITE_ONCE(b->decaying_flow_count, b->decaying_flow_count + 1); 2198 } else if (flow->set == CAKE_SET_SPARSE || 2199 flow->set == CAKE_SET_SPARSE_WAIT) { 2200 WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1); 2201 WRITE_ONCE(b->decaying_flow_count, b->decaying_flow_count + 1); 2202 } 2203 flow->set = CAKE_SET_DECAYING; 2204 } else { 2205 /* remove empty queue from the flowchain */ 2206 list_del_init(&flow->flowchain); 2207 if (flow->set == CAKE_SET_SPARSE || 2208 flow->set == CAKE_SET_SPARSE_WAIT) { 2209 WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1); 2210 } else if (flow->set == CAKE_SET_BULK) { 2211 WRITE_ONCE(b->bulk_flow_count, b->bulk_flow_count - 1); 2212 2213 cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode); 2214 cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); 2215 } else { 2216 WRITE_ONCE(b->decaying_flow_count, b->decaying_flow_count - 1); 2217 } 2218 flow->set = CAKE_SET_NONE; 2219 } 2220 goto begin; 2221 } 2222 2223 reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, 2224 (b->bulk_flow_count * 2225 !!(q->config->rate_flags & 2226 CAKE_FLAG_INGRESS))); 2227 /* Last packet in queue may be marked, shouldn't be dropped */ 2228 if (reason == QDISC_DROP_UNSPEC || !flow->head) 2229 break; 2230 2231 /* drop this packet, get another one */ 2232 if (q->config->rate_flags & CAKE_FLAG_INGRESS) { 2233 len = cake_advance_shaper(q, b, skb, 2234 now, true); 2235 flow->deficit -= len; 2236 b->tin_deficit -= len; 2237 } 2238 flow->dropped++; 2239 WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1); 2240 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); 2241 qdisc_qstats_drop(sch); 2242 qdisc_dequeue_drop(sch, skb, reason); 2243 if (q->config->rate_flags & CAKE_FLAG_INGRESS) 2244 goto retry; 2245 } 2246 2247 WRITE_ONCE(b->tin_ecn_mark, b->tin_ecn_mark + !!flow->cvars.ecn_marked); 2248 qdisc_bstats_update(sch, skb); 2249 WRITE_ONCE(q->last_active, now); 2250 2251 /* collect delay stats */ 2252 delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 2253 WRITE_ONCE(b->avge_delay, cake_ewma(b->avge_delay, delay, 8)); 2254 WRITE_ONCE(b->peak_delay, 2255 cake_ewma(b->peak_delay, delay, 2256 delay > b->peak_delay ? 2 : 8)); 2257 WRITE_ONCE(b->base_delay, 2258 cake_ewma(b->base_delay, delay, 2259 delay < b->base_delay ? 
2 : 8)); 2260 2261 len = cake_advance_shaper(q, b, skb, now, false); 2262 flow->deficit -= len; 2263 b->tin_deficit -= len; 2264 2265 if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { 2266 u64 next = min(ktime_to_ns(q->time_next_packet), 2267 ktime_to_ns(q->failsafe_next_packet)); 2268 2269 qdisc_watchdog_schedule_ns(&q->watchdog, next); 2270 } else if (!sch->q.qlen) { 2271 int i; 2272 2273 for (i = 0; i < q->tin_cnt; i++) { 2274 if (q->tins[i].decaying_flow_count) { 2275 ktime_t next = \ 2276 ktime_add_ns(now, 2277 q->tins[i].cparams.target); 2278 2279 qdisc_watchdog_schedule_ns(&q->watchdog, 2280 ktime_to_ns(next)); 2281 break; 2282 } 2283 } 2284 } 2285 2286 if (q->overflow_timeout) 2287 q->overflow_timeout--; 2288 2289 return skb; 2290 } 2291 2292 static void cake_reset(struct Qdisc *sch) 2293 { 2294 struct cake_sched_data *q = qdisc_priv(sch); 2295 u32 c; 2296 2297 if (!q->tins) 2298 return; 2299 2300 for (c = 0; c < CAKE_MAX_TINS; c++) 2301 cake_clear_tin(sch, c); 2302 } 2303 2304 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { 2305 [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, 2306 [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, 2307 [TCA_CAKE_ATM] = { .type = NLA_U32 }, 2308 [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, 2309 [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, 2310 [TCA_CAKE_RTT] = { .type = NLA_U32 }, 2311 [TCA_CAKE_TARGET] = { .type = NLA_U32 }, 2312 [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, 2313 [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, 2314 [TCA_CAKE_NAT] = { .type = NLA_U32 }, 2315 [TCA_CAKE_RAW] = { .type = NLA_U32 }, 2316 [TCA_CAKE_WASH] = { .type = NLA_U32 }, 2317 [TCA_CAKE_MPU] = { .type = NLA_U32 }, 2318 [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, 2319 [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, 2320 [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, 2321 [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, 2322 }; 2323 2324 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, 2325 u64 target_ns, u64 rtt_est_ns) 2326 { 2327 /* convert byte-rate into time-per-byte 2328 * so it will always unwedge in reasonable time. 2329 */ 2330 static const u64 MIN_RATE = 64; 2331 u32 byte_target = mtu; 2332 u64 byte_target_ns; 2333 u8 rate_shft = 0; 2334 u64 rate_ns = 0; 2335 2336 if (rate) { 2337 WRITE_ONCE(b->flow_quantum, 2338 max(min(rate >> 12, 1514ULL), 300ULL)); 2339 rate_shft = 34; 2340 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; 2341 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); 2342 while (!!(rate_ns >> 34)) { 2343 rate_ns >>= 1; 2344 rate_shft--; 2345 } 2346 } else { 2347 /* else unlimited, ie. 
zero delay */ 2348 WRITE_ONCE(b->flow_quantum, 1514); 2349 } 2350 WRITE_ONCE(b->tin_rate_bps, rate); 2351 b->tin_rate_ns = rate_ns; 2352 b->tin_rate_shft = rate_shft; 2353 2354 if (mtu == 0) 2355 return; 2356 2357 byte_target_ns = (byte_target * rate_ns) >> rate_shft; 2358 2359 WRITE_ONCE(b->cparams.target, 2360 max((byte_target_ns * 3) / 2, target_ns)); 2361 WRITE_ONCE(b->cparams.interval, 2362 max(rtt_est_ns + b->cparams.target - target_ns, 2363 b->cparams.target * 2)); 2364 b->cparams.mtu_time = byte_target_ns; 2365 b->cparams.p_inc = 1 << 24; /* 1/256 */ 2366 b->cparams.p_dec = 1 << 20; /* 1/4096 */ 2367 } 2368 2369 static int cake_config_besteffort(struct Qdisc *sch, u64 rate, u32 mtu) 2370 { 2371 struct cake_sched_data *q = qdisc_priv(sch); 2372 struct cake_tin_data *b = &q->tins[0]; 2373 2374 q->tin_cnt = 1; 2375 2376 q->tin_index = besteffort; 2377 q->tin_order = normal_order; 2378 2379 cake_set_rate(b, rate, mtu, 2380 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2381 b->tin_quantum = 65535; 2382 2383 return 0; 2384 } 2385 2386 static int cake_config_precedence(struct Qdisc *sch, u64 rate, u32 mtu) 2387 { 2388 /* convert high-level (user visible) parameters into internal format */ 2389 struct cake_sched_data *q = qdisc_priv(sch); 2390 u32 quantum = 256; 2391 u32 i; 2392 2393 q->tin_cnt = 8; 2394 q->tin_index = precedence; 2395 q->tin_order = normal_order; 2396 2397 for (i = 0; i < q->tin_cnt; i++) { 2398 struct cake_tin_data *b = &q->tins[i]; 2399 2400 cake_set_rate(b, rate, mtu, us_to_ns(q->config->target), 2401 us_to_ns(q->config->interval)); 2402 2403 b->tin_quantum = max_t(u16, 1U, quantum); 2404 2405 /* calculate next class's parameters */ 2406 rate *= 7; 2407 rate >>= 3; 2408 2409 quantum *= 7; 2410 quantum >>= 3; 2411 } 2412 2413 return 0; 2414 } 2415 2416 /* List of known Diffserv codepoints: 2417 * 2418 * Default Forwarding (DF/CS0) - Best Effort 2419 * Max Throughput (TOS2) 2420 * Min Delay (TOS4) 2421 * LLT "La" (TOS5) 2422 * Assured Forwarding 1 (AF1x) - x3 2423 * Assured Forwarding 2 (AF2x) - x3 2424 * Assured Forwarding 3 (AF3x) - x3 2425 * Assured Forwarding 4 (AF4x) - x3 2426 * Precedence Class 1 (CS1) 2427 * Precedence Class 2 (CS2) 2428 * Precedence Class 3 (CS3) 2429 * Precedence Class 4 (CS4) 2430 * Precedence Class 5 (CS5) 2431 * Precedence Class 6 (CS6) 2432 * Precedence Class 7 (CS7) 2433 * Voice Admit (VA) 2434 * Expedited Forwarding (EF) 2435 * Lower Effort (LE) 2436 * 2437 * Total 26 codepoints. 2438 */ 2439 2440 /* List of traffic classes in RFC 4594, updated by RFC 8622: 2441 * (roughly descending order of contended priority) 2442 * (roughly ascending order of uncontended throughput) 2443 * 2444 * Network Control (CS6,CS7) - routing traffic 2445 * Telephony (EF,VA) - aka. VoIP streams 2446 * Signalling (CS5) - VoIP setup 2447 * Multimedia Conferencing (AF4x) - aka. video calls 2448 * Realtime Interactive (CS4) - eg. games 2449 * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch 2450 * Broadcast Video (CS3) 2451 * Low-Latency Data (AF2x,TOS4) - eg. database 2452 * Ops, Admin, Management (CS2) - eg. ssh 2453 * Standard Service (DF & unrecognised codepoints) 2454 * High-Throughput Data (AF1x,TOS2) - eg. web traffic 2455 * Low-Priority Data (LE,CS1) - eg. BitTorrent 2456 * 2457 * Total 12 traffic classes. 
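 *
 * The cake_config_diffserv8/4/3() functions below collapse these classes
 * into progressively coarser sets of 8, 4 and 3 tins. For example, the
 * three-tin layout shapes its tins at the full rate, rate >> 4 and
 * rate >> 2, with DRR weights of 1024, 64 and 256 respectively.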
2458 */ 2459 2460 static int cake_config_diffserv8(struct Qdisc *sch, u64 rate, u32 mtu) 2461 { 2462 /* Pruned list of traffic classes for typical applications: 2463 * 2464 * Network Control (CS6, CS7) 2465 * Minimum Latency (EF, VA, CS5, CS4) 2466 * Interactive Shell (CS2) 2467 * Low Latency Transactions (AF2x, TOS4) 2468 * Video Streaming (AF4x, AF3x, CS3) 2469 * Bog Standard (DF etc.) 2470 * High Throughput (AF1x, TOS2, CS1) 2471 * Background Traffic (LE) 2472 * 2473 * Total 8 traffic classes. 2474 */ 2475 2476 struct cake_sched_data *q = qdisc_priv(sch); 2477 u32 quantum = 256; 2478 u32 i; 2479 2480 q->tin_cnt = 8; 2481 2482 /* codepoint to class mapping */ 2483 q->tin_index = diffserv8; 2484 q->tin_order = normal_order; 2485 2486 /* class characteristics */ 2487 for (i = 0; i < q->tin_cnt; i++) { 2488 struct cake_tin_data *b = &q->tins[i]; 2489 2490 cake_set_rate(b, rate, mtu, us_to_ns(q->config->target), 2491 us_to_ns(q->config->interval)); 2492 2493 b->tin_quantum = max_t(u16, 1U, quantum); 2494 2495 /* calculate next class's parameters */ 2496 rate *= 7; 2497 rate >>= 3; 2498 2499 quantum *= 7; 2500 quantum >>= 3; 2501 } 2502 2503 return 0; 2504 } 2505 2506 static int cake_config_diffserv4(struct Qdisc *sch, u64 rate, u32 mtu) 2507 { 2508 /* Further pruned list of traffic classes for four-class system: 2509 * 2510 * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) 2511 * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2) 2512 * Best Effort (DF, AF1x, TOS2, and those not specified) 2513 * Background Traffic (LE, CS1) 2514 * 2515 * Total 4 traffic classes. 2516 */ 2517 2518 struct cake_sched_data *q = qdisc_priv(sch); 2519 u32 quantum = 1024; 2520 2521 q->tin_cnt = 4; 2522 2523 /* codepoint to class mapping */ 2524 q->tin_index = diffserv4; 2525 q->tin_order = bulk_order; 2526 2527 /* class characteristics */ 2528 cake_set_rate(&q->tins[0], rate, mtu, 2529 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2530 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2531 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2532 cake_set_rate(&q->tins[2], rate >> 1, mtu, 2533 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2534 cake_set_rate(&q->tins[3], rate >> 2, mtu, 2535 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2536 2537 /* bandwidth-sharing weights */ 2538 q->tins[0].tin_quantum = quantum; 2539 q->tins[1].tin_quantum = quantum >> 4; 2540 q->tins[2].tin_quantum = quantum >> 1; 2541 q->tins[3].tin_quantum = quantum >> 2; 2542 2543 return 0; 2544 } 2545 2546 static int cake_config_diffserv3(struct Qdisc *sch, u64 rate, u32 mtu) 2547 { 2548 /* Simplified Diffserv structure with 3 tins. 
2549 * Latency Sensitive (CS7, CS6, EF, VA, TOS4) 2550 * Best Effort 2551 * Low Priority (LE, CS1) 2552 */ 2553 struct cake_sched_data *q = qdisc_priv(sch); 2554 u32 quantum = 1024; 2555 2556 q->tin_cnt = 3; 2557 2558 /* codepoint to class mapping */ 2559 q->tin_index = diffserv3; 2560 q->tin_order = bulk_order; 2561 2562 /* class characteristics */ 2563 cake_set_rate(&q->tins[0], rate, mtu, 2564 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2565 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2566 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2567 cake_set_rate(&q->tins[2], rate >> 2, mtu, 2568 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2569 2570 /* bandwidth-sharing weights */ 2571 q->tins[0].tin_quantum = quantum; 2572 q->tins[1].tin_quantum = quantum >> 4; 2573 q->tins[2].tin_quantum = quantum >> 2; 2574 2575 return 0; 2576 } 2577 2578 static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust) 2579 { 2580 u32 mtu = likely(rate_adjust) ? 0 : psched_mtu(qdisc_dev(sch)); 2581 struct cake_sched_data *qd = qdisc_priv(sch); 2582 struct cake_sched_config *q = qd->config; 2583 int c, ft; 2584 2585 switch (q->tin_mode) { 2586 case CAKE_DIFFSERV_BESTEFFORT: 2587 ft = cake_config_besteffort(sch, rate, mtu); 2588 break; 2589 2590 case CAKE_DIFFSERV_PRECEDENCE: 2591 ft = cake_config_precedence(sch, rate, mtu); 2592 break; 2593 2594 case CAKE_DIFFSERV_DIFFSERV8: 2595 ft = cake_config_diffserv8(sch, rate, mtu); 2596 break; 2597 2598 case CAKE_DIFFSERV_DIFFSERV4: 2599 ft = cake_config_diffserv4(sch, rate, mtu); 2600 break; 2601 2602 case CAKE_DIFFSERV_DIFFSERV3: 2603 default: 2604 ft = cake_config_diffserv3(sch, rate, mtu); 2605 break; 2606 } 2607 2608 for (c = qd->tin_cnt; c < CAKE_MAX_TINS; c++) { 2609 cake_clear_tin(sch, c); 2610 qd->tins[c].cparams.mtu_time = qd->tins[ft].cparams.mtu_time; 2611 } 2612 2613 qd->rate_ns = qd->tins[ft].tin_rate_ns; 2614 qd->rate_shft = qd->tins[ft].tin_rate_shft; 2615 } 2616 2617 static void cake_reconfigure(struct Qdisc *sch) 2618 { 2619 struct cake_sched_data *qd = qdisc_priv(sch); 2620 struct cake_sched_config *q = qd->config; 2621 u32 buffer_limit; 2622 2623 cake_configure_rates(sch, qd->config->rate_bps, false); 2624 2625 if (q->buffer_config_limit) { 2626 buffer_limit = q->buffer_config_limit; 2627 } else if (q->rate_bps) { 2628 u64 t = q->rate_bps * q->interval; 2629 2630 do_div(t, USEC_PER_SEC / 4); 2631 buffer_limit = max_t(u32, t, 4U << 20); 2632 } else { 2633 buffer_limit = ~0; 2634 } 2635 2636 sch->flags &= ~TCQ_F_CAN_BYPASS; 2637 2638 WRITE_ONCE(qd->buffer_limit, 2639 min(buffer_limit, 2640 max(sch->limit * psched_mtu(qdisc_dev(sch)), 2641 q->buffer_config_limit))); 2642 } 2643 2644 static int cake_config_change(struct cake_sched_config *q, struct nlattr *opt, 2645 struct netlink_ext_ack *extack, bool *overhead_changed) 2646 { 2647 struct nlattr *tb[TCA_CAKE_MAX + 1]; 2648 u16 rate_flags = q->rate_flags; 2649 u8 flow_mode = q->flow_mode; 2650 int err; 2651 2652 err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, 2653 extack); 2654 if (err < 0) 2655 return err; 2656 2657 if (tb[TCA_CAKE_NAT]) { 2658 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 2659 flow_mode &= ~CAKE_FLOW_NAT_FLAG; 2660 flow_mode |= CAKE_FLOW_NAT_FLAG * 2661 !!nla_get_u32(tb[TCA_CAKE_NAT]); 2662 #else 2663 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], 2664 "No conntrack support in kernel"); 2665 return -EOPNOTSUPP; 2666 #endif 2667 } 2668 2669 if (tb[TCA_CAKE_AUTORATE]) { 2670 if 
(!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) { 2671 if (q->is_shared) { 2672 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_AUTORATE], 2673 "Can't use autorate-ingress with cake_mq"); 2674 return -EOPNOTSUPP; 2675 } 2676 rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; 2677 } else { 2678 rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; 2679 } 2680 } 2681 2682 if (tb[TCA_CAKE_BASE_RATE64]) 2683 WRITE_ONCE(q->rate_bps, 2684 nla_get_u64(tb[TCA_CAKE_BASE_RATE64])); 2685 2686 if (tb[TCA_CAKE_DIFFSERV_MODE]) 2687 WRITE_ONCE(q->tin_mode, 2688 nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE])); 2689 2690 if (tb[TCA_CAKE_WASH]) { 2691 if (!!nla_get_u32(tb[TCA_CAKE_WASH])) 2692 rate_flags |= CAKE_FLAG_WASH; 2693 else 2694 rate_flags &= ~CAKE_FLAG_WASH; 2695 } 2696 2697 if (tb[TCA_CAKE_FLOW_MODE]) 2698 flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) | 2699 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & 2700 CAKE_FLOW_MASK)); 2701 2702 if (tb[TCA_CAKE_ATM]) 2703 WRITE_ONCE(q->atm_mode, 2704 nla_get_u32(tb[TCA_CAKE_ATM])); 2705 2706 if (tb[TCA_CAKE_OVERHEAD]) { 2707 WRITE_ONCE(q->rate_overhead, 2708 nla_get_s32(tb[TCA_CAKE_OVERHEAD])); 2709 rate_flags |= CAKE_FLAG_OVERHEAD; 2710 *overhead_changed = true; 2711 } 2712 2713 if (tb[TCA_CAKE_RAW]) { 2714 rate_flags &= ~CAKE_FLAG_OVERHEAD; 2715 *overhead_changed = true; 2716 } 2717 2718 if (tb[TCA_CAKE_MPU]) 2719 WRITE_ONCE(q->rate_mpu, 2720 nla_get_u32(tb[TCA_CAKE_MPU])); 2721 2722 if (tb[TCA_CAKE_RTT]) { 2723 u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]); 2724 2725 WRITE_ONCE(q->interval, max(interval, 1U)); 2726 } 2727 2728 if (tb[TCA_CAKE_TARGET]) { 2729 u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]); 2730 2731 WRITE_ONCE(q->target, max(target, 1U)); 2732 } 2733 2734 if (tb[TCA_CAKE_INGRESS]) { 2735 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) 2736 rate_flags |= CAKE_FLAG_INGRESS; 2737 else 2738 rate_flags &= ~CAKE_FLAG_INGRESS; 2739 } 2740 2741 if (tb[TCA_CAKE_ACK_FILTER]) 2742 WRITE_ONCE(q->ack_filter, 2743 nla_get_u32(tb[TCA_CAKE_ACK_FILTER])); 2744 2745 if (tb[TCA_CAKE_MEMORY]) 2746 WRITE_ONCE(q->buffer_config_limit, 2747 nla_get_u32(tb[TCA_CAKE_MEMORY])); 2748 2749 if (tb[TCA_CAKE_SPLIT_GSO]) { 2750 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) 2751 rate_flags |= CAKE_FLAG_SPLIT_GSO; 2752 else 2753 rate_flags &= ~CAKE_FLAG_SPLIT_GSO; 2754 } 2755 2756 if (tb[TCA_CAKE_FWMARK]) { 2757 WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK])); 2758 WRITE_ONCE(q->fwmark_shft, 2759 q->fwmark_mask ? 
__ffs(q->fwmark_mask) : 0); 2760 } 2761 2762 WRITE_ONCE(q->rate_flags, rate_flags); 2763 WRITE_ONCE(q->flow_mode, flow_mode); 2764 2765 return 0; 2766 } 2767 2768 static int cake_change(struct Qdisc *sch, struct nlattr *opt, 2769 struct netlink_ext_ack *extack) 2770 { 2771 struct cake_sched_data *qd = qdisc_priv(sch); 2772 struct cake_sched_config *q = qd->config; 2773 bool overhead_changed = false; 2774 int ret; 2775 2776 if (q->is_shared) { 2777 NL_SET_ERR_MSG(extack, "can't reconfigure cake_mq sub-qdiscs"); 2778 return -EOPNOTSUPP; 2779 } 2780 2781 ret = cake_config_change(q, opt, extack, &overhead_changed); 2782 if (ret) 2783 return ret; 2784 2785 if (overhead_changed) { 2786 WRITE_ONCE(qd->max_netlen, 0); 2787 WRITE_ONCE(qd->max_adjlen, 0); 2788 WRITE_ONCE(qd->min_netlen, ~0); 2789 WRITE_ONCE(qd->min_adjlen, ~0); 2790 } 2791 2792 if (qd->tins) { 2793 sch_tree_lock(sch); 2794 cake_reconfigure(sch); 2795 sch_tree_unlock(sch); 2796 } 2797 2798 return 0; 2799 } 2800 2801 static void cake_destroy(struct Qdisc *sch) 2802 { 2803 struct cake_sched_data *q = qdisc_priv(sch); 2804 2805 qdisc_watchdog_cancel(&q->watchdog); 2806 tcf_block_put(q->block); 2807 kvfree(q->tins); 2808 } 2809 2810 static void cake_config_init(struct cake_sched_config *q, bool is_shared) 2811 { 2812 q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; 2813 q->flow_mode = CAKE_FLOW_TRIPLE; 2814 2815 q->rate_bps = 0; /* unlimited by default */ 2816 2817 q->interval = 100000; /* 100ms default */ 2818 q->target = 5000; /* 5ms: codel RFC argues 2819 * for 5 to 10% of interval 2820 */ 2821 q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2822 q->is_shared = is_shared; 2823 q->sync_time = 200 * NSEC_PER_USEC; 2824 } 2825 2826 static int cake_init(struct Qdisc *sch, struct nlattr *opt, 2827 struct netlink_ext_ack *extack) 2828 { 2829 struct cake_sched_data *qd = qdisc_priv(sch); 2830 struct cake_sched_config *q = &qd->initial_config; 2831 int i, j, err; 2832 2833 cake_config_init(q, false); 2834 2835 sch->limit = 10240; 2836 sch->flags |= TCQ_F_DEQUEUE_DROPS; 2837 2838 qd->cur_tin = 0; 2839 qd->cur_flow = 0; 2840 qd->config = q; 2841 2842 qdisc_watchdog_init(&qd->watchdog, sch); 2843 2844 if (opt) { 2845 err = cake_change(sch, opt, extack); 2846 if (err) 2847 return err; 2848 } 2849 2850 err = tcf_block_get(&qd->block, &qd->filter_list, sch, extack); 2851 if (err) 2852 return err; 2853 2854 quantum_div[0] = ~0; 2855 for (i = 1; i <= CAKE_QUEUES; i++) 2856 quantum_div[i] = 65535 / i; 2857 2858 qd->tins = kvzalloc_objs(struct cake_tin_data, CAKE_MAX_TINS); 2859 if (!qd->tins) 2860 return -ENOMEM; 2861 2862 for (i = 0; i < CAKE_MAX_TINS; i++) { 2863 struct cake_tin_data *b = qd->tins + i; 2864 2865 INIT_LIST_HEAD(&b->new_flows); 2866 INIT_LIST_HEAD(&b->old_flows); 2867 INIT_LIST_HEAD(&b->decaying_flows); 2868 b->sparse_flow_count = 0; 2869 b->bulk_flow_count = 0; 2870 b->decaying_flow_count = 0; 2871 2872 for (j = 0; j < CAKE_QUEUES; j++) { 2873 struct cake_flow *flow = b->flows + j; 2874 u32 k = j * CAKE_MAX_TINS + i; 2875 2876 INIT_LIST_HEAD(&flow->flowchain); 2877 cobalt_vars_init(&flow->cvars); 2878 2879 qd->overflow_heap[k].t = i; 2880 qd->overflow_heap[k].b = j; 2881 b->overflow_idx[j] = k; 2882 } 2883 } 2884 2885 cake_reconfigure(sch); 2886 qd->avg_peak_bandwidth = q->rate_bps; 2887 qd->min_netlen = ~0; 2888 qd->min_adjlen = ~0; 2889 qd->active_queues = 0; 2890 qd->last_checked_active = 0; 2891 2892 return 0; 2893 } 2894 2895 static void cake_config_replace(struct Qdisc *sch, struct cake_sched_config *cfg) 2896 { 2897 struct cake_sched_data *qd = 
qdisc_priv(sch); 2898 2899 qd->config = cfg; 2900 cake_reconfigure(sch); 2901 } 2902 2903 static int cake_config_dump(struct cake_sched_config *q, struct sk_buff *skb) 2904 { 2905 struct nlattr *opts; 2906 u16 rate_flags; 2907 u8 flow_mode; 2908 2909 opts = nla_nest_start_noflag(skb, TCA_OPTIONS); 2910 if (!opts) 2911 goto nla_put_failure; 2912 2913 if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, 2914 READ_ONCE(q->rate_bps), TCA_CAKE_PAD)) 2915 goto nla_put_failure; 2916 2917 flow_mode = READ_ONCE(q->flow_mode); 2918 if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK)) 2919 goto nla_put_failure; 2920 2921 if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval))) 2922 goto nla_put_failure; 2923 2924 if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target))) 2925 goto nla_put_failure; 2926 2927 if (nla_put_u32(skb, TCA_CAKE_MEMORY, 2928 READ_ONCE(q->buffer_config_limit))) 2929 goto nla_put_failure; 2930 2931 rate_flags = READ_ONCE(q->rate_flags); 2932 if (nla_put_u32(skb, TCA_CAKE_AUTORATE, 2933 !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) 2934 goto nla_put_failure; 2935 2936 if (nla_put_u32(skb, TCA_CAKE_INGRESS, 2937 !!(rate_flags & CAKE_FLAG_INGRESS))) 2938 goto nla_put_failure; 2939 2940 if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter))) 2941 goto nla_put_failure; 2942 2943 if (nla_put_u32(skb, TCA_CAKE_NAT, 2944 !!(flow_mode & CAKE_FLOW_NAT_FLAG))) 2945 goto nla_put_failure; 2946 2947 if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode))) 2948 goto nla_put_failure; 2949 2950 if (nla_put_u32(skb, TCA_CAKE_WASH, 2951 !!(rate_flags & CAKE_FLAG_WASH))) 2952 goto nla_put_failure; 2953 2954 if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead))) 2955 goto nla_put_failure; 2956 2957 if (!(rate_flags & CAKE_FLAG_OVERHEAD)) 2958 if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) 2959 goto nla_put_failure; 2960 2961 if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode))) 2962 goto nla_put_failure; 2963 2964 if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu))) 2965 goto nla_put_failure; 2966 2967 if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, 2968 !!(rate_flags & CAKE_FLAG_SPLIT_GSO))) 2969 goto nla_put_failure; 2970 2971 if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask))) 2972 goto nla_put_failure; 2973 2974 return nla_nest_end(skb, opts); 2975 2976 nla_put_failure: 2977 return -1; 2978 } 2979 2980 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) 2981 { 2982 struct cake_sched_data *qd = qdisc_priv(sch); 2983 2984 return cake_config_dump(qd->config, skb); 2985 } 2986 2987 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 2988 { 2989 struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 2990 struct cake_sched_data *q = qdisc_priv(sch); 2991 struct nlattr *tstats, *ts; 2992 int i; 2993 2994 if (!stats) 2995 return -1; 2996 2997 #define PUT_STAT_U32(attr, data) do { \ 2998 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 2999 goto nla_put_failure; \ 3000 } while (0) 3001 #define PUT_STAT_U64(attr, data) do { \ 3002 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ 3003 data, TCA_CAKE_STATS_PAD)) \ 3004 goto nla_put_failure; \ 3005 } while (0) 3006 3007 PUT_STAT_U64(CAPACITY_ESTIMATE64, READ_ONCE(q->avg_peak_bandwidth)); 3008 PUT_STAT_U32(MEMORY_LIMIT, READ_ONCE(q->buffer_limit)); 3009 PUT_STAT_U32(MEMORY_USED, READ_ONCE(q->buffer_max_used)); 3010 PUT_STAT_U32(AVG_NETOFF, ((READ_ONCE(q->avg_netoff) + 0x8000) >> 16)); 3011 PUT_STAT_U32(MAX_NETLEN, 
READ_ONCE(q->max_netlen)); 3012 PUT_STAT_U32(MAX_ADJLEN, READ_ONCE(q->max_adjlen)); 3013 PUT_STAT_U32(MIN_NETLEN, READ_ONCE(q->min_netlen)); 3014 PUT_STAT_U32(MIN_ADJLEN, READ_ONCE(q->min_adjlen)); 3015 PUT_STAT_U32(ACTIVE_QUEUES, READ_ONCE(q->active_queues)); 3016 3017 #undef PUT_STAT_U32 3018 #undef PUT_STAT_U64 3019 3020 tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS); 3021 if (!tstats) 3022 goto nla_put_failure; 3023 3024 #define PUT_TSTAT_U32(attr, data) do { \ 3025 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ 3026 goto nla_put_failure; \ 3027 } while (0) 3028 #define PUT_TSTAT_U64(attr, data) do { \ 3029 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ 3030 data, TCA_CAKE_TIN_STATS_PAD)) \ 3031 goto nla_put_failure; \ 3032 } while (0) 3033 3034 for (i = 0; i < q->tin_cnt; i++) { 3035 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3036 3037 ts = nla_nest_start_noflag(d->skb, i + 1); 3038 if (!ts) 3039 goto nla_put_failure; 3040 3041 PUT_TSTAT_U64(THRESHOLD_RATE64, READ_ONCE(b->tin_rate_bps)); 3042 PUT_TSTAT_U64(SENT_BYTES64, READ_ONCE(b->bytes)); 3043 PUT_TSTAT_U32(BACKLOG_BYTES, READ_ONCE(b->tin_backlog)); 3044 3045 PUT_TSTAT_U32(TARGET_US, 3046 ktime_to_us(ns_to_ktime(READ_ONCE(b->cparams.target)))); 3047 PUT_TSTAT_U32(INTERVAL_US, 3048 ktime_to_us(ns_to_ktime(READ_ONCE(b->cparams.interval)))); 3049 3050 PUT_TSTAT_U32(SENT_PACKETS, READ_ONCE(b->packets)); 3051 PUT_TSTAT_U32(DROPPED_PACKETS, READ_ONCE(b->tin_dropped)); 3052 PUT_TSTAT_U32(ECN_MARKED_PACKETS, READ_ONCE(b->tin_ecn_mark)); 3053 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, READ_ONCE(b->ack_drops)); 3054 3055 PUT_TSTAT_U32(PEAK_DELAY_US, 3056 ktime_to_us(ns_to_ktime(READ_ONCE(b->peak_delay)))); 3057 PUT_TSTAT_U32(AVG_DELAY_US, 3058 ktime_to_us(ns_to_ktime(READ_ONCE(b->avge_delay)))); 3059 PUT_TSTAT_U32(BASE_DELAY_US, 3060 ktime_to_us(ns_to_ktime(READ_ONCE(b->base_delay)))); 3061 3062 PUT_TSTAT_U32(WAY_INDIRECT_HITS, READ_ONCE(b->way_hits)); 3063 PUT_TSTAT_U32(WAY_MISSES, READ_ONCE(b->way_misses)); 3064 PUT_TSTAT_U32(WAY_COLLISIONS, READ_ONCE(b->way_collisions)); 3065 3066 PUT_TSTAT_U32(SPARSE_FLOWS, READ_ONCE(b->sparse_flow_count) + 3067 READ_ONCE(b->decaying_flow_count)); 3068 PUT_TSTAT_U32(BULK_FLOWS, READ_ONCE(b->bulk_flow_count)); 3069 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, READ_ONCE(b->unresponsive_flow_count)); 3070 PUT_TSTAT_U32(MAX_SKBLEN, READ_ONCE(b->max_skblen)); 3071 3072 PUT_TSTAT_U32(FLOW_QUANTUM, READ_ONCE(b->flow_quantum)); 3073 nla_nest_end(d->skb, ts); 3074 } 3075 3076 #undef PUT_TSTAT_U32 3077 #undef PUT_TSTAT_U64 3078 3079 nla_nest_end(d->skb, tstats); 3080 return nla_nest_end(d->skb, stats); 3081 3082 nla_put_failure: 3083 nla_nest_cancel(d->skb, stats); 3084 return -1; 3085 } 3086 3087 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) 3088 { 3089 return NULL; 3090 } 3091 3092 static unsigned long cake_find(struct Qdisc *sch, u32 classid) 3093 { 3094 return 0; 3095 } 3096 3097 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, 3098 u32 classid) 3099 { 3100 return 0; 3101 } 3102 3103 static void cake_unbind(struct Qdisc *q, unsigned long cl) 3104 { 3105 } 3106 3107 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, 3108 struct netlink_ext_ack *extack) 3109 { 3110 struct cake_sched_data *q = qdisc_priv(sch); 3111 3112 if (cl) 3113 return NULL; 3114 return q->block; 3115 } 3116 3117 static int cake_dump_class(struct Qdisc *sch, unsigned long cl, 3118 struct sk_buff *skb, struct tcmsg *tcm) 3119 { 3120 
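	/* CAKE's flow queues are not configurable classes; just report the
	 * requested class ID back in the handle.
	 */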
tcm->tcm_handle |= TC_H_MIN(cl); 3121 return 0; 3122 } 3123 3124 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, 3125 struct gnet_dump *d) 3126 { 3127 struct cake_sched_data *q = qdisc_priv(sch); 3128 const struct cake_flow *flow = NULL; 3129 struct gnet_stats_queue qs = { 0 }; 3130 struct nlattr *stats; 3131 u32 idx = cl - 1; 3132 3133 if (idx < CAKE_QUEUES * q->tin_cnt) { 3134 const struct cake_tin_data *b = \ 3135 &q->tins[q->tin_order[idx / CAKE_QUEUES]]; 3136 const struct sk_buff *skb; 3137 3138 flow = &b->flows[idx % CAKE_QUEUES]; 3139 3140 if (flow->head) { 3141 sch_tree_lock(sch); 3142 skb = flow->head; 3143 while (skb) { 3144 qs.qlen++; 3145 skb = skb->next; 3146 } 3147 sch_tree_unlock(sch); 3148 } 3149 qs.backlog = b->backlogs[idx % CAKE_QUEUES]; 3150 qs.drops = flow->dropped; 3151 } 3152 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) 3153 return -1; 3154 if (flow) { 3155 ktime_t now = ktime_get(); 3156 3157 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 3158 if (!stats) 3159 return -1; 3160 3161 #define PUT_STAT_U32(attr, data) do { \ 3162 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3163 goto nla_put_failure; \ 3164 } while (0) 3165 #define PUT_STAT_S32(attr, data) do { \ 3166 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3167 goto nla_put_failure; \ 3168 } while (0) 3169 3170 PUT_STAT_S32(DEFICIT, flow->deficit); 3171 PUT_STAT_U32(DROPPING, flow->cvars.dropping); 3172 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); 3173 PUT_STAT_U32(P_DROP, flow->cvars.p_drop); 3174 if (flow->cvars.p_drop) { 3175 PUT_STAT_S32(BLUE_TIMER_US, 3176 ktime_to_us( 3177 ktime_sub(now, 3178 flow->cvars.blue_timer))); 3179 } 3180 if (flow->cvars.dropping) { 3181 PUT_STAT_S32(DROP_NEXT_US, 3182 ktime_to_us( 3183 ktime_sub(now, 3184 flow->cvars.drop_next))); 3185 } 3186 3187 if (nla_nest_end(d->skb, stats) < 0) 3188 return -1; 3189 } 3190 3191 return 0; 3192 3193 nla_put_failure: 3194 nla_nest_cancel(d->skb, stats); 3195 return -1; 3196 } 3197 3198 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) 3199 { 3200 struct cake_sched_data *q = qdisc_priv(sch); 3201 unsigned int i, j; 3202 3203 if (arg->stop) 3204 return; 3205 3206 for (i = 0; i < q->tin_cnt; i++) { 3207 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3208 3209 for (j = 0; j < CAKE_QUEUES; j++) { 3210 if (list_empty(&b->flows[j].flowchain)) { 3211 arg->count++; 3212 continue; 3213 } 3214 if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1, 3215 arg)) 3216 break; 3217 } 3218 } 3219 } 3220 3221 static const struct Qdisc_class_ops cake_class_ops = { 3222 .leaf = cake_leaf, 3223 .find = cake_find, 3224 .tcf_block = cake_tcf_block, 3225 .bind_tcf = cake_bind, 3226 .unbind_tcf = cake_unbind, 3227 .dump = cake_dump_class, 3228 .dump_stats = cake_dump_class_stats, 3229 .walk = cake_walk, 3230 }; 3231 3232 static struct Qdisc_ops cake_qdisc_ops __read_mostly = { 3233 .cl_ops = &cake_class_ops, 3234 .id = "cake", 3235 .priv_size = sizeof(struct cake_sched_data), 3236 .enqueue = cake_enqueue, 3237 .dequeue = cake_dequeue, 3238 .peek = qdisc_peek_dequeued, 3239 .init = cake_init, 3240 .reset = cake_reset, 3241 .destroy = cake_destroy, 3242 .change = cake_change, 3243 .dump = cake_dump, 3244 .dump_stats = cake_dump_stats, 3245 .owner = THIS_MODULE, 3246 }; 3247 MODULE_ALIAS_NET_SCH("cake"); 3248 3249 struct cake_mq_sched { 3250 struct mq_sched mq_priv; /* must be first */ 3251 struct cake_sched_config cake_config; 3252 }; 3253 3254 static void cake_mq_destroy(struct Qdisc 
*sch) 3255 { 3256 mq_destroy_common(sch); 3257 } 3258 3259 static int cake_mq_init(struct Qdisc *sch, struct nlattr *opt, 3260 struct netlink_ext_ack *extack) 3261 { 3262 struct cake_mq_sched *priv = qdisc_priv(sch); 3263 struct net_device *dev = qdisc_dev(sch); 3264 int ret, ntx; 3265 bool _unused; 3266 3267 cake_config_init(&priv->cake_config, true); 3268 if (opt) { 3269 ret = cake_config_change(&priv->cake_config, opt, extack, &_unused); 3270 if (ret) 3271 return ret; 3272 } 3273 3274 ret = mq_init_common(sch, opt, extack, &cake_qdisc_ops); 3275 if (ret) 3276 return ret; 3277 3278 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) 3279 cake_config_replace(priv->mq_priv.qdiscs[ntx], &priv->cake_config); 3280 3281 return 0; 3282 } 3283 3284 static int cake_mq_dump(struct Qdisc *sch, struct sk_buff *skb) 3285 { 3286 struct cake_mq_sched *priv = qdisc_priv(sch); 3287 3288 mq_dump_common(sch, skb); 3289 return cake_config_dump(&priv->cake_config, skb); 3290 } 3291 3292 static int cake_mq_change(struct Qdisc *sch, struct nlattr *opt, 3293 struct netlink_ext_ack *extack) 3294 { 3295 struct cake_mq_sched *priv = qdisc_priv(sch); 3296 struct net_device *dev = qdisc_dev(sch); 3297 bool overhead_changed = false; 3298 unsigned int ntx; 3299 int ret; 3300 3301 ret = cake_config_change(&priv->cake_config, opt, extack, &overhead_changed); 3302 if (ret) 3303 return ret; 3304 3305 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { 3306 struct Qdisc *chld = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); 3307 struct cake_sched_data *qd = qdisc_priv(chld); 3308 3309 if (overhead_changed) { 3310 WRITE_ONCE(qd->max_netlen, 0); 3311 WRITE_ONCE(qd->max_adjlen, 0); 3312 WRITE_ONCE(qd->min_netlen, ~0); 3313 WRITE_ONCE(qd->min_adjlen, ~0); 3314 } 3315 3316 if (qd->tins) { 3317 sch_tree_lock(chld); 3318 cake_reconfigure(chld); 3319 sch_tree_unlock(chld); 3320 } 3321 } 3322 3323 return 0; 3324 } 3325 3326 static int cake_mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, 3327 struct Qdisc **old, struct netlink_ext_ack *extack) 3328 { 3329 NL_SET_ERR_MSG(extack, "can't replace cake_mq sub-qdiscs"); 3330 return -EOPNOTSUPP; 3331 } 3332 3333 static const struct Qdisc_class_ops cake_mq_class_ops = { 3334 .select_queue = mq_select_queue, 3335 .graft = cake_mq_graft, 3336 .leaf = mq_leaf, 3337 .find = mq_find, 3338 .walk = mq_walk, 3339 .dump = mq_dump_class, 3340 .dump_stats = mq_dump_class_stats, 3341 }; 3342 3343 static struct Qdisc_ops cake_mq_qdisc_ops __read_mostly = { 3344 .cl_ops = &cake_mq_class_ops, 3345 .id = "cake_mq", 3346 .priv_size = sizeof(struct cake_mq_sched), 3347 .init = cake_mq_init, 3348 .destroy = cake_mq_destroy, 3349 .attach = mq_attach, 3350 .change = cake_mq_change, 3351 .change_real_num_tx = mq_change_real_num_tx, 3352 .dump = cake_mq_dump, 3353 .owner = THIS_MODULE, 3354 }; 3355 MODULE_ALIAS_NET_SCH("cake_mq"); 3356 3357 static int __init cake_module_init(void) 3358 { 3359 int ret; 3360 3361 ret = register_qdisc(&cake_qdisc_ops); 3362 if (ret) 3363 return ret; 3364 3365 ret = register_qdisc(&cake_mq_qdisc_ops); 3366 if (ret) 3367 unregister_qdisc(&cake_qdisc_ops); 3368 3369 return ret; 3370 } 3371 3372 static void __exit cake_module_exit(void) 3373 { 3374 unregister_qdisc(&cake_qdisc_ops); 3375 unregister_qdisc(&cake_mq_qdisc_ops); 3376 } 3377 3378 module_init(cake_module_init) 3379 module_exit(cake_module_exit) 3380 MODULE_AUTHOR("Jonathan Morton"); 3381 MODULE_LICENSE("Dual BSD/GPL"); 3382 MODULE_DESCRIPTION("The CAKE shaper."); 3383 
MODULE_IMPORT_NS("NET_SCHED_INTERNAL"); 3384