// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/* COMMON Applications Kept Enhanced (CAKE) discipline
 *
 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
 *
 * The CAKE Principles:
 *   (or, how to have your cake and eat it too)
 *
 * This is a combination of several shaping, AQM and FQ techniques into one
 * easy-to-use package:
 *
 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
 *   equipment and bloated MACs. This operates in deficit mode (as in sch_fq),
 *   eliminating the need for any sort of burst parameter (eg. token bucket
 *   depth). Burst support is limited to that necessary to overcome scheduling
 *   latency.
 *
 * - A Diffserv-aware priority queue, giving more priority to certain classes,
 *   up to a specified fraction of bandwidth. Above that bandwidth threshold,
 *   the priority is reduced to avoid starving other tins.
 *
 * - Each priority tin has a separate Flow Queue system, to isolate traffic
 *   flows from each other. This prevents a burst on one flow from increasing
 *   the delay to another. Flows are distributed to queues using a
 *   set-associative hash function.
 *
 * - Each queue is actively managed by Cobalt, which is a combination of the
 *   Codel and Blue AQM algorithms. This serves flows fairly, and signals
 *   congestion early via ECN (if available) and/or packet drops, to keep
 *   latency low. The codel parameters are auto-tuned based on the bandwidth
 *   setting, as is necessary at low bandwidths.
 *
 * The configuration parameters are kept deliberately simple for ease of use.
 * Everything has sane defaults. Complete generality of configuration is *not*
 * a goal.
 *
 * The priority queue operates according to a weighted DRR scheme, combined with
 * a bandwidth tracker which reuses the shaper logic to detect which side of the
 * bandwidth sharing threshold the tin is operating. This determines whether a
 * priority-based weight (high) or a bandwidth-based weight (low) is used for
 * that tin in the current pass.
 *
 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
 * granted us permission to leverage.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/reciprocal_div.h>
#include <net/netlink.h>
#include <linux/if_vlan.h>
#include <net/gso.h>
#include <net/pkt_sched.h>
#include <net/sch_priv.h>
#include <net/pkt_cls.h>
#include <net/tcp.h>
#include <net/flow_dissector.h>

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#endif

#define CAKE_SET_WAYS (8)
#define CAKE_MAX_TINS (8)
#define CAKE_QUEUES (1024)
#define CAKE_FLOW_MASK 63
#define CAKE_FLOW_NAT_FLAG 64

/* struct cobalt_params - contains codel and blue parameters
 * @interval:	codel initial drop rate
 * @target:	maximum persistent sojourn time & blue update rate
 * @mtu_time:	serialisation delay of maximum-size packet
 * @p_inc:	increment of blue drop probability (0.32 fxp)
 * @p_dec:	decrement of blue drop probability (0.32 fxp)
 */
struct cobalt_params {
	u64	interval;
	u64	target;
	u64	mtu_time;
	u32	p_inc;
	u32	p_dec;
};

/* struct cobalt_vars - contains codel and blue variables
 * @count:		codel dropping frequency
 * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
 * @drop_next:		time to drop next packet, or when we dropped last
 * @blue_timer:		Blue time to next drop
 * @p_drop:		BLUE drop probability (0.32 fxp)
 * @dropping:		set if in dropping state
 * @ecn_marked:		set if marked
 */
struct cobalt_vars {
	u32	count;
	u32	rec_inv_sqrt;
	ktime_t	drop_next;
	ktime_t	blue_timer;
	u32     p_drop;
	bool	dropping;
	bool    ecn_marked;
};

/* Flow queue occupancy states; SPARSE flows get priority boost in DRR. */
enum {
	CAKE_SET_NONE = 0,
	CAKE_SET_SPARSE,
	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
	CAKE_SET_BULK,
	CAKE_SET_DECAYING
};

struct cake_flow {
	/* this stuff is all needed per-flow at dequeue time */
	struct sk_buff	  *head;
	struct sk_buff	  *tail;
	struct list_head  flowchain;
	s32		  deficit;
	u32		  dropped;
	struct cobalt_vars cvars;
	u16		  srchost; /* index into cake_host table */
	u16		  dsthost;
	u8		  set;
}; /* please try to keep this structure <= 64 bytes */

/* Per-host state for triple isolation (count of bulk flows per host). */
struct cake_host {
	u32 srchost_tag;
	u32 dsthost_tag;
	u16 srchost_bulk_flow_count;
	u16 dsthost_bulk_flow_count;
};

/* Packed (tin, bucket) index for the overflow heap. */
struct cake_heap_entry {
	u16 t:3, b:10;
};

struct cake_tin_data {
	struct cake_flow flows[CAKE_QUEUES];
	u32	backlogs[CAKE_QUEUES];
	u32	tags[CAKE_QUEUES]; /* for set association */
	u16	overflow_idx[CAKE_QUEUES];
	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
	u16	flow_quantum;

	struct cobalt_params cparams;
	u32	drop_overlimit;
	u16	bulk_flow_count;
	u16	sparse_flow_count;
	u16	decaying_flow_count;
	u16	unresponsive_flow_count;

	u32	max_skblen;

	struct list_head new_flows;
	struct list_head old_flows;
	struct list_head decaying_flows;

	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
	ktime_t	time_next_packet;
	u64	tin_rate_ns;
	u64	tin_rate_bps;
	u16	tin_rate_shft;

	u16	tin_quantum;
	s32	tin_deficit;
	u32	tin_backlog;
	u32	tin_dropped;
	u32	tin_ecn_mark;

	u32	packets;
	u64	bytes;

	u32	ack_drops;

	/* moving averages */
	u64 avge_delay;
	u64 peak_delay;
	u64 base_delay;

	/* hash function stats */
	u32	way_directs;
	u32	way_hits;
	u32	way_misses;
	u32	way_collisions;
}; /* number of tins is small, so size of this struct doesn't matter much */

/* User-supplied configuration, kept separate from runtime state.
 * NOTE(review): sync_time/is_shared appear related to mq sharing of one
 * config between per-queue instances — confirm against the rest of the file.
 */
struct cake_sched_config {
	u64	rate_bps;
	u64	interval;
	u64	target;
	u64	sync_time;
	u32	buffer_config_limit;
	u32	fwmark_mask;
	u16	fwmark_shft;
	s16	rate_overhead;
	u16	rate_mpu;
	u16	rate_flags;
	u8	tin_mode;
	u8	flow_mode;
	u8	atm_mode;
	u8	ack_filter;
	u8	is_shared;
};

struct cake_sched_data {
	struct tcf_proto __rcu *filter_list; /* optional external classifier */
	struct tcf_block *block;
	struct cake_tin_data *tins;
	struct cake_sched_config *config;

	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];

	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
	ktime_t		time_next_packet;
	ktime_t		failsafe_next_packet;
	u64		rate_ns;
	u16		rate_shft;
	u16		overflow_timeout;
	u16		tin_cnt;

	/* resource tracking */
	u32		buffer_used;
	u32		buffer_max_used;
	u32		buffer_limit;

	/* indices for dequeue */
	u16		cur_tin;
	u16		cur_flow;

	struct qdisc_watchdog watchdog;
	const u8	*tin_index;
	const u8	*tin_order;

	/* bandwidth capacity estimate */
	ktime_t		last_packet_time;
	ktime_t		avg_window_begin;
	u64		avg_packet_interval;
	u64		avg_window_bytes;
	u64		avg_peak_bandwidth;
	ktime_t		last_reconfig_time;

	/* packet length stats */
	u32		avg_netoff;
	u16		max_netlen;
	u16		max_adjlen;
	u16		min_netlen;
	u16		min_adjlen;

	/* mq sync state */
	u64		last_checked_active;
	u64		last_active;
	u32		active_queues;
};

enum {
	CAKE_FLAG_OVERHEAD	   = BIT(0),
	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
	CAKE_FLAG_INGRESS	   = BIT(2),
	CAKE_FLAG_WASH		   = BIT(3),
	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
};

/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
 * obtain the best features of each. Codel is excellent on flows which
 * respond to congestion signals in a TCP-like way. BLUE is more effective on
 * unresponsive flows.
 */

/* Per-skb state stored in the qdisc cb area. */
struct cobalt_skb_cb {
	ktime_t enqueue_time;
	u32     adjusted_len;
};

static u64 us_to_ns(u64 us)
{
	return us * NSEC_PER_USEC;
}

static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
}

static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
{
	return get_cobalt_cb(skb)->enqueue_time;
}

static void cobalt_set_enqueue_time(struct sk_buff *skb,
				    ktime_t now)
{
	get_cobalt_cb(skb)->enqueue_time = now;
}

/* Reciprocal-division lookup table indexed by host flow count;
 * presumably populated at module init (initialiser not visible here).
 */
static u16 quantum_div[CAKE_QUEUES + 1] = {0};

/* Diffserv lookup tables */

static const u8 precedence[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 4, 4, 4,
	5, 5, 5, 5, 5, 5, 5, 5,
	6, 6, 6, 6, 6, 6, 6, 6,
	7, 7, 7, 7, 7, 7, 7, 7,
};

static const u8 diffserv8[] = {
	2, 0, 1, 2, 4, 2, 2, 2,
	1, 2, 1, 2, 1, 2, 1, 2,
	5, 2, 4, 2, 4, 2, 4, 2,
	3, 2, 3, 2, 3, 2, 3, 2,
	6, 2, 3, 2, 3, 2, 3, 2,
	6, 2, 2, 2, 6, 2, 6, 2,
	7, 2, 2, 2, 2, 2, 2, 2,
	7, 2, 2, 2, 2, 2, 2, 2,
};

static const u8 diffserv4[] = {
	0, 1, 0, 0, 2, 0, 0, 0,
	1, 0, 0, 0, 0, 0, 0, 0,
	2, 0, 2, 0, 2, 0, 2, 0,
	2, 0, 2, 0, 2, 0, 2, 0,
	3, 0, 2, 0, 2, 0, 2, 0,
	3, 0, 0, 0, 3, 0, 3, 0,
	3, 0, 0, 0, 0, 0, 0, 0,
	3, 0, 0, 0, 0, 0, 0, 0,
};

static const u8 diffserv3[] = {
	0, 1, 0, 0, 2, 0, 0, 0,
	1, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 2, 0, 2, 0,
	2, 0, 0, 0, 0, 0, 0, 0,
	2, 0, 0, 0, 0, 0, 0, 0,
};

static const u8 besteffort[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
};

/* tin priority order for stats dumping */

static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
static const u8 bulk_order[] = {1, 0, 2, 3};

/* There is a big difference in timing between the accurate values placed in the
 * cache and the approximations given by a single Newton step for small count
 * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
 * these values are calculated using eight Newton steps, using the
 * implementation below. Above 16, a single Newton step gives sufficient
 * accuracy in either direction, given the precision stored.
 *
 * The magnitude of the error when stepping up to count 2 is such as to give the
 * value that *should* have been produced at count 4.
 */

#define REC_INV_SQRT_CACHE (16)
static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
		~0,         ~0, 3037000500, 2479700525,
	2147483647, 1920767767, 1753413056, 1623345051,
	1518500250, 1431655765, 1358187914, 1294981364,
	1239850263, 1191209601, 1147878294, 1108955788
};

static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
			  u64 target_ns, u64 rtt_est_ns);
/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 *
 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 */

static void cobalt_newton_step(struct cobalt_vars *vars)
{
	u32 invsqrt, invsqrt2;
	u64 val;

	invsqrt = vars->rec_inv_sqrt;
	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
	val = (3LL << 32) - ((u64)vars->count * invsqrt2);

	val >>= 2; /* avoid overflow in following multiply */
	/* >> (32 - 2 + 1): undo the pre-shift by 2 and divide by 2 (Q0.32) */
	val = (val * invsqrt) >> (32 - 2 + 1);

	vars->rec_inv_sqrt = val;
}

static void cobalt_invsqrt(struct cobalt_vars *vars)
{
	if (vars->count < REC_INV_SQRT_CACHE)
		vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
	else
		cobalt_newton_step(vars);
}

static void cobalt_vars_init(struct cobalt_vars *vars)
{
	memset(vars, 0, sizeof(*vars));
}

/* CoDel control_law is t + interval/sqrt(count)
 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
 * both sqrt() and divide operation.
 */
static ktime_t cobalt_control(ktime_t t,
			      u64 interval,
			      u32 rec_inv_sqrt)
{
	return ktime_add_ns(t, reciprocal_scale(interval,
						rec_inv_sqrt));
}

/* Call this when a packet had to be dropped due to queue overflow. Returns
 * true if the BLUE state was quiescent before but active after this call.
 */
static bool cobalt_queue_full(struct cobalt_vars *vars,
			      struct cobalt_params *p,
			      ktime_t now)
{
	bool up = false;

	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
		up = !vars->p_drop;
		vars->p_drop += p->p_inc;
		if (vars->p_drop < p->p_inc)	/* saturate on wrap-around */
			vars->p_drop = ~0;
		vars->blue_timer = now;
	}
	vars->dropping = true;
	vars->drop_next = now;
	if (!vars->count)
		vars->count = 1;

	return up;
}

/* Call this when the queue was serviced but turned out to be empty. Returns
 * true if the BLUE state was active before but quiescent after this call.
 */
static bool cobalt_queue_empty(struct cobalt_vars *vars,
			       struct cobalt_params *p,
			       ktime_t now)
{
	bool down = false;

	if (vars->p_drop &&
	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
		if (vars->p_drop < p->p_dec)
			vars->p_drop = 0;
		else
			vars->p_drop -= p->p_dec;
		vars->blue_timer = now;
		down = !vars->p_drop;
	}
	vars->dropping = false;

	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
		vars->count--;
		cobalt_invsqrt(vars);
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
	}

	return down;
}

/* Call this with a freshly dequeued packet for possible congestion marking.
 * Returns a drop reason as an instruction to drop the packet, or
 * SKB_NOT_DROPPED_YET for delivery (possibly after ECN-marking the packet).
 */
static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
					       struct cobalt_params *p,
					       ktime_t now,
					       struct sk_buff *skb,
					       u32 bulk_flows)
{
	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
	bool next_due, over_target;
	ktime_t schedule;
	u64 sojourn;

/* The 'schedule' variable records, in its sign, whether 'now' is before or
 * after 'drop_next'. This allows 'drop_next' to be updated before the next
 * scheduling decision is actually branched, without destroying that
 * information. Similarly, the first 'schedule' value calculated is preserved
 * in the boolean 'next_due'.
 *
 * As for 'drop_next', we take advantage of the fact that 'interval' is both
 * the delay between first exceeding 'target' and the first signalling event,
 * *and* the scaling factor for the signalling frequency. It's therefore very
 * natural to use a single mechanism for both purposes, and eliminates a
 * significant amount of reference Codel's spaghetti code. To help with this,
 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
 * as possible to 1.0 in fixed-point.
 */

	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
	schedule = ktime_sub(now, vars->drop_next);
	over_target = sojourn > p->target &&
		      sojourn > p->mtu_time * bulk_flows * 2 &&
		      sojourn > p->mtu_time * 4;
	next_due = vars->count && ktime_to_ns(schedule) >= 0;

	vars->ecn_marked = false;

	if (over_target) {
		if (!vars->dropping) {
			vars->dropping = true;
			vars->drop_next = cobalt_control(now,
							 p->interval,
							 vars->rec_inv_sqrt);
		}
		if (!vars->count)
			vars->count = 1;
	} else if (vars->dropping) {
		vars->dropping = false;
	}

	if (next_due && vars->dropping) {
		/* Use ECN mark if possible, otherwise drop */
		if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
			reason = SKB_DROP_REASON_QDISC_CONGESTED;

		vars->count++;
		if (!vars->count)	/* saturate count on overflow */
			vars->count--;
		cobalt_invsqrt(vars);
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
	} else {
		while (next_due) {
			vars->count--;
			cobalt_invsqrt(vars);
			vars->drop_next = cobalt_control(vars->drop_next,
							 p->interval,
							 vars->rec_inv_sqrt);
schedule = ktime_sub(now, vars->drop_next); 568 next_due = vars->count && ktime_to_ns(schedule) >= 0; 569 } 570 } 571 572 /* Simple BLUE implementation. Lack of ECN is deliberate. */ 573 if (vars->p_drop && reason == SKB_NOT_DROPPED_YET && 574 get_random_u32() < vars->p_drop) 575 reason = SKB_DROP_REASON_CAKE_FLOOD; 576 577 /* Overload the drop_next field as an activity timeout */ 578 if (!vars->count) 579 vars->drop_next = ktime_add_ns(now, p->interval); 580 else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET) 581 vars->drop_next = now; 582 583 return reason; 584 } 585 586 static bool cake_update_flowkeys(struct flow_keys *keys, 587 const struct sk_buff *skb) 588 { 589 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 590 struct nf_conntrack_tuple tuple = {}; 591 bool rev = !skb->_nfct, upd = false; 592 __be32 ip; 593 594 if (skb_protocol(skb, true) != htons(ETH_P_IP)) 595 return false; 596 597 if (!nf_ct_get_tuple_skb(&tuple, skb)) 598 return false; 599 600 ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; 601 if (ip != keys->addrs.v4addrs.src) { 602 keys->addrs.v4addrs.src = ip; 603 upd = true; 604 } 605 ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; 606 if (ip != keys->addrs.v4addrs.dst) { 607 keys->addrs.v4addrs.dst = ip; 608 upd = true; 609 } 610 611 if (keys->ports.ports) { 612 __be16 port; 613 614 port = rev ? tuple.dst.u.all : tuple.src.u.all; 615 if (port != keys->ports.src) { 616 keys->ports.src = port; 617 upd = true; 618 } 619 port = rev ? tuple.src.u.all : tuple.dst.u.all; 620 if (port != keys->ports.dst) { 621 port = keys->ports.dst; 622 upd = true; 623 } 624 } 625 return upd; 626 #else 627 return false; 628 #endif 629 } 630 631 /* Cake has several subtle multiple bit settings. In these cases you 632 * would be matching triple isolate mode as well. 
 */

static bool cake_dsrc(int flow_mode)
{
	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
}

static bool cake_ddst(int flow_mode)
{
	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
}

/* Decrement the per-src-host bulk flow refcount (clamped at zero). */
static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
					     struct cake_flow *flow,
					     int flow_mode)
{
	if (likely(cake_dsrc(flow_mode) &&
		   q->hosts[flow->srchost].srchost_bulk_flow_count))
		q->hosts[flow->srchost].srchost_bulk_flow_count--;
}

/* Increment the per-src-host bulk flow refcount (clamped at CAKE_QUEUES). */
static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
					     struct cake_flow *flow,
					     int flow_mode)
{
	if (likely(cake_dsrc(flow_mode) &&
		   q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
		q->hosts[flow->srchost].srchost_bulk_flow_count++;
}

/* Decrement the per-dst-host bulk flow refcount (clamped at zero). */
static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
					     struct cake_flow *flow,
					     int flow_mode)
{
	if (likely(cake_ddst(flow_mode) &&
		   q->hosts[flow->dsthost].dsthost_bulk_flow_count))
		q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
}

/* Increment the per-dst-host bulk flow refcount (clamped at CAKE_QUEUES). */
static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
					     struct cake_flow *flow,
					     int flow_mode)
{
	if (likely(cake_ddst(flow_mode) &&
		   q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
		q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
}

/* DRR quantum for this flow, scaled down by the busiest host's bulk flow
 * count so that hosts (not flows) share capacity in dual/triple-isolate
 * modes.
 */
static u16 cake_get_flow_quantum(struct cake_tin_data *q,
				 struct cake_flow *flow,
				 int flow_mode)
{
	u16 host_load = 1;

	if (cake_dsrc(flow_mode))
		host_load = max(host_load,
				q->hosts[flow->srchost].srchost_bulk_flow_count);

	if (cake_ddst(flow_mode))
		host_load = max(host_load,
				q->hosts[flow->dsthost].dsthost_bulk_flow_count);

	/* The get_random_u16() is a way to apply dithering to avoid
	 * accumulating roundoff errors
	 */
	return (q->flow_quantum * quantum_div[host_load] +
		get_random_u16()) >> 16;
}

static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
		     int flow_mode, u16 flow_override, u16 host_override)
{
	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
	u16 reduced_hash, srchost_idx, dsthost_idx;
	struct flow_keys keys, host_keys;
	bool use_skbhash = skb->l4_hash;

	if (unlikely(flow_mode == CAKE_FLOW_NONE))
		return 0;

	/* If both overrides are set, or we can use the SKB hash and nat mode is
	 * disabled, we can skip packet dissection entirely. If nat mode is
	 * enabled there's another check below after doing the conntrack lookup.
	 */
	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
		goto skip_hash;

	skb_flow_dissect_flow_keys(skb, &keys,
				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

	/* Don't use the SKB hash if we change the lookup keys from conntrack */
	if (nat_enabled && cake_update_flowkeys(&keys, skb))
		use_skbhash = false;

	/* If we can still use the SKB hash and don't need the host hash, we can
	 * skip the rest of the hashing procedure
	 */
	if (use_skbhash && !hash_hosts)
		goto skip_hash;

	/* flow_hash_from_keys() sorts the addresses by value, so we have
	 * to preserve their order in a separate data structure to treat
	 * src and dst host addresses as independently selectable.
	 */
	host_keys = keys;
	host_keys.ports.ports = 0;
	host_keys.basic.ip_proto = 0;
	host_keys.keyid.keyid = 0;
	host_keys.tags.flow_label = 0;

	switch (host_keys.control.addr_type) {
	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
		host_keys.addrs.v4addrs.src = 0;
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
		host_keys.addrs.v4addrs.dst = 0;
		srchost_hash = flow_hash_from_keys(&host_keys);
		break;

	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
		memset(&host_keys.addrs.v6addrs.src, 0,
		       sizeof(host_keys.addrs.v6addrs.src));
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
		memset(&host_keys.addrs.v6addrs.dst, 0,
		       sizeof(host_keys.addrs.v6addrs.dst));
		srchost_hash = flow_hash_from_keys(&host_keys);
		break;

	default:
		dsthost_hash = 0;
		srchost_hash = 0;
	}

	/* This *must* be after the above switch, since as a
	 * side-effect it sorts the src and dst addresses.
	 */
	if (hash_flows && !use_skbhash)
		flow_hash = flow_hash_from_keys(&keys);

skip_hash:
	if (flow_override)
		flow_hash = flow_override - 1;
	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
		flow_hash = skb->hash;
	if (host_override) {
		dsthost_hash = host_override - 1;
		srchost_hash = host_override - 1;
	}

	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
		if (flow_mode & CAKE_FLOW_SRC_IP)
			flow_hash ^= srchost_hash;

		if (flow_mode & CAKE_FLOW_DST_IP)
			flow_hash ^= dsthost_hash;
	}

	reduced_hash = flow_hash % CAKE_QUEUES;

	/* set-associative hashing */
	/* fast path if no hash collision (direct lookup succeeds) */
	if (likely(q->tags[reduced_hash] == flow_hash &&
		   q->flows[reduced_hash].set)) {
		q->way_directs++;
	} else {
		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
		u32 outer_hash = reduced_hash - inner_hash;
		bool allocate_src = false;
		bool allocate_dst = false;
		u32 i, k;

		/* check if any active queue in the set is reserved for
		 * this flow.
		 */
		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
		     i++, k = (k + 1) % CAKE_SET_WAYS) {
			if (q->tags[outer_hash + k] == flow_hash) {
				if (i)
					q->way_hits++;

				if (!q->flows[outer_hash + k].set) {
					/* need to increment host refcnts */
					allocate_src = cake_dsrc(flow_mode);
					allocate_dst = cake_ddst(flow_mode);
				}

				goto found;
			}
		}

		/* no queue is reserved for this flow, look for an
		 * empty one.  Note: 'k' deliberately continues from where the
		 * previous scan left off.
		 */
		for (i = 0; i < CAKE_SET_WAYS;
		     i++, k = (k + 1) % CAKE_SET_WAYS) {
			if (!q->flows[outer_hash + k].set) {
				q->way_misses++;
				allocate_src = cake_dsrc(flow_mode);
				allocate_dst = cake_ddst(flow_mode);
				goto found;
			}
		}

		/* With no empty queues, default to the original
		 * queue, accept the collision, update the host tags.
		 */
		q->way_collisions++;
		allocate_src = cake_dsrc(flow_mode);
		allocate_dst = cake_ddst(flow_mode);

		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
			cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
			cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
		}
found:
		/* reserve queue for future packets in same flow */
		reduced_hash = outer_hash + k;
		q->tags[reduced_hash] = flow_hash;

		if (allocate_src) {
			srchost_idx = srchost_hash % CAKE_QUEUES;
			inner_hash = srchost_idx % CAKE_SET_WAYS;
			outer_hash = srchost_idx - inner_hash;
			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (q->hosts[outer_hash + k].srchost_tag ==
				    srchost_hash)
					goto found_src;
			}
			for (i = 0; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
					break;
			}
			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
found_src:
			srchost_idx = outer_hash + k;
			q->flows[reduced_hash].srchost = srchost_idx;

			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
				cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
		}

		if (allocate_dst) {
			dsthost_idx = dsthost_hash % CAKE_QUEUES;
			inner_hash = dsthost_idx % CAKE_SET_WAYS;
			outer_hash = dsthost_idx - inner_hash;
			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (q->hosts[outer_hash + k].dsthost_tag ==
				    dsthost_hash)
					goto found_dst;
			}
			for (i = 0; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
					break;
			}
			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
found_dst:
			dsthost_idx = outer_hash + k;
			q->flows[reduced_hash].dsthost = dsthost_idx;

			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); 902 } 903 } 904 905 return reduced_hash; 906 } 907 908 /* helper functions : might be changed when/if skb use a standard list_head */ 909 /* remove one skb from head of slot queue */ 910 911 static struct sk_buff *dequeue_head(struct cake_flow *flow) 912 { 913 struct sk_buff *skb = flow->head; 914 915 if (skb) { 916 flow->head = skb->next; 917 skb_mark_not_on_list(skb); 918 } 919 920 return skb; 921 } 922 923 /* add skb to flow queue (tail add) */ 924 925 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) 926 { 927 if (!flow->head) 928 flow->head = skb; 929 else 930 flow->tail->next = skb; 931 flow->tail = skb; 932 skb->next = NULL; 933 } 934 935 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, 936 struct ipv6hdr *buf) 937 { 938 unsigned int offset = skb_network_offset(skb); 939 struct iphdr *iph; 940 941 iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); 942 943 if (!iph) 944 return NULL; 945 946 if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) 947 return skb_header_pointer(skb, offset + iph->ihl * 4, 948 sizeof(struct ipv6hdr), buf); 949 950 else if (iph->version == 4) 951 return iph; 952 953 else if (iph->version == 6) 954 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), 955 buf); 956 957 return NULL; 958 } 959 960 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, 961 void *buf, unsigned int bufsize) 962 { 963 unsigned int offset = skb_network_offset(skb); 964 const struct ipv6hdr *ipv6h; 965 const struct tcphdr *tcph; 966 const struct iphdr *iph; 967 struct ipv6hdr _ipv6h; 968 struct tcphdr _tcph; 969 970 ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); 971 972 if (!ipv6h) 973 return NULL; 974 975 if (ipv6h->version == 4) { 976 iph = (struct iphdr *)ipv6h; 977 offset += iph->ihl * 4; 978 979 /* special-case 6in4 tunnelling, as that is a common way to get 980 * v6 connectivity in the home 
981 */ 982 if (iph->protocol == IPPROTO_IPV6) { 983 ipv6h = skb_header_pointer(skb, offset, 984 sizeof(_ipv6h), &_ipv6h); 985 986 if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) 987 return NULL; 988 989 offset += sizeof(struct ipv6hdr); 990 991 } else if (iph->protocol != IPPROTO_TCP) { 992 return NULL; 993 } 994 995 } else if (ipv6h->version == 6) { 996 if (ipv6h->nexthdr != IPPROTO_TCP) 997 return NULL; 998 999 offset += sizeof(struct ipv6hdr); 1000 } else { 1001 return NULL; 1002 } 1003 1004 tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); 1005 if (!tcph || tcph->doff < 5) 1006 return NULL; 1007 1008 return skb_header_pointer(skb, offset, 1009 min(__tcp_hdrlen(tcph), bufsize), buf); 1010 } 1011 1012 static const void *cake_get_tcpopt(const struct tcphdr *tcph, 1013 int code, int *oplen) 1014 { 1015 /* inspired by tcp_parse_options in tcp_input.c */ 1016 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1017 const u8 *ptr = (const u8 *)(tcph + 1); 1018 1019 while (length > 0) { 1020 int opcode = *ptr++; 1021 int opsize; 1022 1023 if (opcode == TCPOPT_EOL) 1024 break; 1025 if (opcode == TCPOPT_NOP) { 1026 length--; 1027 continue; 1028 } 1029 if (length < 2) 1030 break; 1031 opsize = *ptr++; 1032 if (opsize < 2 || opsize > length) 1033 break; 1034 1035 if (opcode == code) { 1036 *oplen = opsize; 1037 return ptr; 1038 } 1039 1040 ptr += opsize - 2; 1041 length -= opsize; 1042 } 1043 1044 return NULL; 1045 } 1046 1047 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more 1048 * bytes than the other. In the case where both sequences ACKs bytes that the 1049 * other doesn't, A is considered greater. DSACKs in A also makes A be 1050 * considered greater. 
 *
 * @return -1, 0 or 1 as normal compare functions
 */
static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
				  const struct tcphdr *tcph_b)
{
	const struct tcp_sack_block_wire *sack_a, *sack_b;
	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
	u32 bytes_a = 0, bytes_b = 0;
	int oplen_a, oplen_b;
	bool first = true;

	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);

	/* pointers point to option contents */
	oplen_a -= TCPOLEN_SACK_BASE;
	oplen_b -= TCPOLEN_SACK_BASE;

	/* fast paths: only one side (or neither) has any SACK blocks */
	if (sack_a && oplen_a >= sizeof(*sack_a) &&
	    (!sack_b || oplen_b < sizeof(*sack_b)))
		return -1;
	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
		 (!sack_a || oplen_a < sizeof(*sack_a)))
		return 1;
	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
		 (!sack_b || oplen_b < sizeof(*sack_b)))
		return 0;

	/* every block of A must be covered by some block of B, else A
	 * SACKs data that B doesn't and is considered greater
	 */
	while (oplen_a >= sizeof(*sack_a)) {
		const struct tcp_sack_block_wire *sack_tmp = sack_b;
		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
		int oplen_tmp = oplen_b;
		bool found = false;

		/* DSACK; always considered greater to prevent dropping */
		if (before(start_a, ack_seq_a))
			return -1;

		bytes_a += end_a - start_a;

		while (oplen_tmp >= sizeof(*sack_tmp)) {
			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);

			/* first time through we count the total size */
			if (first)
				bytes_b += end_b - start_b;

			/* B covers A's block iff it starts no later and
			 * ends no earlier
			 */
			if (!after(start_b, start_a) && !before(end_b, end_a)) {
				found = true;
				if (!first)
					break;
			}
			oplen_tmp -= sizeof(*sack_tmp);
			sack_tmp++;
		}

		if (!found)
			return -1;

		oplen_a -= sizeof(*sack_a);
		sack_a++;
		first = false;
	}

	/* If we made it this far, all ranges SACKed by A are covered by B, so
	 * either the SACKs are equal, or B SACKs more bytes.
	 */
	return bytes_b > bytes_a ? 1 : 0;
}

/* Extract TSval/TSecr from the TCP timestamp option, if present.  The
 * outputs are left untouched when no well-formed option is found.
 */
static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
				 u32 *tsval, u32 *tsecr)
{
	const u8 *ptr;
	int opsize;

	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);

	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
		*tsval = get_unaligned_be32(ptr);
		*tsecr = get_unaligned_be32(ptr + 4);
	}
}

/* Decide whether an earlier queued ACK may be dropped in favour of a
 * newer one carrying timestamps @tstamp_new/@tsecr_new.  Only "plain"
 * ACKs qualify: ACK set, no other flags (ECE/CWR are checked by the
 * caller), no unknown or state-changing TCP options, and timestamps no
 * newer than the triggering packet's.
 */
static bool cake_tcph_may_drop(const struct tcphdr *tcph,
			       u32 tstamp_new, u32 tsecr_new)
{
	/* inspired by tcp_parse_options in tcp_input.c */
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
	const u8 *ptr = (const u8 *)(tcph + 1);
	u32 tstamp, tsecr;

	/* 3 reserved flags must be unset to avoid future breakage
	 * ACK must be set
	 * ECE/CWR are handled separately
	 * All other flags URG/PSH/RST/SYN/FIN must be unset
	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
	 * 0x00C00000 = CWR/ECE (handled separately)
	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
	 */
	if (((tcp_flag_word(tcph) &
	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
		return false;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {
			length--;
			continue;
		}
		if (length < 2)
			break;
		opsize = *ptr++;
		if (opsize < 2 || opsize > length)
			break;

		switch (opcode) {
		case TCPOPT_MD5SIG: /* doesn't influence state */
			break;

		case TCPOPT_SACK: /* stricter checking performed later */
			if (opsize % 8 != 2)
				return false;
			break;

		case TCPOPT_TIMESTAMP:
			/* only drop timestamps lower than new */
			if (opsize != TCPOLEN_TIMESTAMP)
				return false;
			tstamp = get_unaligned_be32(ptr);
			tsecr = get_unaligned_be32(ptr + 4);
			if (after(tstamp, tstamp_new) ||
			    after(tsecr, tsecr_new))
				return false;
			break;

		case TCPOPT_MSS: /* these should only be set on SYN */
		case TCPOPT_WINDOW:
		case TCPOPT_SACK_PERM:
		case TCPOPT_FASTOPEN:
		case TCPOPT_EXP:
		default: /* don't drop if any unknown options are present */
			return false;
		}

		ptr += opsize - 2;
		length -= opsize;
	}

	return true;
}

/* cake_ack_filter - look for a redundant pure ACK to drop from @flow
 *
 * Called after a new packet has been appended at the flow's tail.
 * Returns the unlinked, droppable ACK (ownership passes to the caller)
 * or NULL if no eligible ACK was found.
 */
static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
				       struct cake_flow *flow)
{
	bool aggressive = q->config->ack_filter == CAKE_ACK_AGGRESSIVE;
	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
	struct sk_buff *skb_check, *skb_prev = NULL;
	const struct ipv6hdr *ipv6h, *ipv6h_check;
	unsigned char _tcph[64], _tcph_check[64];
	const struct tcphdr *tcph, *tcph_check;
	const struct iphdr *iph, *iph_check;
	struct ipv6hdr _iph, _iph_check;
	const struct sk_buff *skb;
	int seglen, num_found = 0;
	u32 tstamp = 0, tsecr = 0;
	__be32 elig_flags = 0;
	int sack_comp;

	/* no other possible ACKs to filter */
	if (flow->head == flow->tail)
		return NULL;

	skb = flow->tail;
	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
	iph = cake_get_iphdr(skb, &_iph);
	if (!tcph)
		return NULL;

	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);

	/* the 'triggering' packet need only have the ACK flag set.
	 * also check that SYN is not set, as there won't be any previous ACKs.
	 */
	if ((tcp_flag_word(tcph) &
	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
		return NULL;

	/* the 'triggering' ACK is at the tail of the queue, we have already
	 * returned if it is the only packet in the flow. loop through the rest
	 * of the queue looking for pure ACKs with the same 5-tuple as the
	 * triggering one.
	 */
	for (skb_check = flow->head;
	     skb_check && skb_check != skb;
	     skb_prev = skb_check, skb_check = skb_check->next) {
		iph_check = cake_get_iphdr(skb_check, &_iph_check);
		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
					     sizeof(_tcph_check));

		/* only TCP packets with matching 5-tuple are eligible, and only
		 * drop safe headers
		 */
		if (!tcph_check || iph->version != iph_check->version ||
		    tcph_check->source != tcph->source ||
		    tcph_check->dest != tcph->dest)
			continue;

		if (iph_check->version == 4) {
			if (iph_check->saddr != iph->saddr ||
			    iph_check->daddr != iph->daddr)
				continue;

			/* seglen = IP payload length minus IP header */
			seglen = iph_totlen(skb, iph_check) -
				 (4 * iph_check->ihl);
		} else if (iph_check->version == 6) {
			ipv6h = (struct ipv6hdr *)iph;
			ipv6h_check = (struct ipv6hdr *)iph_check;

			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
				continue;

			seglen = ntohs(ipv6h_check->payload_len);
		} else {
			WARN_ON(1);  /* shouldn't happen */
			continue;
		}

		/* If the ECE/CWR flags changed from the previous eligible
		 * packet in the same flow, we should no longer be dropping that
		 * previous packet as this would lose information.
		 */
		if (elig_ack && (tcp_flag_word(tcph_check) &
				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
			elig_ack = NULL;
			elig_ack_prev = NULL;
			num_found--;
		}

		/* Check TCP options and flags, don't drop ACKs with segment
		 * data, and don't drop ACKs with a higher cumulative ACK
		 * counter than the triggering packet. Check ACK seqno here to
		 * avoid parsing SACK options of packets we are going to exclude
		 * anyway.
		 */
		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
			continue;

		/* Check SACK options. The triggering packet must SACK more data
		 * than the ACK under consideration, or SACK the same range but
		 * have a larger cumulative ACK counter. The latter is a
		 * pathological case, but is contained in the following check
		 * anyway, just to be safe.
		 */
		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);

		if (sack_comp < 0 ||
		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
		     sack_comp == 0))
			continue;

		/* At this point we have found an eligible pure ACK to drop; if
		 * we are in aggressive mode, we are done. Otherwise, keep
		 * searching unless this is the second eligible ACK we
		 * found.
		 *
		 * Since we want to drop ACK closest to the head of the queue,
		 * save the first eligible ACK we find, even if we need to loop
		 * again.
		 */
		if (!elig_ack) {
			elig_ack = skb_check;
			elig_ack_prev = skb_prev;
			elig_flags = (tcp_flag_word(tcph_check)
				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
		}

		if (num_found++ > 0)
			goto found;
	}

	/* We made it through the queue without finding two eligible ACKs . If
	 * we found a single eligible ACK we can drop it in aggressive mode if
	 * we can guarantee that this does not interfere with ECN flag
	 * information. We ensure this by dropping it only if the enqueued
	 * packet is consecutive with the eligible ACK, and their flags match.
	 */
	if (elig_ack && aggressive && elig_ack->next == skb &&
	    (elig_flags == (tcp_flag_word(tcph) &
			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
		goto found;

	return NULL;

found:
	/* unlink the victim ACK from the flow's singly-linked queue */
	if (elig_ack_prev)
		elig_ack_prev->next = elig_ack->next;
	else
		flow->head = elig_ack->next;

	skb_mark_not_on_list(elig_ack);

	return elig_ack;
}

/* Integer exponentially-weighted moving average with weight 2^-shift
 * (truncating): avg <- avg - avg/2^shift + sample/2^shift.
 */
static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
{
	avg -= avg >> shift;
	avg += sample >> shift;
	return avg;
}

/* Translate a packet's wire length into the length charged to the
 * shaper: optionally strip the mac-layer bytes before the network
 * header, add the configured per-packet overhead, enforce the minimum
 * packet unit, and compensate for ATM/PTM cell framing.  Also tracks
 * min/max observed lengths for statistics.
 */
static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off)
{
	struct cake_sched_config *q = qd->config;

	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
		len -= off;

	if (qd->max_netlen < len)
		qd->max_netlen = len;
	if (qd->min_netlen > len)
		qd->min_netlen = len;

	len += q->rate_overhead;

	if (len < q->rate_mpu)
		len = q->rate_mpu;

	if (q->atm_mode == CAKE_ATM_ATM) {
		/* round up to whole 53-byte ATM cells of 48 payload bytes */
		len += 47;
		len /= 48;
		len *= 53;
	} else if (q->atm_mode == CAKE_ATM_PTM) {
		/* Add one byte per 64 bytes or part thereof.
		 * This is conservative and easier to calculate than the
		 * precise value.
		 */
		len += (len + 63) / 64;
	}

	if (qd->max_adjlen < len)
		qd->max_adjlen = len;
	if (qd->min_adjlen > len)
		qd->min_adjlen = len;

	return len;
}

/* Compute the shaper-charged length of @skb.  GSO super-packets are
 * charged as if each segment (gso_size payload plus the network and
 * transport headers) went through cake_calc_overhead() individually.
 */
static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	unsigned int hdr_len, last_len = 0;
	u32 off = skb_network_offset(skb);
	u16 segs = qdisc_pkt_segs(skb);
	u32 len = qdisc_pkt_len(skb);

	/* running average of the network-header offset, 16.16 fixed point */
	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);

	if (segs == 1)
		return cake_calc_overhead(q, len, off);

	/* borrowed from qdisc_pkt_len_segs_init() */
	if (!skb->encapsulation)
		hdr_len = skb_transport_offset(skb);
	else
		hdr_len = skb_inner_transport_offset(skb);

	/* + transport layer */
	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
				       SKB_GSO_TCPV6))) {
		const struct tcphdr *th;
		struct tcphdr _tcphdr;

		th = skb_header_pointer(skb, hdr_len,
					sizeof(_tcphdr), &_tcphdr);
		if (likely(th))
			hdr_len += __tcp_hdrlen(th);
	} else {
		struct udphdr _udphdr;

		if (skb_header_pointer(skb, hdr_len,
				       sizeof(_udphdr), &_udphdr))
			hdr_len += sizeof(struct udphdr);
	}

	/* all segments except the last carry gso_size payload bytes */
	len = shinfo->gso_size + hdr_len;
	last_len = skb->len - shinfo->gso_size * (segs - 1);

	return (cake_calc_overhead(q, len, off) * (segs - 1) +
		cake_calc_overhead(q, last_len, off));
}

/* Swap two slots of the overflow max-heap and fix up the back-pointers
 * kept in each tin's overflow_idx[] array.
 */
static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
{
	struct cake_heap_entry ii = q->overflow_heap[i];
	struct cake_heap_entry jj = q->overflow_heap[j];

	q->overflow_heap[i] = jj;
	q->overflow_heap[j] = ii;

	q->tins[ii.t].overflow_idx[ii.b] = j;
	q->tins[jj.t].overflow_idx[jj.b] = i;
}

/* Byte backlog of the (tin, flow) queue referenced by heap slot @i */
static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
{
	struct cake_heap_entry ii = q->overflow_heap[i];

	return q->tins[ii.t].backlogs[ii.b];
}

/* Sift the entry at heap position @i downwards until the max-heap
 * property (largest backlog at the root) is restored.
 */
static void cake_heapify(struct cake_sched_data *q, u16 i)
{
	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
	u32 mb = cake_heap_get_backlog(q, i);
	u32 m = i;

	while (m < a) {
		/* children of m in the implicit binary heap */
		u32 l = m + m + 1;
		u32 r = l + 1;

		if (l < a) {
			u32 lb = cake_heap_get_backlog(q, l);

			if (lb > mb) {
				m = l;
				mb = lb;
			}
		}

		if (r < a) {
			u32 rb = cake_heap_get_backlog(q, r);

			if (rb > mb) {
				m = r;
				mb = rb;
			}
		}

		if (m != i) {
			cake_heap_swap(q, i, m);
			i = m;
		} else {
			break;
		}
	}
}

/* Sift the entry at heap position @i up towards the root while its
 * backlog exceeds its parent's.
 */
static void cake_heapify_up(struct cake_sched_data *q, u16 i)
{
	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
		u16 p = (i - 1) >> 1;
		u32 ib = cake_heap_get_backlog(q, i);
		u32 pb = cake_heap_get_backlog(q, p);

		if (ib > pb) {
			cake_heap_swap(q, i, p);
			i = p;
		} else {
			break;
		}
	}
}

/* Advance the tin and global shaper clocks by the serialisation time of
 * @skb (based on its overhead-adjusted length) and return that length.
 * When @drop is true the failsafe schedule is left untouched.
 */
static int cake_advance_shaper(struct cake_sched_data *q,
			       struct cake_tin_data *b,
			       struct sk_buff *skb,
			       ktime_t now, bool drop)
{
	u32 len = get_cobalt_cb(skb)->adjusted_len;

	/* charge packet bandwidth to this tin
	 * and to the global shaper.
	 */
	if (q->rate_ns) {
		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
		/* failsafe runs at 2/3 of the configured rate */
		u64 failsafe_dur = global_dur + (global_dur >> 1);

		if (ktime_before(b->time_next_packet, now))
			b->time_next_packet = ktime_add_ns(b->time_next_packet,
							   tin_dur);

		else if (ktime_before(b->time_next_packet,
				      ktime_add_ns(now, tin_dur)))
			b->time_next_packet = ktime_add_ns(now, tin_dur);

		q->time_next_packet = ktime_add_ns(q->time_next_packet,
						   global_dur);
		if (!drop)
			q->failsafe_next_packet = \
				ktime_add_ns(q->failsafe_next_packet,
					     failsafe_dur);
	}
	return len;
}

/* Drop one packet from the flow with the largest backlog, found via the
 * overflow max-heap, and update all backlog accounting.  Returns the
 * victim queue's identity encoded as (tin << 16) | flow_idx.
 */
static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	ktime_t now = ktime_get();
	u32 idx = 0, tin = 0, len;
	struct cake_heap_entry qq;
	struct cake_tin_data *b;
	struct cake_flow *flow;
	struct sk_buff *skb;

	if (!q->overflow_timeout) {
		int i;
		/* Build fresh max-heap */
		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
			cake_heapify(q, i);
	}
	q->overflow_timeout = 65535;

	/* select longest queue for pruning */
	qq = q->overflow_heap[0];
	tin = qq.t;
	idx = qq.b;

	b = &q->tins[tin];
	flow = &b->flows[idx];
	skb = dequeue_head(flow);
	if (unlikely(!skb)) {
		/* heap has gone wrong, rebuild it next time */
		q->overflow_timeout = 0;
		return idx + (tin << 16);
	}

	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
		b->unresponsive_flow_count++;

	len = qdisc_pkt_len(skb);
	q->buffer_used -= skb->truesize;
	b->backlogs[idx] -= len;
	b->tin_backlog -= len;
	sch->qstats.backlog -= len;

	flow->dropped++;
	b->tin_dropped++;

	/* in ingress mode, dropped packets still consumed link capacity */
	if (q->config->rate_flags & CAKE_FLAG_INGRESS)
		cake_advance_shaper(q, b, skb, now, true);

	qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
	sch->q.qlen--;

	cake_heapify(q, 0);

	return idx + (tin << 16);
}

/* Read @skb's DSCP field (IPv4/IPv6), optionally "washing" it to zero
 * while preserving the ECN bits.  ARP maps to CS7; anything else is
 * treated as best-effort (DSCP 0).
 */
static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
{
	const int offset = skb_network_offset(skb);
	u16 *buf, buf_;
	u8 dscp;

	switch (skb_protocol(skb, true)) {
	case htons(ETH_P_IP):
		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
		if (unlikely(!buf))
			return 0;

		/* ToS is in the second byte of iphdr */
		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;

		if (wash && dscp) {
			const int wlen = offset + sizeof(struct iphdr);

			/* writing requires a private, linear header */
			if (!pskb_may_pull(skb, wlen) ||
			    skb_try_make_writable(skb, wlen))
				return 0;

			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
		}

		return dscp;

	case htons(ETH_P_IPV6):
		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
		if (unlikely(!buf))
			return 0;

		/* Traffic class is in the first and second bytes of ipv6hdr */
		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;

		if (wash && dscp) {
			const int wlen = offset + sizeof(struct ipv6hdr);

			if (!pskb_may_pull(skb, wlen) ||
			    skb_try_make_writable(skb, wlen))
				return 0;

			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
		}

		return dscp;

	case htons(ETH_P_ARP):
		return 0x38; /* CS7 - Net Control */

	default:
		/* If there is no Diffserv field, treat as best-effort */
		return 0;
	}
}

/* Pick the tin (traffic class) for @skb.  Firewall marks and
 * skb->priority take precedence over diffserv; otherwise the DSCP is
 * mapped through the tin_index table.
 */
static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
					     struct sk_buff *skb)
{
	struct cake_sched_data *qd = qdisc_priv(sch);
	struct cake_sched_config *q = qd->config;
	u32 tin, mark;
	bool wash;
	u8 dscp;

	/* Tin selection: Default to diffserv-based selection, allow overriding
	 * using firewall marks or skb->priority. Call DSCP parsing early if
	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
	 */
	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
	if (wash)
		dscp = cake_handle_diffserv(skb, wash);

	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
		tin = 0;

	else if (mark && mark <= qd->tin_cnt)
		tin = qd->tin_order[mark - 1];

	else if (TC_H_MAJ(skb->priority) == sch->handle &&
		 TC_H_MIN(skb->priority) > 0 &&
		 TC_H_MIN(skb->priority) <= qd->tin_cnt)
		tin = qd->tin_order[TC_H_MIN(skb->priority) - 1];

	else {
		if (!wash)
			dscp = cake_handle_diffserv(skb, wash);
		tin = qd->tin_index[dscp];

		if (unlikely(tin >= qd->tin_cnt))
			tin = 0;
	}

	return &qd->tins[tin];
}

/* Classify @skb: consult attached tc filters for (flow, host)
 * overrides, then select the tin and hash into a flow queue.  Returns
 * flow index + 1, or 0 to signal drop/steal (with *qerr set).
 */
static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
			 struct sk_buff *skb, int flow_mode, int *qerr)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct tcf_proto *filter;
	struct tcf_result res;
	u16 flow = 0, host = 0;
	int result;

	filter = rcu_dereference_bh(q->filter_list);
	if (!filter)
		goto hash;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, NULL, filter, &res, false);

	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
			flow = TC_H_MIN(res.classid);
		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
			host = TC_H_MAJ(res.classid) >> 16;
	}
hash:
	*t = cake_select_tin(sch, skb);
	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
}

static void cake_reconfigure(struct Qdisc
*sch);

/* Enqueue entry point: classify, optionally split GSO super-packets,
 * run the ACK filter, update shaper and bandwidth-estimator state, and
 * shed packets from the longest queue when the buffer limit is hit.
 */
static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			struct sk_buff **to_free)
{
	u32 idx, tin, prev_qlen, prev_backlog, drop_id;
	struct cake_sched_data *q = qdisc_priv(sch);
	int len = qdisc_pkt_len(skb), ret;
	struct sk_buff *ack = NULL;
	ktime_t now = ktime_get();
	struct cake_tin_data *b;
	struct cake_flow *flow;
	bool same_flow = false;

	/* choose flow to insert into */
	idx = cake_classify(sch, &b, skb, q->config->flow_mode, &ret);
	if (idx == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	tin = (u32)(b - q->tins);
	idx--;
	flow = &b->flows[idx];

	/* ensure shaper state isn't stale */
	if (!b->tin_backlog) {
		if (ktime_before(b->time_next_packet, now))
			b->time_next_packet = now;

		if (!sch->q.qlen) {
			if (ktime_before(q->time_next_packet, now)) {
				q->failsafe_next_packet = now;
				q->time_next_packet = now;
			} else if (ktime_after(q->time_next_packet, now) &&
				   ktime_after(q->failsafe_next_packet, now)) {
				u64 next = \
					min(ktime_to_ns(q->time_next_packet),
					    ktime_to_ns(
						   q->failsafe_next_packet));
				sch->qstats.overlimits++;
				qdisc_watchdog_schedule_ns(&q->watchdog, next);
			}
		}
	}

	if (unlikely(len > b->max_skblen))
		b->max_skblen = len;

	if (qdisc_pkt_segs(skb) > 1 && q->config->rate_flags & CAKE_FLAG_SPLIT_GSO) {
		struct sk_buff *segs, *nskb;
		netdev_features_t features = netif_skb_features(skb);
		unsigned int slen = 0, numsegs = 0;

		/* segment the super-packet and enqueue the pieces
		 * individually so the AQM sees real packet sizes
		 */
		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
		if (IS_ERR_OR_NULL(segs))
			return qdisc_drop(skb, sch, to_free);

		skb_list_walk_safe(segs, segs, nskb) {
			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			qdisc_skb_cb(segs)->pkt_segs = 1;
			cobalt_set_enqueue_time(segs, now);
			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
									  segs);
			flow_queue_add(flow, segs);

			sch->q.qlen++;
			numsegs++;
			slen += segs->len;
			q->buffer_used += segs->truesize;
			b->packets++;
		}

		/* stats */
		b->bytes += slen;
		b->backlogs[idx] += slen;
		b->tin_backlog += slen;
		sch->qstats.backlog += slen;
		q->avg_window_bytes += slen;

		/* one skb became numsegs skbs of slen total bytes; tell the
		 * parent hierarchy about the difference
		 */
		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
		consume_skb(skb);
	} else {
		/* not splitting */
		int ack_pkt_len = 0;

		cobalt_set_enqueue_time(skb, now);
		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
		flow_queue_add(flow, skb);

		if (q->config->ack_filter)
			ack = cake_ack_filter(q, flow);

		if (ack) {
			/* a redundant ACK was unlinked; the new packet
			 * effectively replaces it, so qlen is unchanged
			 */
			b->ack_drops++;
			sch->qstats.drops++;
			ack_pkt_len = qdisc_pkt_len(ack);
			b->bytes += ack_pkt_len;
			q->buffer_used += skb->truesize - ack->truesize;
			if (q->config->rate_flags & CAKE_FLAG_INGRESS)
				cake_advance_shaper(q, b, ack, now, true);

			qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len);
			consume_skb(ack);
		} else {
			sch->q.qlen++;
			q->buffer_used += skb->truesize;
		}

		/* stats */
		b->packets++;
		b->bytes += len - ack_pkt_len;
		b->backlogs[idx] += len - ack_pkt_len;
		b->tin_backlog += len - ack_pkt_len;
		sch->qstats.backlog += len - ack_pkt_len;
		q->avg_window_bytes += len - ack_pkt_len;
	}

	if (q->overflow_timeout)
		cake_heapify_up(q, b->overflow_idx[idx]);

	/* incoming bandwidth capacity estimate */
	if (q->config->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
		u64 packet_interval = \
			ktime_to_ns(ktime_sub(now, q->last_packet_time));

		if (packet_interval > NSEC_PER_SEC)
			packet_interval = NSEC_PER_SEC;

		/* filter out short-term bursts, eg. wifi aggregation */
		q->avg_packet_interval = \
			cake_ewma(q->avg_packet_interval,
				  packet_interval,
				  (packet_interval > q->avg_packet_interval ?
					  2 : 8));

		q->last_packet_time = now;

		if (packet_interval > q->avg_packet_interval) {
			u64 window_interval = \
				ktime_to_ns(ktime_sub(now,
						      q->avg_window_begin));
			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;

			b = div64_u64(b, window_interval);
			q->avg_peak_bandwidth =
				cake_ewma(q->avg_peak_bandwidth, b,
					  b > q->avg_peak_bandwidth ? 2 : 8);
			q->avg_window_bytes = 0;
			q->avg_window_begin = now;

			/* re-target the shaper to 15/16 of the observed peak,
			 * at most every 250 ms
			 */
			if (ktime_after(now,
					ktime_add_ms(q->last_reconfig_time,
						     250))) {
				q->config->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
				cake_reconfigure(sch);
			}
		}
	} else {
		q->avg_window_bytes = 0;
		q->last_packet_time = now;
	}

	/* flowchain */
	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
		if (!flow->set) {
			list_add_tail(&flow->flowchain, &b->new_flows);
		} else {
			b->decaying_flow_count--;
			list_move_tail(&flow->flowchain, &b->new_flows);
		}
		flow->set = CAKE_SET_SPARSE;
		b->sparse_flow_count++;

		flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode);
	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
		/* this flow was empty, accounted as a sparse flow, but actually
		 * in the bulk rotation.
		 */
		flow->set = CAKE_SET_BULK;
		b->sparse_flow_count--;
		b->bulk_flow_count++;

		cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
		cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
	}

	if (q->buffer_used > q->buffer_max_used)
		q->buffer_max_used = q->buffer_used;

	if (q->buffer_used <= q->buffer_limit)
		return NET_XMIT_SUCCESS;

	/* buffer overflow: drop from the longest queue(s) until under limit */
	prev_qlen = sch->q.qlen;
	prev_backlog = sch->qstats.backlog;

	while (q->buffer_used > q->buffer_limit) {
		drop_id = cake_drop(sch, to_free);
		if ((drop_id >> 16) == tin &&
		    (drop_id & 0xFFFF) == idx)
			same_flow = true;
	}

	prev_qlen -= sch->q.qlen;
	prev_backlog -= sch->qstats.backlog;
	b->drop_overlimit += prev_qlen;

	if (same_flow) {
		/* at least one drop hit the flow we just enqueued to:
		 * report congestion and exclude this packet from the
		 * parent backlog adjustment
		 */
		qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
					  prev_backlog - len);
		return NET_XMIT_CN;
	}
	qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
	return NET_XMIT_SUCCESS;
}

/* Remove and return the head packet of the current (cur_tin, cur_flow)
 * queue, updating backlog accounting; NULL if that queue is empty.
 */
static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct cake_tin_data *b = &q->tins[q->cur_tin];
	struct cake_flow *flow = &b->flows[q->cur_flow];
	struct sk_buff *skb = NULL;
	u32 len;

	if (flow->head) {
		skb = dequeue_head(flow);
		len = qdisc_pkt_len(skb);
		b->backlogs[q->cur_flow] -= len;
		b->tin_backlog -= len;
		sch->qstats.backlog -= len;
		q->buffer_used -= skb->truesize;
		sch->q.qlen--;

		if (q->overflow_timeout)
			cake_heapify(q, b->overflow_idx[q->cur_flow]);
	}
	return skb;
}

/* Discard leftover packets from a tin no longer in use.
 */
static void cake_clear_tin(struct Qdisc *sch, u16 tin)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	q->cur_tin = tin;
	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
		while (!!(skb = cake_dequeue_one(sch)))
			kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
}

/* Dequeue entry point: enforce the global shaper, choose a tin, run the
 * DRR++ flow rotation (new/old/decaying lists), and pass each candidate
 * packet through the COBALT AQM before handing it out.
 */
static struct sk_buff *cake_dequeue(struct Qdisc *sch)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct cake_tin_data *b = &q->tins[q->cur_tin];
	enum skb_drop_reason reason;
	ktime_t now = ktime_get();
	struct cake_flow *flow;
	struct list_head *head;
	bool first_flow = true;
	struct sk_buff *skb;
	u64 delay;
	u32 len;

	/* Shared-shaper mode: periodically count active sibling qdiscs on
	 * this device and take an equal share of the configured rate.
	 */
	if (q->config->is_shared && now - q->last_checked_active >= q->config->sync_time) {
		struct net_device *dev = qdisc_dev(sch);
		struct cake_sched_data *other_priv;
		u64 new_rate = q->config->rate_bps;
		u64 other_qlen, other_last_active;
		struct Qdisc *other_sch;
		u32 num_active_qs = 1;
		unsigned int ntx;

		for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
			other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
			other_priv = qdisc_priv(other_sch);

			if (other_priv == q)
				continue;

			other_qlen = READ_ONCE(other_sch->q.qlen);
			other_last_active = READ_ONCE(other_priv->last_active);

			/* a sibling counts as active if it has backlog or
			 * has dequeued since our last check
			 */
			if (other_qlen || other_last_active > q->last_checked_active)
				num_active_qs++;
		}

		if (num_active_qs > 1)
			new_rate = div64_u64(q->config->rate_bps, num_active_qs);

		/* mtu = 0 is used to only update the rate and not mess with cobalt params */
		cake_set_rate(b, new_rate, 0, 0, 0);
		q->last_checked_active = now;
		q->active_queues = num_active_qs;
		q->rate_ns = b->tin_rate_ns;
		q->rate_shft = b->tin_rate_shft;
	}

begin:
	if (!sch->q.qlen)
		return NULL;

	/* global hard shaper */
	if (ktime_after(q->time_next_packet, now) &&
	    ktime_after(q->failsafe_next_packet, now)) {
		u64 next = min(ktime_to_ns(q->time_next_packet),
			       ktime_to_ns(q->failsafe_next_packet));

		sch->qstats.overlimits++;
		qdisc_watchdog_schedule_ns(&q->watchdog, next);
		return NULL;
	}

	/* Choose a class to work on. */
	if (!q->rate_ns) {
		/* In unlimited mode, can't rely on shaper timings, just balance
		 * with DRR
		 */
		bool wrapped = false, empty = true;

		while (b->tin_deficit < 0 ||
		       !(b->sparse_flow_count + b->bulk_flow_count)) {
			if (b->tin_deficit <= 0)
				b->tin_deficit += b->tin_quantum;
			if (b->sparse_flow_count + b->bulk_flow_count)
				empty = false;

			q->cur_tin++;
			b++;
			if (q->cur_tin >= q->tin_cnt) {
				q->cur_tin = 0;
				b = q->tins;

				if (wrapped) {
					/* It's possible for q->qlen to be
					 * nonzero when we actually have no
					 * packets anywhere.
					 */
					if (empty)
						return NULL;
				} else {
					wrapped = true;
				}
			}
		}
	} else {
		/* In shaped mode, choose:
		 * - Highest-priority tin with queue and meeting schedule, or
		 * - The earliest-scheduled tin with queue.
		 */
		ktime_t best_time = KTIME_MAX;
		int tin, best_tin = 0;

		for (tin = 0; tin < q->tin_cnt; tin++) {
			b = q->tins + tin;
			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
				ktime_t time_to_pkt = \
					ktime_sub(b->time_next_packet, now);

				if (ktime_to_ns(time_to_pkt) <= 0 ||
				    ktime_compare(time_to_pkt,
						  best_time) <= 0) {
					best_time = time_to_pkt;
					best_tin = tin;
				}
			}
		}

		q->cur_tin = best_tin;
		b = q->tins + best_tin;

		/* No point in going further if no packets to deliver. */
		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
			return NULL;
	}

retry:
	/* service this class: decaying flows first (only on the first
	 * pass), then new (sparse), then old (bulk) flows
	 */
	head = &b->decaying_flows;
	if (!first_flow || list_empty(head)) {
		head = &b->new_flows;
		if (list_empty(head)) {
			head = &b->old_flows;
			if (unlikely(list_empty(head))) {
				head = &b->decaying_flows;
				if (unlikely(list_empty(head)))
					goto begin;
			}
		}
	}
	flow = list_first_entry(head, struct cake_flow, flowchain);
	q->cur_flow = flow - b->flows;
	first_flow = false;

	/* flow isolation (DRR++) */
	if (flow->deficit <= 0) {
		/* Keep all flows with deficits out of the sparse and decaying
		 * rotations. No non-empty flow can go into the decaying
		 * rotation, so they can't get deficits
		 */
		if (flow->set == CAKE_SET_SPARSE) {
			if (flow->head) {
				b->sparse_flow_count--;
				b->bulk_flow_count++;

				cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
				cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);

				flow->set = CAKE_SET_BULK;
			} else {
				/* we've moved it to the bulk rotation for
				 * correct deficit accounting but we still want
				 * to count it as a sparse flow, not a bulk one.
				 */
				flow->set = CAKE_SET_SPARSE_WAIT;
			}
		}

		flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode);
		list_move_tail(&flow->flowchain, &b->old_flows);

		goto retry;
	}

	/* Retrieve a packet via the AQM */
	while (1) {
		skb = cake_dequeue_one(sch);
		if (!skb) {
			/* this queue was actually empty */
			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
				b->unresponsive_flow_count--;

			if (flow->cvars.p_drop || flow->cvars.count ||
			    ktime_before(now, flow->cvars.drop_next)) {
				/* keep in the flowchain until the state has
				 * decayed to rest
				 */
				list_move_tail(&flow->flowchain,
					       &b->decaying_flows);
				if (flow->set == CAKE_SET_BULK) {
					b->bulk_flow_count--;

					cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
					cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);

					b->decaying_flow_count++;
				} else if (flow->set == CAKE_SET_SPARSE ||
					   flow->set == CAKE_SET_SPARSE_WAIT) {
					b->sparse_flow_count--;
					b->decaying_flow_count++;
				}
				flow->set = CAKE_SET_DECAYING;
			} else {
				/* remove empty queue from the flowchain */
				list_del_init(&flow->flowchain);
				if (flow->set == CAKE_SET_SPARSE ||
				    flow->set == CAKE_SET_SPARSE_WAIT)
					b->sparse_flow_count--;
				else if (flow->set == CAKE_SET_BULK) {
					b->bulk_flow_count--;

					cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
					cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
				} else
					b->decaying_flow_count--;

				flow->set = CAKE_SET_NONE;
			}
			goto begin;
		}

		reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
					    (b->bulk_flow_count *
					     !!(q->config->rate_flags &
						CAKE_FLAG_INGRESS)));
		/* Last packet in queue may be marked, shouldn't be dropped */
		if (reason == SKB_NOT_DROPPED_YET || !flow->head)
			break;

		/* drop this packet, get another one */
		if (q->config->rate_flags & CAKE_FLAG_INGRESS) {
			len = cake_advance_shaper(q, b, skb,
						  now, true);
			flow->deficit -= len;
			b->tin_deficit -= len;
		}
		flow->dropped++;
		b->tin_dropped++;
		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
		qdisc_qstats_drop(sch);
		qdisc_dequeue_drop(sch, skb, reason);
		if (q->config->rate_flags & CAKE_FLAG_INGRESS)
			goto retry;
	}

	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
	qdisc_bstats_update(sch, skb);
	WRITE_ONCE(q->last_active, now);

	/* collect delay stats */
	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
	b->peak_delay = cake_ewma(b->peak_delay, delay,
				  delay > b->peak_delay ? 2 : 8);
	b->base_delay = cake_ewma(b->base_delay, delay,
				  delay < b->base_delay ? 2 : 8);

	len = cake_advance_shaper(q, b, skb, now, false);
	flow->deficit -= len;
	b->tin_deficit -= len;

	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
		u64 next = min(ktime_to_ns(q->time_next_packet),
			       ktime_to_ns(q->failsafe_next_packet));

		qdisc_watchdog_schedule_ns(&q->watchdog, next);
	} else if (!sch->q.qlen) {
		int i;

		/* queue drained; wake up again to let decaying AQM state
		 * run down to rest
		 */
		for (i = 0; i < q->tin_cnt; i++) {
			if (q->tins[i].decaying_flow_count) {
				ktime_t next = \
					ktime_add_ns(now,
						     q->tins[i].cparams.target);

				qdisc_watchdog_schedule_ns(&q->watchdog,
							   ktime_to_ns(next));
				break;
			}
		}
	}

	if (q->overflow_timeout)
		q->overflow_timeout--;

	return skb;
}

/* Purge all packets from every tin (qdisc reset). */
static void cake_reset(struct Qdisc *sch)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	u32 c;

	if (!q->tins)
		return;

	for (c = 0; c < CAKE_MAX_TINS; c++)
		cake_clear_tin(sch, c);
}

/* Netlink attribute validation policy for CAKE configuration */
static const struct nla_policy
cake_policy[TCA_CAKE_MAX + 1] = { 2302 [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, 2303 [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, 2304 [TCA_CAKE_ATM] = { .type = NLA_U32 }, 2305 [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, 2306 [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, 2307 [TCA_CAKE_RTT] = { .type = NLA_U32 }, 2308 [TCA_CAKE_TARGET] = { .type = NLA_U32 }, 2309 [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, 2310 [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, 2311 [TCA_CAKE_NAT] = { .type = NLA_U32 }, 2312 [TCA_CAKE_RAW] = { .type = NLA_U32 }, 2313 [TCA_CAKE_WASH] = { .type = NLA_U32 }, 2314 [TCA_CAKE_MPU] = { .type = NLA_U32 }, 2315 [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, 2316 [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, 2317 [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, 2318 [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, 2319 }; 2320 2321 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, 2322 u64 target_ns, u64 rtt_est_ns) 2323 { 2324 /* convert byte-rate into time-per-byte 2325 * so it will always unwedge in reasonable time. 2326 */ 2327 static const u64 MIN_RATE = 64; 2328 u32 byte_target = mtu; 2329 u64 byte_target_ns; 2330 u8 rate_shft = 0; 2331 u64 rate_ns = 0; 2332 2333 b->flow_quantum = 1514; 2334 if (rate) { 2335 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); 2336 rate_shft = 34; 2337 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; 2338 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); 2339 while (!!(rate_ns >> 34)) { 2340 rate_ns >>= 1; 2341 rate_shft--; 2342 } 2343 } /* else unlimited, ie. 
zero delay */ 2344 2345 b->tin_rate_bps = rate; 2346 b->tin_rate_ns = rate_ns; 2347 b->tin_rate_shft = rate_shft; 2348 2349 if (mtu == 0) 2350 return; 2351 2352 byte_target_ns = (byte_target * rate_ns) >> rate_shft; 2353 2354 b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); 2355 b->cparams.interval = max(rtt_est_ns + 2356 b->cparams.target - target_ns, 2357 b->cparams.target * 2); 2358 b->cparams.mtu_time = byte_target_ns; 2359 b->cparams.p_inc = 1 << 24; /* 1/256 */ 2360 b->cparams.p_dec = 1 << 20; /* 1/4096 */ 2361 } 2362 2363 static int cake_config_besteffort(struct Qdisc *sch) 2364 { 2365 struct cake_sched_data *q = qdisc_priv(sch); 2366 struct cake_tin_data *b = &q->tins[0]; 2367 u32 mtu = psched_mtu(qdisc_dev(sch)); 2368 u64 rate = q->config->rate_bps; 2369 2370 q->tin_cnt = 1; 2371 2372 q->tin_index = besteffort; 2373 q->tin_order = normal_order; 2374 2375 cake_set_rate(b, rate, mtu, 2376 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2377 b->tin_quantum = 65535; 2378 2379 return 0; 2380 } 2381 2382 static int cake_config_precedence(struct Qdisc *sch) 2383 { 2384 /* convert high-level (user visible) parameters into internal format */ 2385 struct cake_sched_data *q = qdisc_priv(sch); 2386 u32 mtu = psched_mtu(qdisc_dev(sch)); 2387 u64 rate = q->config->rate_bps; 2388 u32 quantum = 256; 2389 u32 i; 2390 2391 q->tin_cnt = 8; 2392 q->tin_index = precedence; 2393 q->tin_order = normal_order; 2394 2395 for (i = 0; i < q->tin_cnt; i++) { 2396 struct cake_tin_data *b = &q->tins[i]; 2397 2398 cake_set_rate(b, rate, mtu, us_to_ns(q->config->target), 2399 us_to_ns(q->config->interval)); 2400 2401 b->tin_quantum = max_t(u16, 1U, quantum); 2402 2403 /* calculate next class's parameters */ 2404 rate *= 7; 2405 rate >>= 3; 2406 2407 quantum *= 7; 2408 quantum >>= 3; 2409 } 2410 2411 return 0; 2412 } 2413 2414 /* List of known Diffserv codepoints: 2415 * 2416 * Default Forwarding (DF/CS0) - Best Effort 2417 * Max Throughput (TOS2) 2418 * Min Delay 
(TOS4) 2419 * LLT "La" (TOS5) 2420 * Assured Forwarding 1 (AF1x) - x3 2421 * Assured Forwarding 2 (AF2x) - x3 2422 * Assured Forwarding 3 (AF3x) - x3 2423 * Assured Forwarding 4 (AF4x) - x3 2424 * Precedence Class 1 (CS1) 2425 * Precedence Class 2 (CS2) 2426 * Precedence Class 3 (CS3) 2427 * Precedence Class 4 (CS4) 2428 * Precedence Class 5 (CS5) 2429 * Precedence Class 6 (CS6) 2430 * Precedence Class 7 (CS7) 2431 * Voice Admit (VA) 2432 * Expedited Forwarding (EF) 2433 * Lower Effort (LE) 2434 * 2435 * Total 26 codepoints. 2436 */ 2437 2438 /* List of traffic classes in RFC 4594, updated by RFC 8622: 2439 * (roughly descending order of contended priority) 2440 * (roughly ascending order of uncontended throughput) 2441 * 2442 * Network Control (CS6,CS7) - routing traffic 2443 * Telephony (EF,VA) - aka. VoIP streams 2444 * Signalling (CS5) - VoIP setup 2445 * Multimedia Conferencing (AF4x) - aka. video calls 2446 * Realtime Interactive (CS4) - eg. games 2447 * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch 2448 * Broadcast Video (CS3) 2449 * Low-Latency Data (AF2x,TOS4) - eg. database 2450 * Ops, Admin, Management (CS2) - eg. ssh 2451 * Standard Service (DF & unrecognised codepoints) 2452 * High-Throughput Data (AF1x,TOS2) - eg. web traffic 2453 * Low-Priority Data (LE,CS1) - eg. BitTorrent 2454 * 2455 * Total 12 traffic classes. 2456 */ 2457 2458 static int cake_config_diffserv8(struct Qdisc *sch) 2459 { 2460 /* Pruned list of traffic classes for typical applications: 2461 * 2462 * Network Control (CS6, CS7) 2463 * Minimum Latency (EF, VA, CS5, CS4) 2464 * Interactive Shell (CS2) 2465 * Low Latency Transactions (AF2x, TOS4) 2466 * Video Streaming (AF4x, AF3x, CS3) 2467 * Bog Standard (DF etc.) 2468 * High Throughput (AF1x, TOS2, CS1) 2469 * Background Traffic (LE) 2470 * 2471 * Total 8 traffic classes. 
2472 */ 2473 2474 struct cake_sched_data *q = qdisc_priv(sch); 2475 u32 mtu = psched_mtu(qdisc_dev(sch)); 2476 u64 rate = q->config->rate_bps; 2477 u32 quantum = 256; 2478 u32 i; 2479 2480 q->tin_cnt = 8; 2481 2482 /* codepoint to class mapping */ 2483 q->tin_index = diffserv8; 2484 q->tin_order = normal_order; 2485 2486 /* class characteristics */ 2487 for (i = 0; i < q->tin_cnt; i++) { 2488 struct cake_tin_data *b = &q->tins[i]; 2489 2490 cake_set_rate(b, rate, mtu, us_to_ns(q->config->target), 2491 us_to_ns(q->config->interval)); 2492 2493 b->tin_quantum = max_t(u16, 1U, quantum); 2494 2495 /* calculate next class's parameters */ 2496 rate *= 7; 2497 rate >>= 3; 2498 2499 quantum *= 7; 2500 quantum >>= 3; 2501 } 2502 2503 return 0; 2504 } 2505 2506 static int cake_config_diffserv4(struct Qdisc *sch) 2507 { 2508 /* Further pruned list of traffic classes for four-class system: 2509 * 2510 * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) 2511 * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2) 2512 * Best Effort (DF, AF1x, TOS2, and those not specified) 2513 * Background Traffic (LE, CS1) 2514 * 2515 * Total 4 traffic classes. 
2516 */ 2517 2518 struct cake_sched_data *q = qdisc_priv(sch); 2519 u32 mtu = psched_mtu(qdisc_dev(sch)); 2520 u64 rate = q->config->rate_bps; 2521 u32 quantum = 1024; 2522 2523 q->tin_cnt = 4; 2524 2525 /* codepoint to class mapping */ 2526 q->tin_index = diffserv4; 2527 q->tin_order = bulk_order; 2528 2529 /* class characteristics */ 2530 cake_set_rate(&q->tins[0], rate, mtu, 2531 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2532 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2533 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2534 cake_set_rate(&q->tins[2], rate >> 1, mtu, 2535 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2536 cake_set_rate(&q->tins[3], rate >> 2, mtu, 2537 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2538 2539 /* bandwidth-sharing weights */ 2540 q->tins[0].tin_quantum = quantum; 2541 q->tins[1].tin_quantum = quantum >> 4; 2542 q->tins[2].tin_quantum = quantum >> 1; 2543 q->tins[3].tin_quantum = quantum >> 2; 2544 2545 return 0; 2546 } 2547 2548 static int cake_config_diffserv3(struct Qdisc *sch) 2549 { 2550 /* Simplified Diffserv structure with 3 tins. 
2551 * Latency Sensitive (CS7, CS6, EF, VA, TOS4) 2552 * Best Effort 2553 * Low Priority (LE, CS1) 2554 */ 2555 struct cake_sched_data *q = qdisc_priv(sch); 2556 u32 mtu = psched_mtu(qdisc_dev(sch)); 2557 u64 rate = q->config->rate_bps; 2558 u32 quantum = 1024; 2559 2560 q->tin_cnt = 3; 2561 2562 /* codepoint to class mapping */ 2563 q->tin_index = diffserv3; 2564 q->tin_order = bulk_order; 2565 2566 /* class characteristics */ 2567 cake_set_rate(&q->tins[0], rate, mtu, 2568 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2569 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2570 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2571 cake_set_rate(&q->tins[2], rate >> 2, mtu, 2572 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2573 2574 /* bandwidth-sharing weights */ 2575 q->tins[0].tin_quantum = quantum; 2576 q->tins[1].tin_quantum = quantum >> 4; 2577 q->tins[2].tin_quantum = quantum >> 2; 2578 2579 return 0; 2580 } 2581 2582 static void cake_reconfigure(struct Qdisc *sch) 2583 { 2584 struct cake_sched_data *qd = qdisc_priv(sch); 2585 struct cake_sched_config *q = qd->config; 2586 int c, ft; 2587 2588 switch (q->tin_mode) { 2589 case CAKE_DIFFSERV_BESTEFFORT: 2590 ft = cake_config_besteffort(sch); 2591 break; 2592 2593 case CAKE_DIFFSERV_PRECEDENCE: 2594 ft = cake_config_precedence(sch); 2595 break; 2596 2597 case CAKE_DIFFSERV_DIFFSERV8: 2598 ft = cake_config_diffserv8(sch); 2599 break; 2600 2601 case CAKE_DIFFSERV_DIFFSERV4: 2602 ft = cake_config_diffserv4(sch); 2603 break; 2604 2605 case CAKE_DIFFSERV_DIFFSERV3: 2606 default: 2607 ft = cake_config_diffserv3(sch); 2608 break; 2609 } 2610 2611 for (c = qd->tin_cnt; c < CAKE_MAX_TINS; c++) { 2612 cake_clear_tin(sch, c); 2613 qd->tins[c].cparams.mtu_time = qd->tins[ft].cparams.mtu_time; 2614 } 2615 2616 qd->rate_ns = qd->tins[ft].tin_rate_ns; 2617 qd->rate_shft = qd->tins[ft].tin_rate_shft; 2618 2619 if (q->buffer_config_limit) { 2620 qd->buffer_limit = q->buffer_config_limit; 2621 
} else if (q->rate_bps) { 2622 u64 t = q->rate_bps * q->interval; 2623 2624 do_div(t, USEC_PER_SEC / 4); 2625 qd->buffer_limit = max_t(u32, t, 4U << 20); 2626 } else { 2627 qd->buffer_limit = ~0; 2628 } 2629 2630 sch->flags &= ~TCQ_F_CAN_BYPASS; 2631 2632 qd->buffer_limit = min(qd->buffer_limit, 2633 max(sch->limit * psched_mtu(qdisc_dev(sch)), 2634 q->buffer_config_limit)); 2635 } 2636 2637 static int cake_config_change(struct cake_sched_config *q, struct nlattr *opt, 2638 struct netlink_ext_ack *extack, bool *overhead_changed) 2639 { 2640 struct nlattr *tb[TCA_CAKE_MAX + 1]; 2641 u16 rate_flags = q->rate_flags; 2642 u8 flow_mode = q->flow_mode; 2643 int err; 2644 2645 err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, 2646 extack); 2647 if (err < 0) 2648 return err; 2649 2650 if (tb[TCA_CAKE_NAT]) { 2651 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 2652 flow_mode &= ~CAKE_FLOW_NAT_FLAG; 2653 flow_mode |= CAKE_FLOW_NAT_FLAG * 2654 !!nla_get_u32(tb[TCA_CAKE_NAT]); 2655 #else 2656 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], 2657 "No conntrack support in kernel"); 2658 return -EOPNOTSUPP; 2659 #endif 2660 } 2661 2662 if (tb[TCA_CAKE_AUTORATE]) { 2663 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) { 2664 if (q->is_shared) { 2665 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_AUTORATE], 2666 "Can't use autorate-ingress with cake_mq"); 2667 return -EOPNOTSUPP; 2668 } 2669 rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; 2670 } else { 2671 rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; 2672 } 2673 } 2674 2675 if (tb[TCA_CAKE_BASE_RATE64]) 2676 WRITE_ONCE(q->rate_bps, 2677 nla_get_u64(tb[TCA_CAKE_BASE_RATE64])); 2678 2679 if (tb[TCA_CAKE_DIFFSERV_MODE]) 2680 WRITE_ONCE(q->tin_mode, 2681 nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE])); 2682 2683 if (tb[TCA_CAKE_WASH]) { 2684 if (!!nla_get_u32(tb[TCA_CAKE_WASH])) 2685 rate_flags |= CAKE_FLAG_WASH; 2686 else 2687 rate_flags &= ~CAKE_FLAG_WASH; 2688 } 2689 2690 if (tb[TCA_CAKE_FLOW_MODE]) 2691 flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) | 
2692 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & 2693 CAKE_FLOW_MASK)); 2694 2695 if (tb[TCA_CAKE_ATM]) 2696 WRITE_ONCE(q->atm_mode, 2697 nla_get_u32(tb[TCA_CAKE_ATM])); 2698 2699 if (tb[TCA_CAKE_OVERHEAD]) { 2700 WRITE_ONCE(q->rate_overhead, 2701 nla_get_s32(tb[TCA_CAKE_OVERHEAD])); 2702 rate_flags |= CAKE_FLAG_OVERHEAD; 2703 *overhead_changed = true; 2704 } 2705 2706 if (tb[TCA_CAKE_RAW]) { 2707 rate_flags &= ~CAKE_FLAG_OVERHEAD; 2708 *overhead_changed = true; 2709 } 2710 2711 if (tb[TCA_CAKE_MPU]) 2712 WRITE_ONCE(q->rate_mpu, 2713 nla_get_u32(tb[TCA_CAKE_MPU])); 2714 2715 if (tb[TCA_CAKE_RTT]) { 2716 u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]); 2717 2718 WRITE_ONCE(q->interval, max(interval, 1U)); 2719 } 2720 2721 if (tb[TCA_CAKE_TARGET]) { 2722 u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]); 2723 2724 WRITE_ONCE(q->target, max(target, 1U)); 2725 } 2726 2727 if (tb[TCA_CAKE_INGRESS]) { 2728 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) 2729 rate_flags |= CAKE_FLAG_INGRESS; 2730 else 2731 rate_flags &= ~CAKE_FLAG_INGRESS; 2732 } 2733 2734 if (tb[TCA_CAKE_ACK_FILTER]) 2735 WRITE_ONCE(q->ack_filter, 2736 nla_get_u32(tb[TCA_CAKE_ACK_FILTER])); 2737 2738 if (tb[TCA_CAKE_MEMORY]) 2739 WRITE_ONCE(q->buffer_config_limit, 2740 nla_get_u32(tb[TCA_CAKE_MEMORY])); 2741 2742 if (tb[TCA_CAKE_SPLIT_GSO]) { 2743 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) 2744 rate_flags |= CAKE_FLAG_SPLIT_GSO; 2745 else 2746 rate_flags &= ~CAKE_FLAG_SPLIT_GSO; 2747 } 2748 2749 if (tb[TCA_CAKE_FWMARK]) { 2750 WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK])); 2751 WRITE_ONCE(q->fwmark_shft, 2752 q->fwmark_mask ? 
__ffs(q->fwmark_mask) : 0); 2753 } 2754 2755 WRITE_ONCE(q->rate_flags, rate_flags); 2756 WRITE_ONCE(q->flow_mode, flow_mode); 2757 2758 return 0; 2759 } 2760 2761 static int cake_change(struct Qdisc *sch, struct nlattr *opt, 2762 struct netlink_ext_ack *extack) 2763 { 2764 struct cake_sched_data *qd = qdisc_priv(sch); 2765 struct cake_sched_config *q = qd->config; 2766 bool overhead_changed = false; 2767 int ret; 2768 2769 if (q->is_shared) { 2770 NL_SET_ERR_MSG(extack, "can't reconfigure cake_mq sub-qdiscs"); 2771 return -EOPNOTSUPP; 2772 } 2773 2774 ret = cake_config_change(q, opt, extack, &overhead_changed); 2775 if (ret) 2776 return ret; 2777 2778 if (overhead_changed) { 2779 qd->max_netlen = 0; 2780 qd->max_adjlen = 0; 2781 qd->min_netlen = ~0; 2782 qd->min_adjlen = ~0; 2783 } 2784 2785 if (qd->tins) { 2786 sch_tree_lock(sch); 2787 cake_reconfigure(sch); 2788 sch_tree_unlock(sch); 2789 } 2790 2791 return 0; 2792 } 2793 2794 static void cake_destroy(struct Qdisc *sch) 2795 { 2796 struct cake_sched_data *q = qdisc_priv(sch); 2797 2798 qdisc_watchdog_cancel(&q->watchdog); 2799 tcf_block_put(q->block); 2800 kvfree(q->tins); 2801 if (q->config && !q->config->is_shared) 2802 kvfree(q->config); 2803 } 2804 2805 static void cake_config_init(struct cake_sched_config *q, bool is_shared) 2806 { 2807 q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; 2808 q->flow_mode = CAKE_FLOW_TRIPLE; 2809 2810 q->rate_bps = 0; /* unlimited by default */ 2811 2812 q->interval = 100000; /* 100ms default */ 2813 q->target = 5000; /* 5ms: codel RFC argues 2814 * for 5 to 10% of interval 2815 */ 2816 q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2817 q->is_shared = is_shared; 2818 q->sync_time = 200 * NSEC_PER_USEC; 2819 } 2820 2821 static int cake_init(struct Qdisc *sch, struct nlattr *opt, 2822 struct netlink_ext_ack *extack) 2823 { 2824 struct cake_sched_data *qd = qdisc_priv(sch); 2825 struct cake_sched_config *q; 2826 int i, j, err; 2827 2828 q = kzalloc(sizeof(*q), GFP_KERNEL); 2829 if (!q) 2830 return 
-ENOMEM; 2831 2832 cake_config_init(q, false); 2833 2834 sch->limit = 10240; 2835 sch->flags |= TCQ_F_DEQUEUE_DROPS; 2836 2837 qd->cur_tin = 0; 2838 qd->cur_flow = 0; 2839 qd->config = q; 2840 2841 qdisc_watchdog_init(&qd->watchdog, sch); 2842 2843 if (opt) { 2844 err = cake_change(sch, opt, extack); 2845 2846 if (err) 2847 goto err; 2848 } 2849 2850 err = tcf_block_get(&qd->block, &qd->filter_list, sch, extack); 2851 if (err) 2852 goto err; 2853 2854 quantum_div[0] = ~0; 2855 for (i = 1; i <= CAKE_QUEUES; i++) 2856 quantum_div[i] = 65535 / i; 2857 2858 qd->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), 2859 GFP_KERNEL); 2860 if (!qd->tins) { 2861 err = -ENOMEM; 2862 goto err; 2863 } 2864 2865 for (i = 0; i < CAKE_MAX_TINS; i++) { 2866 struct cake_tin_data *b = qd->tins + i; 2867 2868 INIT_LIST_HEAD(&b->new_flows); 2869 INIT_LIST_HEAD(&b->old_flows); 2870 INIT_LIST_HEAD(&b->decaying_flows); 2871 b->sparse_flow_count = 0; 2872 b->bulk_flow_count = 0; 2873 b->decaying_flow_count = 0; 2874 2875 for (j = 0; j < CAKE_QUEUES; j++) { 2876 struct cake_flow *flow = b->flows + j; 2877 u32 k = j * CAKE_MAX_TINS + i; 2878 2879 INIT_LIST_HEAD(&flow->flowchain); 2880 cobalt_vars_init(&flow->cvars); 2881 2882 qd->overflow_heap[k].t = i; 2883 qd->overflow_heap[k].b = j; 2884 b->overflow_idx[j] = k; 2885 } 2886 } 2887 2888 cake_reconfigure(sch); 2889 qd->avg_peak_bandwidth = q->rate_bps; 2890 qd->min_netlen = ~0; 2891 qd->min_adjlen = ~0; 2892 qd->active_queues = 0; 2893 qd->last_checked_active = 0; 2894 2895 return 0; 2896 err: 2897 kvfree(qd->config); 2898 qd->config = NULL; 2899 return err; 2900 } 2901 2902 static void cake_config_replace(struct Qdisc *sch, struct cake_sched_config *cfg) 2903 { 2904 struct cake_sched_data *qd = qdisc_priv(sch); 2905 struct cake_sched_config *q = qd->config; 2906 2907 qd->config = cfg; 2908 2909 if (!q->is_shared) 2910 kvfree(q); 2911 2912 cake_reconfigure(sch); 2913 } 2914 2915 static int cake_config_dump(struct cake_sched_config 
*q, struct sk_buff *skb) 2916 { 2917 struct nlattr *opts; 2918 u16 rate_flags; 2919 u8 flow_mode; 2920 2921 opts = nla_nest_start_noflag(skb, TCA_OPTIONS); 2922 if (!opts) 2923 goto nla_put_failure; 2924 2925 if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, 2926 READ_ONCE(q->rate_bps), TCA_CAKE_PAD)) 2927 goto nla_put_failure; 2928 2929 flow_mode = READ_ONCE(q->flow_mode); 2930 if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK)) 2931 goto nla_put_failure; 2932 2933 if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval))) 2934 goto nla_put_failure; 2935 2936 if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target))) 2937 goto nla_put_failure; 2938 2939 if (nla_put_u32(skb, TCA_CAKE_MEMORY, 2940 READ_ONCE(q->buffer_config_limit))) 2941 goto nla_put_failure; 2942 2943 rate_flags = READ_ONCE(q->rate_flags); 2944 if (nla_put_u32(skb, TCA_CAKE_AUTORATE, 2945 !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) 2946 goto nla_put_failure; 2947 2948 if (nla_put_u32(skb, TCA_CAKE_INGRESS, 2949 !!(rate_flags & CAKE_FLAG_INGRESS))) 2950 goto nla_put_failure; 2951 2952 if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter))) 2953 goto nla_put_failure; 2954 2955 if (nla_put_u32(skb, TCA_CAKE_NAT, 2956 !!(flow_mode & CAKE_FLOW_NAT_FLAG))) 2957 goto nla_put_failure; 2958 2959 if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode))) 2960 goto nla_put_failure; 2961 2962 if (nla_put_u32(skb, TCA_CAKE_WASH, 2963 !!(rate_flags & CAKE_FLAG_WASH))) 2964 goto nla_put_failure; 2965 2966 if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead))) 2967 goto nla_put_failure; 2968 2969 if (!(rate_flags & CAKE_FLAG_OVERHEAD)) 2970 if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) 2971 goto nla_put_failure; 2972 2973 if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode))) 2974 goto nla_put_failure; 2975 2976 if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu))) 2977 goto nla_put_failure; 2978 2979 if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, 2980 
!!(rate_flags & CAKE_FLAG_SPLIT_GSO))) 2981 goto nla_put_failure; 2982 2983 if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask))) 2984 goto nla_put_failure; 2985 2986 return nla_nest_end(skb, opts); 2987 2988 nla_put_failure: 2989 return -1; 2990 } 2991 2992 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) 2993 { 2994 struct cake_sched_data *qd = qdisc_priv(sch); 2995 2996 return cake_config_dump(qd->config, skb); 2997 } 2998 2999 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 3000 { 3001 struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 3002 struct cake_sched_data *q = qdisc_priv(sch); 3003 struct nlattr *tstats, *ts; 3004 int i; 3005 3006 if (!stats) 3007 return -1; 3008 3009 #define PUT_STAT_U32(attr, data) do { \ 3010 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3011 goto nla_put_failure; \ 3012 } while (0) 3013 #define PUT_STAT_U64(attr, data) do { \ 3014 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ 3015 data, TCA_CAKE_STATS_PAD)) \ 3016 goto nla_put_failure; \ 3017 } while (0) 3018 3019 PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); 3020 PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); 3021 PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); 3022 PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); 3023 PUT_STAT_U32(MAX_NETLEN, q->max_netlen); 3024 PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); 3025 PUT_STAT_U32(MIN_NETLEN, q->min_netlen); 3026 PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); 3027 PUT_STAT_U32(ACTIVE_QUEUES, q->active_queues); 3028 3029 #undef PUT_STAT_U32 3030 #undef PUT_STAT_U64 3031 3032 tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS); 3033 if (!tstats) 3034 goto nla_put_failure; 3035 3036 #define PUT_TSTAT_U32(attr, data) do { \ 3037 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ 3038 goto nla_put_failure; \ 3039 } while (0) 3040 #define PUT_TSTAT_U64(attr, data) do { \ 3041 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ 
## attr, \ 3042 data, TCA_CAKE_TIN_STATS_PAD)) \ 3043 goto nla_put_failure; \ 3044 } while (0) 3045 3046 for (i = 0; i < q->tin_cnt; i++) { 3047 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3048 3049 ts = nla_nest_start_noflag(d->skb, i + 1); 3050 if (!ts) 3051 goto nla_put_failure; 3052 3053 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); 3054 PUT_TSTAT_U64(SENT_BYTES64, b->bytes); 3055 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); 3056 3057 PUT_TSTAT_U32(TARGET_US, 3058 ktime_to_us(ns_to_ktime(b->cparams.target))); 3059 PUT_TSTAT_U32(INTERVAL_US, 3060 ktime_to_us(ns_to_ktime(b->cparams.interval))); 3061 3062 PUT_TSTAT_U32(SENT_PACKETS, b->packets); 3063 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); 3064 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); 3065 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); 3066 3067 PUT_TSTAT_U32(PEAK_DELAY_US, 3068 ktime_to_us(ns_to_ktime(b->peak_delay))); 3069 PUT_TSTAT_U32(AVG_DELAY_US, 3070 ktime_to_us(ns_to_ktime(b->avge_delay))); 3071 PUT_TSTAT_U32(BASE_DELAY_US, 3072 ktime_to_us(ns_to_ktime(b->base_delay))); 3073 3074 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); 3075 PUT_TSTAT_U32(WAY_MISSES, b->way_misses); 3076 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); 3077 3078 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + 3079 b->decaying_flow_count); 3080 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); 3081 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); 3082 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); 3083 3084 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); 3085 nla_nest_end(d->skb, ts); 3086 } 3087 3088 #undef PUT_TSTAT_U32 3089 #undef PUT_TSTAT_U64 3090 3091 nla_nest_end(d->skb, tstats); 3092 return nla_nest_end(d->skb, stats); 3093 3094 nla_put_failure: 3095 nla_nest_cancel(d->skb, stats); 3096 return -1; 3097 } 3098 3099 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) 3100 { 3101 return NULL; 3102 } 3103 3104 static unsigned long cake_find(struct Qdisc *sch, u32 
classid) 3105 { 3106 return 0; 3107 } 3108 3109 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, 3110 u32 classid) 3111 { 3112 return 0; 3113 } 3114 3115 static void cake_unbind(struct Qdisc *q, unsigned long cl) 3116 { 3117 } 3118 3119 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, 3120 struct netlink_ext_ack *extack) 3121 { 3122 struct cake_sched_data *q = qdisc_priv(sch); 3123 3124 if (cl) 3125 return NULL; 3126 return q->block; 3127 } 3128 3129 static int cake_dump_class(struct Qdisc *sch, unsigned long cl, 3130 struct sk_buff *skb, struct tcmsg *tcm) 3131 { 3132 tcm->tcm_handle |= TC_H_MIN(cl); 3133 return 0; 3134 } 3135 3136 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, 3137 struct gnet_dump *d) 3138 { 3139 struct cake_sched_data *q = qdisc_priv(sch); 3140 const struct cake_flow *flow = NULL; 3141 struct gnet_stats_queue qs = { 0 }; 3142 struct nlattr *stats; 3143 u32 idx = cl - 1; 3144 3145 if (idx < CAKE_QUEUES * q->tin_cnt) { 3146 const struct cake_tin_data *b = \ 3147 &q->tins[q->tin_order[idx / CAKE_QUEUES]]; 3148 const struct sk_buff *skb; 3149 3150 flow = &b->flows[idx % CAKE_QUEUES]; 3151 3152 if (flow->head) { 3153 sch_tree_lock(sch); 3154 skb = flow->head; 3155 while (skb) { 3156 qs.qlen++; 3157 skb = skb->next; 3158 } 3159 sch_tree_unlock(sch); 3160 } 3161 qs.backlog = b->backlogs[idx % CAKE_QUEUES]; 3162 qs.drops = flow->dropped; 3163 } 3164 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) 3165 return -1; 3166 if (flow) { 3167 ktime_t now = ktime_get(); 3168 3169 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 3170 if (!stats) 3171 return -1; 3172 3173 #define PUT_STAT_U32(attr, data) do { \ 3174 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3175 goto nla_put_failure; \ 3176 } while (0) 3177 #define PUT_STAT_S32(attr, data) do { \ 3178 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3179 goto nla_put_failure; \ 3180 } while (0) 3181 3182 
PUT_STAT_S32(DEFICIT, flow->deficit); 3183 PUT_STAT_U32(DROPPING, flow->cvars.dropping); 3184 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); 3185 PUT_STAT_U32(P_DROP, flow->cvars.p_drop); 3186 if (flow->cvars.p_drop) { 3187 PUT_STAT_S32(BLUE_TIMER_US, 3188 ktime_to_us( 3189 ktime_sub(now, 3190 flow->cvars.blue_timer))); 3191 } 3192 if (flow->cvars.dropping) { 3193 PUT_STAT_S32(DROP_NEXT_US, 3194 ktime_to_us( 3195 ktime_sub(now, 3196 flow->cvars.drop_next))); 3197 } 3198 3199 if (nla_nest_end(d->skb, stats) < 0) 3200 return -1; 3201 } 3202 3203 return 0; 3204 3205 nla_put_failure: 3206 nla_nest_cancel(d->skb, stats); 3207 return -1; 3208 } 3209 3210 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) 3211 { 3212 struct cake_sched_data *q = qdisc_priv(sch); 3213 unsigned int i, j; 3214 3215 if (arg->stop) 3216 return; 3217 3218 for (i = 0; i < q->tin_cnt; i++) { 3219 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3220 3221 for (j = 0; j < CAKE_QUEUES; j++) { 3222 if (list_empty(&b->flows[j].flowchain)) { 3223 arg->count++; 3224 continue; 3225 } 3226 if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1, 3227 arg)) 3228 break; 3229 } 3230 } 3231 } 3232 3233 static const struct Qdisc_class_ops cake_class_ops = { 3234 .leaf = cake_leaf, 3235 .find = cake_find, 3236 .tcf_block = cake_tcf_block, 3237 .bind_tcf = cake_bind, 3238 .unbind_tcf = cake_unbind, 3239 .dump = cake_dump_class, 3240 .dump_stats = cake_dump_class_stats, 3241 .walk = cake_walk, 3242 }; 3243 3244 static struct Qdisc_ops cake_qdisc_ops __read_mostly = { 3245 .cl_ops = &cake_class_ops, 3246 .id = "cake", 3247 .priv_size = sizeof(struct cake_sched_data), 3248 .enqueue = cake_enqueue, 3249 .dequeue = cake_dequeue, 3250 .peek = qdisc_peek_dequeued, 3251 .init = cake_init, 3252 .reset = cake_reset, 3253 .destroy = cake_destroy, 3254 .change = cake_change, 3255 .dump = cake_dump, 3256 .dump_stats = cake_dump_stats, 3257 .owner = THIS_MODULE, 3258 }; 3259 MODULE_ALIAS_NET_SCH("cake"); 3260 
3261 struct cake_mq_sched { 3262 struct mq_sched mq_priv; /* must be first */ 3263 struct cake_sched_config cake_config; 3264 }; 3265 3266 static void cake_mq_destroy(struct Qdisc *sch) 3267 { 3268 mq_destroy_common(sch); 3269 } 3270 3271 static int cake_mq_init(struct Qdisc *sch, struct nlattr *opt, 3272 struct netlink_ext_ack *extack) 3273 { 3274 struct cake_mq_sched *priv = qdisc_priv(sch); 3275 struct net_device *dev = qdisc_dev(sch); 3276 int ret, ntx; 3277 bool _unused; 3278 3279 cake_config_init(&priv->cake_config, true); 3280 if (opt) { 3281 ret = cake_config_change(&priv->cake_config, opt, extack, &_unused); 3282 if (ret) 3283 return ret; 3284 } 3285 3286 ret = mq_init_common(sch, opt, extack, &cake_qdisc_ops); 3287 if (ret) 3288 return ret; 3289 3290 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) 3291 cake_config_replace(priv->mq_priv.qdiscs[ntx], &priv->cake_config); 3292 3293 return 0; 3294 } 3295 3296 static int cake_mq_dump(struct Qdisc *sch, struct sk_buff *skb) 3297 { 3298 struct cake_mq_sched *priv = qdisc_priv(sch); 3299 3300 mq_dump_common(sch, skb); 3301 return cake_config_dump(&priv->cake_config, skb); 3302 } 3303 3304 static int cake_mq_change(struct Qdisc *sch, struct nlattr *opt, 3305 struct netlink_ext_ack *extack) 3306 { 3307 struct cake_mq_sched *priv = qdisc_priv(sch); 3308 struct net_device *dev = qdisc_dev(sch); 3309 bool overhead_changed = false; 3310 unsigned int ntx; 3311 int ret; 3312 3313 ret = cake_config_change(&priv->cake_config, opt, extack, &overhead_changed); 3314 if (ret) 3315 return ret; 3316 3317 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { 3318 struct Qdisc *chld = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); 3319 struct cake_sched_data *qd = qdisc_priv(chld); 3320 3321 if (overhead_changed) { 3322 qd->max_netlen = 0; 3323 qd->max_adjlen = 0; 3324 qd->min_netlen = ~0; 3325 qd->min_adjlen = ~0; 3326 } 3327 3328 if (qd->tins) { 3329 sch_tree_lock(chld); 3330 cake_reconfigure(chld); 3331 
sch_tree_unlock(chld); 3332 } 3333 } 3334 3335 return 0; 3336 } 3337 3338 static int cake_mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, 3339 struct Qdisc **old, struct netlink_ext_ack *extack) 3340 { 3341 NL_SET_ERR_MSG(extack, "can't replace cake_mq sub-qdiscs"); 3342 return -EOPNOTSUPP; 3343 } 3344 3345 static const struct Qdisc_class_ops cake_mq_class_ops = { 3346 .select_queue = mq_select_queue, 3347 .graft = cake_mq_graft, 3348 .leaf = mq_leaf, 3349 .find = mq_find, 3350 .walk = mq_walk, 3351 .dump = mq_dump_class, 3352 .dump_stats = mq_dump_class_stats, 3353 }; 3354 3355 static struct Qdisc_ops cake_mq_qdisc_ops __read_mostly = { 3356 .cl_ops = &cake_mq_class_ops, 3357 .id = "cake_mq", 3358 .priv_size = sizeof(struct cake_mq_sched), 3359 .init = cake_mq_init, 3360 .destroy = cake_mq_destroy, 3361 .attach = mq_attach, 3362 .change = cake_mq_change, 3363 .change_real_num_tx = mq_change_real_num_tx, 3364 .dump = cake_mq_dump, 3365 .owner = THIS_MODULE, 3366 }; 3367 MODULE_ALIAS_NET_SCH("cake_mq"); 3368 3369 static int __init cake_module_init(void) 3370 { 3371 int ret; 3372 3373 ret = register_qdisc(&cake_qdisc_ops); 3374 if (ret) 3375 return ret; 3376 3377 ret = register_qdisc(&cake_mq_qdisc_ops); 3378 if (ret) 3379 unregister_qdisc(&cake_qdisc_ops); 3380 3381 return ret; 3382 } 3383 3384 static void __exit cake_module_exit(void) 3385 { 3386 unregister_qdisc(&cake_qdisc_ops); 3387 unregister_qdisc(&cake_mq_qdisc_ops); 3388 } 3389 3390 module_init(cake_module_init) 3391 module_exit(cake_module_exit) 3392 MODULE_AUTHOR("Jonathan Morton"); 3393 MODULE_LICENSE("Dual BSD/GPL"); 3394 MODULE_DESCRIPTION("The CAKE shaper."); 3395 MODULE_IMPORT_NS("NET_SCHED_INTERNAL"); 3396